KVM: Disable irq while unregistering user notifier
[pandora-kernel.git] / arch / x86 / kvm / x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Amit Shah    <amit.shah@qumranet.com>
15  *   Ben-Ami Yassour <benami@il.ibm.com>
16  *
17  * This work is licensed under the terms of the GNU GPL, version 2.  See
18  * the COPYING file in the top-level directory.
19  *
20  */
21
22 #include <linux/kvm_host.h>
23 #include "irq.h"
24 #include "mmu.h"
25 #include "i8254.h"
26 #include "tss.h"
27 #include "kvm_cache_regs.h"
28 #include "x86.h"
29
30 #include <linux/clocksource.h>
31 #include <linux/interrupt.h>
32 #include <linux/kvm.h>
33 #include <linux/fs.h>
34 #include <linux/vmalloc.h>
35 #include <linux/module.h>
36 #include <linux/mman.h>
37 #include <linux/highmem.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/cpufreq.h>
41 #include <linux/user-return-notifier.h>
42 #include <linux/srcu.h>
43 #include <linux/slab.h>
44 #include <linux/perf_event.h>
45 #include <linux/uaccess.h>
46 #include <linux/hash.h>
47 #include <linux/pci.h>
48 #include <trace/events/kvm.h>
49
50 #define CREATE_TRACE_POINTS
51 #include "trace.h"
52
53 #include <asm/debugreg.h>
54 #include <asm/msr.h>
55 #include <asm/desc.h>
56 #include <asm/mtrr.h>
57 #include <asm/mce.h>
58 #include <asm/i387.h>
59 #include <asm/xcr.h>
60 #include <asm/pvclock.h>
61 #include <asm/div64.h>
62
/* NOTE(review): presumably the cap on MSR entries handled per request; users are not visible here - confirm. */
#define MAX_IO_MSRS 256
/* NOTE(review): looks like the number of machine-check banks supported; users are not visible here - confirm. */
#define KVM_MAX_MCE_BANKS 32
/* MCG capability bits covered by this mask: MCG_CTL_P and MCG_SER_P. */
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)

/* Map an emulation context back to the struct kvm_vcpu embedding it as arch.emulate_ctxt. */
#define emul_to_vcpu(ctxt) \
        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
69
/*
 * EFER defaults:
 * - SCE (syscall) is enabled by default because it is emulated by KVM
 * - LME and LMA are enabled by default on 64-bit KVM
 *
 * Every bit still set in efer_reserved_bits makes set_efer() reject the
 * value; kvm_enable_efer_bits() clears bits from this mask.
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
80
/*
 * Initializer helpers for debugfs_entries[]: each expands to two values,
 * the offset of counter stat.x inside struct kvm / struct kvm_vcpu,
 * followed by the matching KVM_STAT_VM / KVM_STAT_VCPU kind.
 */
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
83
/* Forward declarations of file-local (static) helpers. */
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                    struct kvm_cpuid_entry2 __user *entries);
static void process_nmi(struct kvm_vcpu *vcpu);
88
/*
 * Callback table that the code in this file invokes through
 * kvm_x86_ops->...; exported (GPL) so other modules can reference it.
 */
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
91
92 int ignore_msrs = 0;
93 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
94
/*
 * Module parameter, default 500.
 * NOTE(review): the name suggests a lower bound, in microseconds, for
 * emulated timer periods; its users are not visible here - confirm.
 */
unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

/*
 * Exported TSC-control state.
 * NOTE(review): presumably filled in by the vendor module when TSC rate
 * control is available; no assignment is visible here - confirm.
 */
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32  kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
102
/* Number of slots in the shared-MSR tables below. */
#define KVM_NR_SHARED_MSRS 16

/*
 * Global description of the shared MSRs, filled in by
 * kvm_define_shared_msr().
 */
struct kvm_shared_msrs_global {
        int nr;                         /* one more than the highest slot defined so far */
        u32 msrs[KVM_NR_SHARED_MSRS];   /* MSR index registered for each slot */
};
109
/*
 * Per-CPU shared-MSR state; one instance per CPU lives in the shared_msrs
 * per-cpu variable below.
 */
struct kvm_shared_msrs {
        struct user_return_notifier urn;        /* notifier whose callback is kvm_on_user_return() */
        bool registered;                        /* true while urn is registered */
        struct kvm_shared_msr_values {
                u64 host;       /* value recorded by shared_msr_update() */
                u64 curr;       /* value most recently written by this code */
        } values[KVM_NR_SHARED_MSRS];
};

/* Slot -> MSR index mapping shared by all CPUs. */
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
/* Per-CPU host/current values and notifier state. */
static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
121
/*
 * Table of statistics counters: each entry pairs a name with the
 * offset/kind pair produced by VCPU_STAT() or VM_STAT().  The table ends
 * with an entry whose name is NULL.
 * NOTE(review): presumably consumed by generic KVM code to create debugfs
 * files; the consumer is not visible here - confirm.
 */
struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
        { "tlb_flush", VCPU_STAT(tlb_flush) },
        { "invlpg", VCPU_STAT(invlpg) },
        { "exits", VCPU_STAT(exits) },
        { "io_exits", VCPU_STAT(io_exits) },
        { "mmio_exits", VCPU_STAT(mmio_exits) },
        { "signal_exits", VCPU_STAT(signal_exits) },
        { "irq_window", VCPU_STAT(irq_window_exits) },
        { "nmi_window", VCPU_STAT(nmi_window_exits) },
        { "halt_exits", VCPU_STAT(halt_exits) },
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "hypercalls", VCPU_STAT(hypercalls) },
        { "request_irq", VCPU_STAT(request_irq_exits) },
        { "irq_exits", VCPU_STAT(irq_exits) },
        { "host_state_reload", VCPU_STAT(host_state_reload) },
        { "efer_reload", VCPU_STAT(efer_reload) },
        { "fpu_reload", VCPU_STAT(fpu_reload) },
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
        { "irq_injections", VCPU_STAT(irq_injections) },
        { "nmi_injections", VCPU_STAT(nmi_injections) },
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
        { "mmu_flooded", VM_STAT(mmu_flooded) },
        { "mmu_recycled", VM_STAT(mmu_recycled) },
        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
        { "mmu_unsync", VM_STAT(mmu_unsync) },
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
        { "largepages", VM_STAT(lpages) },
        { NULL }        /* terminator */
};
157
/*
 * __kvm_set_xcr() rejects any guest XCR0 bit that is not set in this mask.
 * NOTE(review): presumably the host's own XCR0; the assignment is not
 * visible here - confirm.
 */
u64 __read_mostly host_xcr0;

/* Prototype only; the definition is not in the visible part of the file. */
int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
161
162 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
163 {
164         int i;
165         for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
166                 vcpu->arch.apf.gfns[i] = ~0;
167 }
168
169 static void kvm_on_user_return(struct user_return_notifier *urn)
170 {
171         unsigned slot;
172         struct kvm_shared_msrs *locals
173                 = container_of(urn, struct kvm_shared_msrs, urn);
174         struct kvm_shared_msr_values *values;
175         unsigned long flags;
176
177         /*
178          * Disabling irqs at this point since the following code could be
179          * interrupted and executed through kvm_arch_hardware_disable()
180          */
181         local_irq_save(flags);
182         if (locals->registered) {
183                 locals->registered = false;
184                 user_return_notifier_unregister(urn);
185         }
186         local_irq_restore(flags);
187         for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
188                 values = &locals->values[slot];
189                 if (values->host != values->curr) {
190                         wrmsrl(shared_msrs_global.msrs[slot], values->host);
191                         values->curr = values->host;
192                 }
193         }
194 }
195
196 static void shared_msr_update(unsigned slot, u32 msr)
197 {
198         struct kvm_shared_msrs *smsr;
199         u64 value;
200
201         smsr = &__get_cpu_var(shared_msrs);
202         /* only read, and nobody should modify it at this time,
203          * so don't need lock */
204         if (slot >= shared_msrs_global.nr) {
205                 printk(KERN_ERR "kvm: invalid MSR slot!");
206                 return;
207         }
208         rdmsrl_safe(msr, &value);
209         smsr->values[slot].host = value;
210         smsr->values[slot].curr = value;
211 }
212
213 void kvm_define_shared_msr(unsigned slot, u32 msr)
214 {
215         if (slot >= shared_msrs_global.nr)
216                 shared_msrs_global.nr = slot + 1;
217         shared_msrs_global.msrs[slot] = msr;
218         /* we need ensured the shared_msr_global have been updated */
219         smp_wmb();
220 }
221 EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
222
223 static void kvm_shared_msr_cpu_online(void)
224 {
225         unsigned i;
226
227         for (i = 0; i < shared_msrs_global.nr; ++i)
228                 shared_msr_update(i, shared_msrs_global.msrs[i]);
229 }
230
231 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
232 {
233         struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
234
235         if (((value ^ smsr->values[slot].curr) & mask) == 0)
236                 return;
237         smsr->values[slot].curr = value;
238         wrmsrl(shared_msrs_global.msrs[slot], value);
239         if (!smsr->registered) {
240                 smsr->urn.on_user_return = kvm_on_user_return;
241                 user_return_notifier_register(&smsr->urn);
242                 smsr->registered = true;
243         }
244 }
245 EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
246
247 static void drop_user_return_notifiers(void *ignore)
248 {
249         struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
250
251         if (smsr->registered)
252                 kvm_on_user_return(&smsr->urn);
253 }
254
255 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
256 {
257         if (irqchip_in_kernel(vcpu->kvm))
258                 return vcpu->arch.apic_base;
259         else
260                 return vcpu->arch.apic_base;
261 }
262 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
263
264 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
265 {
266         /* TODO: reserve bits check */
267         if (irqchip_in_kernel(vcpu->kvm))
268                 kvm_lapic_set_base(vcpu, data);
269         else
270                 vcpu->arch.apic_base = data;
271 }
272 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
273
274 #define EXCPT_BENIGN            0
275 #define EXCPT_CONTRIBUTORY      1
276 #define EXCPT_PF                2
277
278 static int exception_class(int vector)
279 {
280         switch (vector) {
281         case PF_VECTOR:
282                 return EXCPT_PF;
283         case DE_VECTOR:
284         case TS_VECTOR:
285         case NP_VECTOR:
286         case SS_VECTOR:
287         case GP_VECTOR:
288                 return EXCPT_CONTRIBUTORY;
289         default:
290                 break;
291         }
292         return EXCPT_BENIGN;
293 }
294
295 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
296                 unsigned nr, bool has_error, u32 error_code,
297                 bool reinject)
298 {
299         u32 prev_nr;
300         int class1, class2;
301
302         kvm_make_request(KVM_REQ_EVENT, vcpu);
303
304         if (!vcpu->arch.exception.pending) {
305         queue:
306                 vcpu->arch.exception.pending = true;
307                 vcpu->arch.exception.has_error_code = has_error;
308                 vcpu->arch.exception.nr = nr;
309                 vcpu->arch.exception.error_code = error_code;
310                 vcpu->arch.exception.reinject = reinject;
311                 return;
312         }
313
314         /* to check exception */
315         prev_nr = vcpu->arch.exception.nr;
316         if (prev_nr == DF_VECTOR) {
317                 /* triple fault -> shutdown */
318                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
319                 return;
320         }
321         class1 = exception_class(prev_nr);
322         class2 = exception_class(nr);
323         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
324                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
325                 /* generate double fault per SDM Table 5-5 */
326                 vcpu->arch.exception.pending = true;
327                 vcpu->arch.exception.has_error_code = true;
328                 vcpu->arch.exception.nr = DF_VECTOR;
329                 vcpu->arch.exception.error_code = 0;
330         } else
331                 /* replace previous exception with a new one in a hope
332                    that instruction re-execution will regenerate lost
333                    exception */
334                 goto queue;
335 }
336
337 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
338 {
339         kvm_multiple_exception(vcpu, nr, false, 0, false);
340 }
341 EXPORT_SYMBOL_GPL(kvm_queue_exception);
342
343 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
344 {
345         kvm_multiple_exception(vcpu, nr, false, 0, true);
346 }
347 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
348
349 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
350 {
351         if (err)
352                 kvm_inject_gp(vcpu, 0);
353         else
354                 kvm_x86_ops->skip_emulated_instruction(vcpu);
355 }
356 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
357
358 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
359 {
360         ++vcpu->stat.pf_guest;
361         vcpu->arch.cr2 = fault->address;
362         kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
363 }
364 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
365
366 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
367 {
368         if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
369                 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
370         else
371                 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
372 }
373
/*
 * Record one more queued NMI for @vcpu and raise KVM_REQ_NMI so that the
 * request gets processed.
 */
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
        /* Bump the counter before raising the request. */
        atomic_inc(&vcpu->arch.nmi_queued);
        kvm_make_request(KVM_REQ_NMI, vcpu);
}
379 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
380
381 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
382 {
383         kvm_multiple_exception(vcpu, nr, true, error_code, false);
384 }
385 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
386
387 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
388 {
389         kvm_multiple_exception(vcpu, nr, true, error_code, true);
390 }
391 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
392
393 /*
394  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
395  * a #GP and return false.
396  */
397 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
398 {
399         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
400                 return true;
401         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
402         return false;
403 }
404 EXPORT_SYMBOL_GPL(kvm_require_cpl);
405
406 /*
407  * This function will be used to read from the physical memory of the currently
408  * running guest. The difference to kvm_read_guest_page is that this function
409  * can read from guest physical or from the guest's guest physical memory.
410  */
411 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
412                             gfn_t ngfn, void *data, int offset, int len,
413                             u32 access)
414 {
415         gfn_t real_gfn;
416         gpa_t ngpa;
417
418         ngpa     = gfn_to_gpa(ngfn);
419         real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
420         if (real_gfn == UNMAPPED_GVA)
421                 return -EFAULT;
422
423         real_gfn = gpa_to_gfn(real_gfn);
424
425         return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
426 }
427 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
428
429 int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
430                                void *data, int offset, int len, u32 access)
431 {
432         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
433                                        data, offset, len, access);
434 }
435
436 /*
437  * Load the pae pdptrs.  Return true is they are all valid.
438  */
439 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
440 {
441         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
442         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
443         int i;
444         int ret;
445         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
446
447         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
448                                       offset * sizeof(u64), sizeof(pdpte),
449                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
450         if (ret < 0) {
451                 ret = 0;
452                 goto out;
453         }
454         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
455                 if (is_present_gpte(pdpte[i]) &&
456                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
457                         ret = 0;
458                         goto out;
459                 }
460         }
461         ret = 1;
462
463         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
464         __set_bit(VCPU_EXREG_PDPTR,
465                   (unsigned long *)&vcpu->arch.regs_avail);
466         __set_bit(VCPU_EXREG_PDPTR,
467                   (unsigned long *)&vcpu->arch.regs_dirty);
468 out:
469
470         return ret;
471 }
472 EXPORT_SYMBOL_GPL(load_pdptrs);
473
474 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
475 {
476         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
477         bool changed = true;
478         int offset;
479         gfn_t gfn;
480         int r;
481
482         if (is_long_mode(vcpu) || !is_pae(vcpu))
483                 return false;
484
485         if (!test_bit(VCPU_EXREG_PDPTR,
486                       (unsigned long *)&vcpu->arch.regs_avail))
487                 return true;
488
489         gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
490         offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
491         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
492                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
493         if (r < 0)
494                 goto out;
495         changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
496 out:
497
498         return changed;
499 }
500
/*
 * Validate and install a new guest CR0 value.
 *
 * Returns 0 once the value has been handed to kvm_x86_ops->set_cr0(), or 1
 * when the value is rejected (which happens before set_cr0() is called).
 */
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
        /* A change in any of these bits triggers an MMU context reset below. */
        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
                                    X86_CR0_CD | X86_CR0_NW;

        /* ET is forced on regardless of what the caller passed. */
        cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
        /* Any bit set in the upper 32 bits is rejected. */
        if (cr0 & 0xffffffff00000000UL)
                return 1;
#endif

        /* The remaining reserved bits are dropped rather than rejected. */
        cr0 &= ~CR0_RESERVED_BITS;

        /* NW without CD is rejected. */
        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
                return 1;

        /* PG without PE is rejected. */
        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
                return 1;

        /* Paging is being switched on: extra checks for this transition. */
        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
                if ((vcpu->arch.efer & EFER_LME)) {
                        int cs_db, cs_l;

                        /* With EFER.LME set, PAE must be on and cs_l must be clear. */
                        if (!is_pae(vcpu))
                                return 1;
                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
                        if (cs_l)
                                return 1;
                } else
#endif
                /* Otherwise, with PAE on, the PDPTRs must load from the current CR3. */
                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
                                                 kvm_read_cr3(vcpu)))
                        return 1;
        }

        kvm_x86_ops->set_cr0(vcpu, cr0);

        /* Toggling PG drops the async-PF completion queue and resets the hash. */
        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_async_pf_hash_reset(vcpu);
        }

        if ((cr0 ^ old_cr0) & update_bits)
                kvm_mmu_reset_context(vcpu);
        return 0;
}
550 EXPORT_SYMBOL_GPL(kvm_set_cr0);
551
/*
 * Load the machine status word: combine the low four bits of @msw with the
 * current CR0 (whose bits 1-3 are cleared first, so bit 0 can be set but
 * not cleared) and pass the result to kvm_set_cr0().  The result of
 * kvm_set_cr0() is ignored.
 */
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
        unsigned long kept = kvm_read_cr0_bits(vcpu, ~0x0eul);
        unsigned long requested = msw & 0x0f;

        (void)kvm_set_cr0(vcpu, kept | requested);
}
556 EXPORT_SYMBOL_GPL(kvm_lmsw);
557
558 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
559 {
560         u64 xcr0;
561
562         /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
563         if (index != XCR_XFEATURE_ENABLED_MASK)
564                 return 1;
565         xcr0 = xcr;
566         if (!(xcr0 & XSTATE_FP))
567                 return 1;
568         if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
569                 return 1;
570         if (xcr0 & ~host_xcr0)
571                 return 1;
572         vcpu->arch.xcr0 = xcr0;
573         vcpu->guest_xcr0_loaded = 0;
574         return 0;
575 }
576
577 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
578 {
579         if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
580             __kvm_set_xcr(vcpu, index, xcr)) {
581                 kvm_inject_gp(vcpu, 0);
582                 return 1;
583         }
584         return 0;
585 }
586 EXPORT_SYMBOL_GPL(kvm_set_xcr);
587
588 static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
589 {
590         struct kvm_cpuid_entry2 *best;
591
592         if (!static_cpu_has(X86_FEATURE_XSAVE))
593                 return 0;
594
595         best = kvm_find_cpuid_entry(vcpu, 1, 0);
596         return best && (best->ecx & bit(X86_FEATURE_XSAVE));
597 }
598
599 static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
600 {
601         struct kvm_cpuid_entry2 *best;
602
603         best = kvm_find_cpuid_entry(vcpu, 7, 0);
604         return best && (best->ebx & bit(X86_FEATURE_SMEP));
605 }
606
607 static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
608 {
609         struct kvm_cpuid_entry2 *best;
610
611         best = kvm_find_cpuid_entry(vcpu, 7, 0);
612         return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
613 }
614
615 static void update_cpuid(struct kvm_vcpu *vcpu)
616 {
617         struct kvm_cpuid_entry2 *best;
618         struct kvm_lapic *apic = vcpu->arch.apic;
619
620         best = kvm_find_cpuid_entry(vcpu, 1, 0);
621         if (!best)
622                 return;
623
624         /* Update OSXSAVE bit */
625         if (cpu_has_xsave && best->function == 0x1) {
626                 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
627                 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
628                         best->ecx |= bit(X86_FEATURE_OSXSAVE);
629         }
630
631         if (apic) {
632                 if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
633                         apic->lapic_timer.timer_mode_mask = 3 << 17;
634                 else
635                         apic->lapic_timer.timer_mode_mask = 1 << 17;
636         }
637 }
638
639 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
640 {
641         unsigned long old_cr4 = kvm_read_cr4(vcpu);
642         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
643                                    X86_CR4_PAE | X86_CR4_SMEP;
644         if (cr4 & CR4_RESERVED_BITS)
645                 return 1;
646
647         if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
648                 return 1;
649
650         if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
651                 return 1;
652
653         if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
654                 return 1;
655
656         if (is_long_mode(vcpu)) {
657                 if (!(cr4 & X86_CR4_PAE))
658                         return 1;
659         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
660                    && ((cr4 ^ old_cr4) & pdptr_bits)
661                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
662                                    kvm_read_cr3(vcpu)))
663                 return 1;
664
665         if (kvm_x86_ops->set_cr4(vcpu, cr4))
666                 return 1;
667
668         if ((cr4 ^ old_cr4) & pdptr_bits)
669                 kvm_mmu_reset_context(vcpu);
670
671         if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
672                 update_cpuid(vcpu);
673
674         return 0;
675 }
676 EXPORT_SYMBOL_GPL(kvm_set_cr4);
677
678 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
679 {
680         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
681                 kvm_mmu_sync_roots(vcpu);
682                 kvm_mmu_flush_tlb(vcpu);
683                 return 0;
684         }
685
686         if (is_long_mode(vcpu)) {
687                 if (cr3 & CR3_L_MODE_RESERVED_BITS)
688                         return 1;
689         } else {
690                 if (is_pae(vcpu)) {
691                         if (cr3 & CR3_PAE_RESERVED_BITS)
692                                 return 1;
693                         if (is_paging(vcpu) &&
694                             !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
695                                 return 1;
696                 }
697                 /*
698                  * We don't check reserved bits in nonpae mode, because
699                  * this isn't enforced, and VMware depends on this.
700                  */
701         }
702
703         /*
704          * Does the new cr3 value map to physical memory? (Note, we
705          * catch an invalid cr3 even in real-mode, because it would
706          * cause trouble later on when we turn on paging anyway.)
707          *
708          * A real CPU would silently accept an invalid cr3 and would
709          * attempt to use it - with largely undefined (and often hard
710          * to debug) behavior on the guest side.
711          */
712         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
713                 return 1;
714         vcpu->arch.cr3 = cr3;
715         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
716         vcpu->arch.mmu.new_cr3(vcpu);
717         return 0;
718 }
719 EXPORT_SYMBOL_GPL(kvm_set_cr3);
720
721 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
722 {
723         if (cr8 & CR8_RESERVED_BITS)
724                 return 1;
725         if (irqchip_in_kernel(vcpu->kvm))
726                 kvm_lapic_set_tpr(vcpu, cr8);
727         else
728                 vcpu->arch.cr8 = cr8;
729         return 0;
730 }
731 EXPORT_SYMBOL_GPL(kvm_set_cr8);
732
733 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
734 {
735         if (irqchip_in_kernel(vcpu->kvm))
736                 return kvm_lapic_get_cr8(vcpu);
737         else
738                 return vcpu->arch.cr8;
739 }
740 EXPORT_SYMBOL_GPL(kvm_get_cr8);
741
/*
 * Store @val into the guest debug register selected by @dr.
 *
 * Returns 0 on success, 1 when the access should raise #UD and -1 when it
 * should raise #GP; kvm_set_dr() turns those codes into the exceptions.
 */
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
        switch (dr) {
        case 0 ... 3:
                vcpu->arch.db[dr] = val;
                /* eff_db follows the guest value unless KVM_GUESTDBG_USE_HW_BP is set. */
                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
                        vcpu->arch.eff_db[dr] = val;
                break;
        case 4:
                /* DR4 is refused when CR4.DE is set; otherwise it is handled as DR6. */
                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
                        return 1; /* #UD */
                /* fall through */
        case 6:
                /* The upper 32 bits must be zero. */
                if (val & 0xffffffff00000000ULL)
                        return -1; /* #GP */
                vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
                break;
        case 5:
                /* DR5 is refused when CR4.DE is set; otherwise it is handled as DR7. */
                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
                        return 1; /* #UD */
                /* fall through */
        default: /* 7 */
                /* The upper 32 bits must be zero. */
                if (val & 0xffffffff00000000ULL)
                        return -1; /* #GP */
                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
                /* Push DR7 to the backend only when KVM_GUESTDBG_USE_HW_BP is not set. */
                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
                        kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
                        vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
                }
                break;
        }

        return 0;
}
776
777 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
778 {
779         int res;
780
781         res = __kvm_set_dr(vcpu, dr, val);
782         if (res > 0)
783                 kvm_queue_exception(vcpu, UD_VECTOR);
784         else if (res < 0)
785                 kvm_inject_gp(vcpu, 0);
786
787         return res;
788 }
789 EXPORT_SYMBOL_GPL(kvm_set_dr);
790
791 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
792 {
793         switch (dr) {
794         case 0 ... 3:
795                 *val = vcpu->arch.db[dr];
796                 break;
797         case 4:
798                 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
799                         return 1;
800                 /* fall through */
801         case 6:
802                 *val = vcpu->arch.dr6;
803                 break;
804         case 5:
805                 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
806                         return 1;
807                 /* fall through */
808         default: /* 7 */
809                 *val = vcpu->arch.dr7;
810                 break;
811         }
812
813         return 0;
814 }
815
816 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
817 {
818         if (_kvm_get_dr(vcpu, dr, val)) {
819                 kvm_queue_exception(vcpu, UD_VECTOR);
820                 return 1;
821         }
822         return 0;
823 }
824 EXPORT_SYMBOL_GPL(kvm_get_dr);
825
/*
 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
 * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the capabilities of
 * the host CPU.  That capabilities test skips the MSRs that are
 * KVM-specific; those are placed at the beginning of the list.
 */

/*
 * Number of entries at the head of msrs_to_save[] that precede
 * MSR_IA32_SYSENTER_CS (currently nine).
 * NOTE(review): presumably the count of KVM-specific entries skipped by
 * the capabilities test; keep it in sync when adding entries - confirm.
 */
#define KVM_SAVE_MSRS_BEGIN     9
static u32 msrs_to_save[] = {
        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
        HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_STAR,
#ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
        MSR_TSC_AUX,
};

/* NOTE(review): presumably the number of valid entries in msrs_to_save[] after the load-time filtering - confirm. */
static unsigned num_msrs_to_save;
851
/*
 * NOTE(review): presumably MSRs that KVM emulates regardless of host
 * support; how the list is consumed is not visible here - confirm.
 */
static u32 emulated_msrs[] = {
        MSR_IA32_TSCDEADLINE,
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
};
858
859 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
860 {
861         u64 old_efer = vcpu->arch.efer;
862
863         if (efer & efer_reserved_bits)
864                 return 1;
865
866         if (is_paging(vcpu)
867             && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
868                 return 1;
869
870         if (efer & EFER_FFXSR) {
871                 struct kvm_cpuid_entry2 *feat;
872
873                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
874                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
875                         return 1;
876         }
877
878         if (efer & EFER_SVME) {
879                 struct kvm_cpuid_entry2 *feat;
880
881                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
882                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
883                         return 1;
884         }
885
886         efer &= ~EFER_LMA;
887         efer |= vcpu->arch.efer & EFER_LMA;
888
889         kvm_x86_ops->set_efer(vcpu, efer);
890
891         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
892
893         /* Update reserved bits */
894         if ((efer ^ old_efer) & EFER_NX)
895                 kvm_mmu_reset_context(vcpu);
896
897         return 0;
898 }
899
/*
 * Remove the bits in @mask from efer_reserved_bits, so that set_efer() no
 * longer rejects them as reserved.
 */
void kvm_enable_efer_bits(u64 mask)
{
       efer_reserved_bits &= ~mask;
}
904 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
905
906 /*
907  * Writes msr value into into the appropriate "register".
908  * Returns 0 on success, non-0 otherwise.
909  * Assumes vcpu_load() was already called.
910  */
911 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
912 {
913         switch (msr_index) {
914         case MSR_FS_BASE:
915         case MSR_GS_BASE:
916         case MSR_KERNEL_GS_BASE:
917         case MSR_CSTAR:
918         case MSR_LSTAR:
919                 if (is_noncanonical_address(data))
920                         return 1;
921                 break;
922         case MSR_IA32_SYSENTER_EIP:
923         case MSR_IA32_SYSENTER_ESP:
924                 /*
925                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
926                  * non-canonical address is written on Intel but not on
927                  * AMD (which ignores the top 32-bits, because it does
928                  * not implement 64-bit SYSENTER).
929                  *
930                  * 64-bit code should hence be able to write a non-canonical
931                  * value on AMD.  Making the address canonical ensures that
932                  * vmentry does not fail on Intel after writing a non-canonical
933                  * value, and that something deterministic happens if the guest
934                  * invokes 64-bit SYSENTER.
935                  */
936                 data = get_canonical(data);
937         }
938         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
939 }
940 EXPORT_SYMBOL_GPL(kvm_set_msr);
941
942 /*
943  * Adapt set_msr() to msr_io()'s calling convention
944  */
945 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
946 {
947         return kvm_set_msr(vcpu, index, *data);
948 }
949
950 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
951 {
952         int version;
953         int r;
954         struct pvclock_wall_clock wc;
955         struct timespec boot;
956
957         if (!wall_clock)
958                 return;
959
960         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
961         if (r)
962                 return;
963
964         if (version & 1)
965                 ++version;  /* first time write, random junk */
966
967         ++version;
968
969         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
970
971         /*
972          * The guest calculates current wall clock time by adding
973          * system time (updated by kvm_guest_time_update below) to the
974          * wall clock specified here.  guest system time equals host
975          * system time for us, thus we must fill in host boot time here.
976          */
977         getboottime(&boot);
978
979         wc.sec = boot.tv_sec;
980         wc.nsec = boot.tv_nsec;
981         wc.version = version;
982
983         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
984
985         version++;
986         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
987 }
988
/*
 * Return the 32-bit quotient of "(dividend << 32) / divisor".
 *
 * The 64-by-32 divide is issued directly as an x86 "divl" with the dividend
 * in EDX and zero in EAX; do not replace it with do_div(), which computes
 * something else.
 */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
        uint32_t quot, rem;

        __asm__ ( "divl %[den]"
                  : "=a" (quot), "=d" (rem)
                  : "0" (0), "1" (dividend), [den] "r" (divisor) );

        return quot;
}
1000
1001 static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
1002                                s8 *pshift, u32 *pmultiplier)
1003 {
1004         uint64_t scaled64;
1005         int32_t  shift = 0;
1006         uint64_t tps64;
1007         uint32_t tps32;
1008
1009         tps64 = base_khz * 1000LL;
1010         scaled64 = scaled_khz * 1000LL;
1011         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1012                 tps64 >>= 1;
1013                 shift--;
1014         }
1015
1016         tps32 = (uint32_t)tps64;
1017         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1018                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1019                         scaled64 >>= 1;
1020                 else
1021                         tps32 <<= 1;
1022                 shift++;
1023         }
1024
1025         *pshift = shift;
1026         *pmultiplier = div_frac(scaled64, tps32);
1027
1028         pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
1029                  __func__, base_khz, scaled_khz, shift, *pmultiplier);
1030 }
1031
1032 static inline u64 get_kernel_ns(void)
1033 {
1034         struct timespec ts;
1035
1036         WARN_ON(preemptible());
1037         ktime_get_ts(&ts);
1038         monotonic_to_bootbased(&ts);
1039         return timespec_to_ns(&ts);
1040 }
1041
1042 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1043 unsigned long max_tsc_khz;
1044
1045 static inline int kvm_tsc_changes_freq(void)
1046 {
1047         int cpu = get_cpu();
1048         int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
1049                   cpufreq_quick_get(cpu) != 0;
1050         put_cpu();
1051         return ret;
1052 }
1053
1054 u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
1055 {
1056         if (vcpu->arch.virtual_tsc_khz)
1057                 return vcpu->arch.virtual_tsc_khz;
1058         else
1059                 return __this_cpu_read(cpu_tsc_khz);
1060 }
1061
1062 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1063 {
1064         u64 ret;
1065
1066         WARN_ON(preemptible());
1067         if (kvm_tsc_changes_freq())
1068                 printk_once(KERN_WARNING
1069                  "kvm: unreliable cycle conversion on adjustable rate TSC\n");
1070         ret = nsec * vcpu_tsc_khz(vcpu);
1071         do_div(ret, USEC_PER_SEC);
1072         return ret;
1073 }
1074
1075 static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1076 {
1077         /* Compute a scale to convert nanoseconds in TSC cycles */
1078         kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1079                            &vcpu->arch.tsc_catchup_shift,
1080                            &vcpu->arch.tsc_catchup_mult);
1081 }
1082
1083 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1084 {
1085         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1086                                       vcpu->arch.tsc_catchup_mult,
1087                                       vcpu->arch.tsc_catchup_shift);
1088         tsc += vcpu->arch.last_tsc_write;
1089         return tsc;
1090 }
1091
1092 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1093 {
1094         struct kvm *kvm = vcpu->kvm;
1095         u64 offset, ns, elapsed;
1096         unsigned long flags;
1097         s64 sdiff;
1098
1099         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1100         offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1101         ns = get_kernel_ns();
1102         elapsed = ns - kvm->arch.last_tsc_nsec;
1103         sdiff = data - kvm->arch.last_tsc_write;
1104         if (sdiff < 0)
1105                 sdiff = -sdiff;
1106
1107         /*
1108          * Special case: close write to TSC within 5 seconds of
1109          * another CPU is interpreted as an attempt to synchronize
1110          * The 5 seconds is to accommodate host load / swapping as
1111          * well as any reset of TSC during the boot process.
1112          *
1113          * In that case, for a reliable TSC, we can match TSC offsets,
1114          * or make a best guest using elapsed value.
1115          */
1116         if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
1117             elapsed < 5ULL * NSEC_PER_SEC) {
1118                 if (!check_tsc_unstable()) {
1119                         offset = kvm->arch.last_tsc_offset;
1120                         pr_debug("kvm: matched tsc offset for %llu\n", data);
1121                 } else {
1122                         u64 delta = nsec_to_cycles(vcpu, elapsed);
1123                         offset += delta;
1124                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1125                 }
1126                 ns = kvm->arch.last_tsc_nsec;
1127         }
1128         kvm->arch.last_tsc_nsec = ns;
1129         kvm->arch.last_tsc_write = data;
1130         kvm->arch.last_tsc_offset = offset;
1131         kvm_x86_ops->write_tsc_offset(vcpu, offset);
1132         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1133
1134         /* Reset of TSC must disable overshoot protection below */
1135         vcpu->arch.hv_clock.tsc_timestamp = 0;
1136         vcpu->arch.last_tsc_write = data;
1137         vcpu->arch.last_tsc_nsec = ns;
1138 }
1139 EXPORT_SYMBOL_GPL(kvm_write_tsc);
1140
1141 static int kvm_guest_time_update(struct kvm_vcpu *v)
1142 {
1143         unsigned long flags;
1144         struct kvm_vcpu_arch *vcpu = &v->arch;
1145         unsigned long this_tsc_khz;
1146         s64 kernel_ns, max_kernel_ns;
1147         u64 tsc_timestamp;
1148
1149         /* Keep irq disabled to prevent changes to the clock */
1150         local_irq_save(flags);
1151         tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1152         kernel_ns = get_kernel_ns();
1153         this_tsc_khz = vcpu_tsc_khz(v);
1154         if (unlikely(this_tsc_khz == 0)) {
1155                 local_irq_restore(flags);
1156                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1157                 return 1;
1158         }
1159
1160         /*
1161          * We may have to catch up the TSC to match elapsed wall clock
1162          * time for two reasons, even if kvmclock is used.
1163          *   1) CPU could have been running below the maximum TSC rate
1164          *   2) Broken TSC compensation resets the base at each VCPU
1165          *      entry to avoid unknown leaps of TSC even when running
1166          *      again on the same CPU.  This may cause apparent elapsed
1167          *      time to disappear, and the guest to stand still or run
1168          *      very slowly.
1169          */
1170         if (vcpu->tsc_catchup) {
1171                 u64 tsc = compute_guest_tsc(v, kernel_ns);
1172                 if (tsc > tsc_timestamp) {
1173                         kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1174                         tsc_timestamp = tsc;
1175                 }
1176         }
1177
1178         local_irq_restore(flags);
1179
1180         if (!vcpu->pv_time_enabled)
1181                 return 0;
1182
1183         /*
1184          * Time as measured by the TSC may go backwards when resetting the base
1185          * tsc_timestamp.  The reason for this is that the TSC resolution is
1186          * higher than the resolution of the other clock scales.  Thus, many
1187          * possible measurments of the TSC correspond to one measurement of any
1188          * other clock, and so a spread of values is possible.  This is not a
1189          * problem for the computation of the nanosecond clock; with TSC rates
1190          * around 1GHZ, there can only be a few cycles which correspond to one
1191          * nanosecond value, and any path through this code will inevitably
1192          * take longer than that.  However, with the kernel_ns value itself,
1193          * the precision may be much lower, down to HZ granularity.  If the
1194          * first sampling of TSC against kernel_ns ends in the low part of the
1195          * range, and the second in the high end of the range, we can get:
1196          *
1197          * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1198          *
1199          * As the sampling errors potentially range in the thousands of cycles,
1200          * it is possible such a time value has already been observed by the
1201          * guest.  To protect against this, we must compute the system time as
1202          * observed by the guest and ensure the new system time is greater.
1203          */
1204         max_kernel_ns = 0;
1205         if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1206                 max_kernel_ns = vcpu->last_guest_tsc -
1207                                 vcpu->hv_clock.tsc_timestamp;
1208                 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1209                                     vcpu->hv_clock.tsc_to_system_mul,
1210                                     vcpu->hv_clock.tsc_shift);
1211                 max_kernel_ns += vcpu->last_kernel_ns;
1212         }
1213
1214         if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1215                 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1216                                    &vcpu->hv_clock.tsc_shift,
1217                                    &vcpu->hv_clock.tsc_to_system_mul);
1218                 vcpu->hw_tsc_khz = this_tsc_khz;
1219         }
1220
1221         if (max_kernel_ns > kernel_ns)
1222                 kernel_ns = max_kernel_ns;
1223
1224         /* With all the info we got, fill in the values */
1225         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1226         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1227         vcpu->last_kernel_ns = kernel_ns;
1228         vcpu->last_guest_tsc = tsc_timestamp;
1229         vcpu->hv_clock.flags = 0;
1230
1231         /*
1232          * The interface expects us to write an even number signaling that the
1233          * update is finished. Since the guest won't see the intermediate
1234          * state, we just increase by 2 at the end.
1235          */
1236         vcpu->hv_clock.version += 2;
1237
1238         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1239                                 &vcpu->hv_clock,
1240                                 sizeof(vcpu->hv_clock));
1241         return 0;
1242 }
1243
1244 static bool msr_mtrr_valid(unsigned msr)
1245 {
1246         switch (msr) {
1247         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1248         case MSR_MTRRfix64K_00000:
1249         case MSR_MTRRfix16K_80000:
1250         case MSR_MTRRfix16K_A0000:
1251         case MSR_MTRRfix4K_C0000:
1252         case MSR_MTRRfix4K_C8000:
1253         case MSR_MTRRfix4K_D0000:
1254         case MSR_MTRRfix4K_D8000:
1255         case MSR_MTRRfix4K_E0000:
1256         case MSR_MTRRfix4K_E8000:
1257         case MSR_MTRRfix4K_F0000:
1258         case MSR_MTRRfix4K_F8000:
1259         case MSR_MTRRdefType:
1260         case MSR_IA32_CR_PAT:
1261                 return true;
1262         case 0x2f8:
1263                 return true;
1264         }
1265         return false;
1266 }
1267
/* True when @t is an accepted PAT memory type: 0, 1, 4, 5, 6 or 7. */
static bool valid_pat_type(unsigned t)
{
        const unsigned accepted = 0xf3; /* bit n set <=> type n accepted */

        if (t >= 8)
                return false;

        return (accepted >> t) & 1;
}
1272
/* True when @t is an accepted MTRR memory type: 0, 1, 4, 5 or 6. */
static bool valid_mtrr_type(unsigned t)
{
        const unsigned accepted = 0x73; /* bit n set <=> type n accepted */

        if (t >= 8)
                return false;

        return (accepted >> t) & 1;
}
1277
1278 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1279 {
1280         int i;
1281
1282         if (!msr_mtrr_valid(msr))
1283                 return false;
1284
1285         if (msr == MSR_IA32_CR_PAT) {
1286                 for (i = 0; i < 8; i++)
1287                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
1288                                 return false;
1289                 return true;
1290         } else if (msr == MSR_MTRRdefType) {
1291                 if (data & ~0xcff)
1292                         return false;
1293                 return valid_mtrr_type(data & 0xff);
1294         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1295                 for (i = 0; i < 8 ; i++)
1296                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1297                                 return false;
1298                 return true;
1299         }
1300
1301         /* variable MTRRs */
1302         return valid_mtrr_type(data & 0xff);
1303 }
1304
1305 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1306 {
1307         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1308
1309         if (!mtrr_valid(vcpu, msr, data))
1310                 return 1;
1311
1312         if (msr == MSR_MTRRdefType) {
1313                 vcpu->arch.mtrr_state.def_type = data;
1314                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1315         } else if (msr == MSR_MTRRfix64K_00000)
1316                 p[0] = data;
1317         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1318                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
1319         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1320                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1321         else if (msr == MSR_IA32_CR_PAT)
1322                 vcpu->arch.pat = data;
1323         else {  /* Variable MTRRs */
1324                 int idx, is_mtrr_mask;
1325                 u64 *pt;
1326
1327                 idx = (msr - 0x200) / 2;
1328                 is_mtrr_mask = msr - 0x200 - 2 * idx;
1329                 if (!is_mtrr_mask)
1330                         pt =
1331                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1332                 else
1333                         pt =
1334                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1335                 *pt = data;
1336         }
1337
1338         kvm_mmu_reset_context(vcpu);
1339         return 0;
1340 }
1341
1342 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1343 {
1344         u64 mcg_cap = vcpu->arch.mcg_cap;
1345         unsigned bank_num = mcg_cap & 0xff;
1346
1347         switch (msr) {
1348         case MSR_IA32_MCG_STATUS:
1349                 vcpu->arch.mcg_status = data;
1350                 break;
1351         case MSR_IA32_MCG_CTL:
1352                 if (!(mcg_cap & MCG_CTL_P))
1353                         return 1;
1354                 if (data != 0 && data != ~(u64)0)
1355                         return -1;
1356                 vcpu->arch.mcg_ctl = data;
1357                 break;
1358         default:
1359                 if (msr >= MSR_IA32_MC0_CTL &&
1360                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1361                         u32 offset = msr - MSR_IA32_MC0_CTL;
1362                         /* only 0 or all 1s can be written to IA32_MCi_CTL
1363                          * some Linux kernels though clear bit 10 in bank 4 to
1364                          * workaround a BIOS/GART TBL issue on AMD K8s, ignore
1365                          * this to avoid an uncatched #GP in the guest
1366                          */
1367                         if ((offset & 0x3) == 0 &&
1368                             data != 0 && (data | (1 << 10)) != ~(u64)0)
1369                                 return -1;
1370                         vcpu->arch.mce_banks[offset] = data;
1371                         break;
1372                 }
1373                 return 1;
1374         }
1375         return 0;
1376 }
1377
1378 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1379 {
1380         struct kvm *kvm = vcpu->kvm;
1381         int lm = is_long_mode(vcpu);
1382         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1383                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1384         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1385                 : kvm->arch.xen_hvm_config.blob_size_32;
1386         u32 page_num = data & ~PAGE_MASK;
1387         u64 page_addr = data & PAGE_MASK;
1388         u8 *page;
1389         int r;
1390
1391         r = -E2BIG;
1392         if (page_num >= blob_size)
1393                 goto out;
1394         r = -ENOMEM;
1395         page = kzalloc(PAGE_SIZE, GFP_KERNEL);
1396         if (!page)
1397                 goto out;
1398         r = -EFAULT;
1399         if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
1400                 goto out_free;
1401         if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1402                 goto out_free;
1403         r = 0;
1404 out_free:
1405         kfree(page);
1406 out:
1407         return r;
1408 }
1409
1410 static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1411 {
1412         return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1413 }
1414
1415 static bool kvm_hv_msr_partition_wide(u32 msr)
1416 {
1417         bool r = false;
1418         switch (msr) {
1419         case HV_X64_MSR_GUEST_OS_ID:
1420         case HV_X64_MSR_HYPERCALL:
1421                 r = true;
1422                 break;
1423         }
1424
1425         return r;
1426 }
1427
1428 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1429 {
1430         struct kvm *kvm = vcpu->kvm;
1431
1432         switch (msr) {
1433         case HV_X64_MSR_GUEST_OS_ID:
1434                 kvm->arch.hv_guest_os_id = data;
1435                 /* setting guest os id to zero disables hypercall page */
1436                 if (!kvm->arch.hv_guest_os_id)
1437                         kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1438                 break;
1439         case HV_X64_MSR_HYPERCALL: {
1440                 u64 gfn;
1441                 unsigned long addr;
1442                 u8 instructions[4];
1443
1444                 /* if guest os id is not set hypercall should remain disabled */
1445                 if (!kvm->arch.hv_guest_os_id)
1446                         break;
1447                 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1448                         kvm->arch.hv_hypercall = data;
1449                         break;
1450                 }
1451                 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1452                 addr = gfn_to_hva(kvm, gfn);
1453                 if (kvm_is_error_hva(addr))
1454                         return 1;
1455                 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1456                 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1457                 if (__copy_to_user((void __user *)addr, instructions, 4))
1458                         return 1;
1459                 kvm->arch.hv_hypercall = data;
1460                 break;
1461         }
1462         default:
1463                 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1464                           "data 0x%llx\n", msr, data);
1465                 return 1;
1466         }
1467         return 0;
1468 }
1469
1470 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1471 {
1472         switch (msr) {
1473         case HV_X64_MSR_APIC_ASSIST_PAGE: {
1474                 unsigned long addr;
1475
1476                 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1477                         vcpu->arch.hv_vapic = data;
1478                         break;
1479                 }
1480                 addr = gfn_to_hva(vcpu->kvm, data >>
1481                                   HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1482                 if (kvm_is_error_hva(addr))
1483                         return 1;
1484                 if (__clear_user((void __user *)addr, PAGE_SIZE))
1485                         return 1;
1486                 vcpu->arch.hv_vapic = data;
1487                 break;
1488         }
1489         case HV_X64_MSR_EOI:
1490                 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1491         case HV_X64_MSR_ICR:
1492                 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1493         case HV_X64_MSR_TPR:
1494                 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1495         default:
1496                 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1497                           "data 0x%llx\n", msr, data);
1498                 return 1;
1499         }
1500
1501         return 0;
1502 }
1503
1504 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1505 {
1506         gpa_t gpa = data & ~0x3f;
1507
1508         /* Bits 2:5 are resrved, Should be zero */
1509         if (data & 0x3c)
1510                 return 1;
1511
1512         vcpu->arch.apf.msr_val = data;
1513
1514         if (!(data & KVM_ASYNC_PF_ENABLED)) {
1515                 kvm_clear_async_pf_completion_queue(vcpu);
1516                 kvm_async_pf_hash_reset(vcpu);
1517                 return 0;
1518         }
1519
1520         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
1521                                         sizeof(u32)))
1522                 return 1;
1523
1524         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1525         kvm_async_pf_wakeup_all(vcpu);
1526         return 0;
1527 }
1528
1529 static void kvmclock_reset(struct kvm_vcpu *vcpu)
1530 {
1531         vcpu->arch.pv_time_enabled = false;
1532 }
1533
1534 static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1535 {
1536         u64 delta;
1537
1538         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1539                 return;
1540
1541         delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1542         vcpu->arch.st.last_steal = current->sched_info.run_delay;
1543         vcpu->arch.st.accum_steal = delta;
1544 }
1545
1546 static void record_steal_time(struct kvm_vcpu *vcpu)
1547 {
1548         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1549                 return;
1550
1551         if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1552                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1553                 return;
1554
1555         vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1556         vcpu->arch.st.steal.version += 2;
1557         vcpu->arch.st.accum_steal = 0;
1558
1559         kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1560                 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1561 }
1562
1563 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1564 {
1565         switch (msr) {
1566         case MSR_EFER:
1567                 return set_efer(vcpu, data);
1568         case MSR_K7_HWCR:
1569                 data &= ~(u64)0x40;     /* ignore flush filter disable */
1570                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
1571                 if (data != 0) {
1572                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1573                                 data);
1574                         return 1;
1575                 }
1576                 break;
1577         case MSR_FAM10H_MMIO_CONF_BASE:
1578                 if (data != 0) {
1579                         pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1580                                 "0x%llx\n", data);
1581                         return 1;
1582                 }
1583                 break;
1584         case MSR_AMD64_NB_CFG:
1585                 break;
1586         case MSR_IA32_DEBUGCTLMSR:
1587                 if (!data) {
1588                         /* We support the non-activated case already */
1589                         break;
1590                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
1591                         /* Values other than LBR and BTF are vendor-specific,
1592                            thus reserved and should throw a #GP */
1593                         return 1;
1594                 }
1595                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1596                         __func__, data);
1597                 break;
1598         case MSR_IA32_UCODE_REV:
1599         case MSR_IA32_UCODE_WRITE:
1600         case MSR_VM_HSAVE_PA:
1601         case MSR_AMD64_PATCH_LOADER:
1602                 break;
1603         case 0x200 ... 0x2ff:
1604                 return set_msr_mtrr(vcpu, msr, data);
1605         case MSR_IA32_APICBASE:
1606                 kvm_set_apic_base(vcpu, data);
1607                 break;
1608         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1609                 return kvm_x2apic_msr_write(vcpu, msr, data);
1610         case MSR_IA32_TSCDEADLINE:
1611                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1612                 break;
1613         case MSR_IA32_MISC_ENABLE:
1614                 vcpu->arch.ia32_misc_enable_msr = data;
1615                 break;
1616         case MSR_KVM_WALL_CLOCK_NEW:
1617         case MSR_KVM_WALL_CLOCK:
1618                 vcpu->kvm->arch.wall_clock = data;
1619                 kvm_write_wall_clock(vcpu->kvm, data);
1620                 break;
1621         case MSR_KVM_SYSTEM_TIME_NEW:
1622         case MSR_KVM_SYSTEM_TIME: {
1623                 u64 gpa_offset;
1624                 kvmclock_reset(vcpu);
1625
1626                 vcpu->arch.time = data;
1627                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1628
1629                 /* we verify if the enable bit is set... */
1630                 if (!(data & 1))
1631                         break;
1632
1633                 gpa_offset = data & ~(PAGE_MASK | 1);
1634
1635                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
1636                      &vcpu->arch.pv_time, data & ~1ULL,
1637                      sizeof(struct pvclock_vcpu_time_info)))
1638                         vcpu->arch.pv_time_enabled = false;
1639                 else
1640                         vcpu->arch.pv_time_enabled = true;
1641                 break;
1642         }
1643         case MSR_KVM_ASYNC_PF_EN:
1644                 if (kvm_pv_enable_async_pf(vcpu, data))
1645                         return 1;
1646                 break;
1647         case MSR_KVM_STEAL_TIME:
1648
1649                 if (unlikely(!sched_info_on()))
1650                         return 1;
1651
1652                 if (data & KVM_STEAL_RESERVED_MASK)
1653                         return 1;
1654
1655                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1656                                                 data & KVM_STEAL_VALID_BITS,
1657                                                 sizeof(struct kvm_steal_time)))
1658                         return 1;
1659
1660                 vcpu->arch.st.msr_val = data;
1661
1662                 if (!(data & KVM_MSR_ENABLED))
1663                         break;
1664
1665                 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1666
1667                 preempt_disable();
1668                 accumulate_steal_time(vcpu);
1669                 preempt_enable();
1670
1671                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1672
1673                 break;
1674
1675         case MSR_IA32_MCG_CTL:
1676         case MSR_IA32_MCG_STATUS:
1677         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1678                 return set_msr_mce(vcpu, msr, data);
1679
1680         /* Performance counters are not protected by a CPUID bit,
1681          * so we should check all of them in the generic path for the sake of
1682          * cross vendor migration.
1683          * Writing a zero into the event select MSRs disables them,
1684          * which we perfectly emulate ;-). Any other value should be at least
1685          * reported, some guests depend on them.
1686          */
1687         case MSR_P6_EVNTSEL0:
1688         case MSR_P6_EVNTSEL1:
1689         case MSR_K7_EVNTSEL0:
1690         case MSR_K7_EVNTSEL1:
1691         case MSR_K7_EVNTSEL2:
1692         case MSR_K7_EVNTSEL3:
1693                 if (data != 0)
1694                         pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1695                                 "0x%x data 0x%llx\n", msr, data);
1696                 break;
1697         /* at least RHEL 4 unconditionally writes to the perfctr registers,
1698          * so we ignore writes to make it happy.
1699          */
1700         case MSR_P6_PERFCTR0:
1701         case MSR_P6_PERFCTR1:
1702         case MSR_K7_PERFCTR0:
1703         case MSR_K7_PERFCTR1:
1704         case MSR_K7_PERFCTR2:
1705         case MSR_K7_PERFCTR3:
1706                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1707                         "0x%x data 0x%llx\n", msr, data);
1708                 break;
1709         case MSR_K7_CLK_CTL:
1710                 /*
1711                  * Ignore all writes to this no longer documented MSR.
1712                  * Writes are only relevant for old K7 processors,
1713                  * all pre-dating SVM, but a recommended workaround from
1714                  * AMD for these chips. It is possible to specify the
1715                  * affected processor models on the command line, hence
1716                  * the need to ignore the workaround.
1717                  */
1718                 break;
1719         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1720                 if (kvm_hv_msr_partition_wide(msr)) {
1721                         int r;
1722                         mutex_lock(&vcpu->kvm->lock);
1723                         r = set_msr_hyperv_pw(vcpu, msr, data);
1724                         mutex_unlock(&vcpu->kvm->lock);
1725                         return r;
1726                 } else
1727                         return set_msr_hyperv(vcpu, msr, data);
1728                 break;
1729         case MSR_IA32_BBL_CR_CTL3:
1730                 /* Drop writes to this legacy MSR -- see rdmsr
1731                  * counterpart for further detail.
1732                  */
1733                 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1734                 break;
1735         default:
1736                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1737                         return xen_hvm_config(vcpu, data);
1738                 if (!ignore_msrs) {
1739                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1740                                 msr, data);
1741                         return 1;
1742                 } else {
1743                         pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1744                                 msr, data);
1745                         break;
1746                 }
1747         }
1748         return 0;
1749 }
1750 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1751
1752
1753 /*
1754  * Reads an msr value (of 'msr_index') into 'pdata'.
1755  * Returns 0 on success, non-0 otherwise.
1756  * Assumes vcpu_load() was already called.
1757  */
1758 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1759 {
1760         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1761 }
1762
1763 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1764 {
1765         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1766
1767         if (!msr_mtrr_valid(msr))
1768                 return 1;
1769
1770         if (msr == MSR_MTRRdefType)
1771                 *pdata = vcpu->arch.mtrr_state.def_type +
1772                          (vcpu->arch.mtrr_state.enabled << 10);
1773         else if (msr == MSR_MTRRfix64K_00000)
1774                 *pdata = p[0];
1775         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1776                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1777         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1778                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1779         else if (msr == MSR_IA32_CR_PAT)
1780                 *pdata = vcpu->arch.pat;
1781         else {  /* Variable MTRRs */
1782                 int idx, is_mtrr_mask;
1783                 u64 *pt;
1784
1785                 idx = (msr - 0x200) / 2;
1786                 is_mtrr_mask = msr - 0x200 - 2 * idx;
1787                 if (!is_mtrr_mask)
1788                         pt =
1789                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1790                 else
1791                         pt =
1792                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1793                 *pdata = *pt;
1794         }
1795
1796         return 0;
1797 }
1798
1799 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1800 {
1801         u64 data;
1802         u64 mcg_cap = vcpu->arch.mcg_cap;
1803         unsigned bank_num = mcg_cap & 0xff;
1804
1805         switch (msr) {
1806         case MSR_IA32_P5_MC_ADDR:
1807         case MSR_IA32_P5_MC_TYPE:
1808                 data = 0;
1809                 break;
1810         case MSR_IA32_MCG_CAP:
1811                 data = vcpu->arch.mcg_cap;
1812                 break;
1813         case MSR_IA32_MCG_CTL:
1814                 if (!(mcg_cap & MCG_CTL_P))
1815                         return 1;
1816                 data = vcpu->arch.mcg_ctl;
1817                 break;
1818         case MSR_IA32_MCG_STATUS:
1819                 data = vcpu->arch.mcg_status;
1820                 break;
1821         default:
1822                 if (msr >= MSR_IA32_MC0_CTL &&
1823                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1824                         u32 offset = msr - MSR_IA32_MC0_CTL;
1825                         data = vcpu->arch.mce_banks[offset];
1826                         break;
1827                 }
1828                 return 1;
1829         }
1830         *pdata = data;
1831         return 0;
1832 }
1833
1834 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1835 {
1836         u64 data = 0;
1837         struct kvm *kvm = vcpu->kvm;
1838
1839         switch (msr) {
1840         case HV_X64_MSR_GUEST_OS_ID:
1841                 data = kvm->arch.hv_guest_os_id;
1842                 break;
1843         case HV_X64_MSR_HYPERCALL:
1844                 data = kvm->arch.hv_hypercall;
1845                 break;
1846         default:
1847                 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1848                 return 1;
1849         }
1850
1851         *pdata = data;
1852         return 0;
1853 }
1854
1855 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1856 {
1857         u64 data = 0;
1858
1859         switch (msr) {
1860         case HV_X64_MSR_VP_INDEX: {
1861                 int r;
1862                 struct kvm_vcpu *v;
1863                 kvm_for_each_vcpu(r, v, vcpu->kvm)
1864                         if (v == vcpu)
1865                                 data = r;
1866                 break;
1867         }
1868         case HV_X64_MSR_EOI:
1869                 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1870         case HV_X64_MSR_ICR:
1871                 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1872         case HV_X64_MSR_TPR:
1873                 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1874         case HV_X64_MSR_APIC_ASSIST_PAGE:
1875                 data = vcpu->arch.hv_vapic;
1876                 break;
1877         default:
1878                 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1879                 return 1;
1880         }
1881         *pdata = data;
1882         return 0;
1883 }
1884
1885 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1886 {
1887         u64 data;
1888
1889         switch (msr) {
1890         case MSR_IA32_PLATFORM_ID:
1891         case MSR_IA32_EBL_CR_POWERON:
1892         case MSR_IA32_DEBUGCTLMSR:
1893         case MSR_IA32_LASTBRANCHFROMIP:
1894         case MSR_IA32_LASTBRANCHTOIP:
1895         case MSR_IA32_LASTINTFROMIP:
1896         case MSR_IA32_LASTINTTOIP:
1897         case MSR_K8_SYSCFG:
1898         case MSR_K8_TSEG_ADDR:
1899         case MSR_K8_TSEG_MASK:
1900         case MSR_K7_HWCR:
1901         case MSR_VM_HSAVE_PA:
1902         case MSR_P6_PERFCTR0:
1903         case MSR_P6_PERFCTR1:
1904         case MSR_P6_EVNTSEL0:
1905         case MSR_P6_EVNTSEL1:
1906         case MSR_K7_EVNTSEL0:
1907         case MSR_K7_PERFCTR0:
1908         case MSR_K8_INT_PENDING_MSG:
1909         case MSR_AMD64_NB_CFG:
1910         case MSR_FAM10H_MMIO_CONF_BASE:
1911                 data = 0;
1912                 break;
1913         case MSR_IA32_UCODE_REV:
1914                 data = 0x100000000ULL;
1915                 break;
1916         case MSR_MTRRcap:
1917                 data = 0x500 | KVM_NR_VAR_MTRR;
1918                 break;
1919         case 0x200 ... 0x2ff:
1920                 return get_msr_mtrr(vcpu, msr, pdata);
1921         case 0xcd: /* fsb frequency */
1922                 data = 3;
1923                 break;
1924                 /*
1925                  * MSR_EBC_FREQUENCY_ID
1926                  * Conservative value valid for even the basic CPU models.
1927                  * Models 0,1: 000 in bits 23:21 indicating a bus speed of
1928                  * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
1929                  * and 266MHz for model 3, or 4. Set Core Clock
1930                  * Frequency to System Bus Frequency Ratio to 1 (bits
1931                  * 31:24) even though these are only valid for CPU
1932                  * models > 2, however guests may end up dividing or
1933                  * multiplying by zero otherwise.
1934                  */
1935         case MSR_EBC_FREQUENCY_ID:
1936                 data = 1 << 24;
1937                 break;
1938         case MSR_IA32_APICBASE:
1939                 data = kvm_get_apic_base(vcpu);
1940                 break;
1941         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1942                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1943                 break;
1944         case MSR_IA32_TSCDEADLINE:
1945                 data = kvm_get_lapic_tscdeadline_msr(vcpu);
1946                 break;
1947         case MSR_IA32_MISC_ENABLE:
1948                 data = vcpu->arch.ia32_misc_enable_msr;
1949                 break;
1950         case MSR_IA32_PERF_STATUS:
1951                 /* TSC increment by tick */
1952                 data = 1000ULL;
1953                 /* CPU multiplier */
1954                 data |= (((uint64_t)4ULL) << 40);
1955                 break;
1956         case MSR_EFER:
1957                 data = vcpu->arch.efer;
1958                 break;
1959         case MSR_KVM_WALL_CLOCK:
1960         case MSR_KVM_WALL_CLOCK_NEW:
1961                 data = vcpu->kvm->arch.wall_clock;
1962                 break;
1963         case MSR_KVM_SYSTEM_TIME:
1964         case MSR_KVM_SYSTEM_TIME_NEW:
1965                 data = vcpu->arch.time;
1966                 break;
1967         case MSR_KVM_ASYNC_PF_EN:
1968                 data = vcpu->arch.apf.msr_val;
1969                 break;
1970         case MSR_KVM_STEAL_TIME:
1971                 data = vcpu->arch.st.msr_val;
1972                 break;
1973         case MSR_IA32_P5_MC_ADDR:
1974         case MSR_IA32_P5_MC_TYPE:
1975         case MSR_IA32_MCG_CAP:
1976         case MSR_IA32_MCG_CTL:
1977         case MSR_IA32_MCG_STATUS:
1978         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1979                 return get_msr_mce(vcpu, msr, pdata);
1980         case MSR_K7_CLK_CTL:
1981                 /*
1982                  * Provide expected ramp-up count for K7. All other
1983                  * are set to zero, indicating minimum divisors for
1984                  * every field.
1985                  *
1986                  * This prevents guest kernels on AMD host with CPU
1987                  * type 6, model 8 and higher from exploding due to
1988                  * the rdmsr failing.
1989                  */
1990                 data = 0x20000000;
1991                 break;
1992         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1993                 if (kvm_hv_msr_partition_wide(msr)) {
1994                         int r;
1995                         mutex_lock(&vcpu->kvm->lock);
1996                         r = get_msr_hyperv_pw(vcpu, msr, pdata);
1997                         mutex_unlock(&vcpu->kvm->lock);
1998                         return r;
1999                 } else
2000                         return get_msr_hyperv(vcpu, msr, pdata);
2001                 break;
2002         case MSR_IA32_BBL_CR_CTL3:
2003                 /* This legacy MSR exists but isn't fully documented in current
2004                  * silicon.  It is however accessed by winxp in very narrow
2005                  * scenarios where it sets bit #19, itself documented as
2006                  * a "reserved" bit.  Best effort attempt to source coherent
2007                  * read data here should the balance of the register be
2008                  * interpreted by the guest:
2009                  *
2010                  * L2 cache control register 3: 64GB range, 256KB size,
2011                  * enabled, latency 0x1, configured
2012                  */
2013                 data = 0xbe702111;
2014                 break;
2015         default:
2016                 if (!ignore_msrs) {
2017                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2018                         return 1;
2019                 } else {
2020                         pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2021                         data = 0;
2022                 }
2023                 break;
2024         }
2025         *pdata = data;
2026         return 0;
2027 }
2028 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2029
2030 /*
2031  * Read or write a bunch of msrs. All parameters are kernel addresses.
2032  *
2033  * @return number of msrs set successfully.
2034  */
2035 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2036                     struct kvm_msr_entry *entries,
2037                     int (*do_msr)(struct kvm_vcpu *vcpu,
2038                                   unsigned index, u64 *data))
2039 {
2040         int i, idx;
2041
2042         idx = srcu_read_lock(&vcpu->kvm->srcu);
2043         for (i = 0; i < msrs->nmsrs; ++i)
2044                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2045                         break;
2046         srcu_read_unlock(&vcpu->kvm->srcu, idx);
2047
2048         return i;
2049 }
2050
2051 /*
2052  * Read or write a bunch of msrs. Parameters are user addresses.
2053  *
2054  * @return number of msrs set successfully.
2055  */
2056 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2057                   int (*do_msr)(struct kvm_vcpu *vcpu,
2058                                 unsigned index, u64 *data),
2059                   int writeback)
2060 {
2061         struct kvm_msrs msrs;
2062         struct kvm_msr_entry *entries;
2063         int r, n;
2064         unsigned size;
2065
2066         r = -EFAULT;
2067         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2068                 goto out;
2069
2070         r = -E2BIG;
2071         if (msrs.nmsrs >= MAX_IO_MSRS)
2072                 goto out;
2073
2074         r = -ENOMEM;
2075         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2076         entries = kmalloc(size, GFP_KERNEL);
2077         if (!entries)
2078                 goto out;
2079
2080         r = -EFAULT;
2081         if (copy_from_user(entries, user_msrs->entries, size))
2082                 goto out_free;
2083
2084         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2085         if (r < 0)
2086                 goto out_free;
2087
2088         r = -EFAULT;
2089         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2090                 goto out_free;
2091
2092         r = n;
2093
2094 out_free:
2095         kfree(entries);
2096 out:
2097         return r;
2098 }
2099
2100 int kvm_dev_ioctl_check_extension(long ext)
2101 {
2102         int r;
2103
2104         switch (ext) {
2105         case KVM_CAP_IRQCHIP:
2106         case KVM_CAP_HLT:
2107         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2108         case KVM_CAP_SET_TSS_ADDR:
2109         case KVM_CAP_EXT_CPUID:
2110         case KVM_CAP_CLOCKSOURCE:
2111         case KVM_CAP_PIT:
2112         case KVM_CAP_NOP_IO_DELAY:
2113         case KVM_CAP_MP_STATE:
2114         case KVM_CAP_SYNC_MMU:
2115         case KVM_CAP_USER_NMI:
2116         case KVM_CAP_REINJECT_CONTROL:
2117         case KVM_CAP_IRQ_INJECT_STATUS:
2118         case KVM_CAP_ASSIGN_DEV_IRQ:
2119         case KVM_CAP_IRQFD:
2120         case KVM_CAP_IOEVENTFD:
2121         case KVM_CAP_PIT2:
2122         case KVM_CAP_PIT_STATE2:
2123         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2124         case KVM_CAP_XEN_HVM:
2125         case KVM_CAP_ADJUST_CLOCK:
2126         case KVM_CAP_VCPU_EVENTS:
2127         case KVM_CAP_HYPERV:
2128         case KVM_CAP_HYPERV_VAPIC:
2129         case KVM_CAP_HYPERV_SPIN:
2130         case KVM_CAP_PCI_SEGMENT:
2131         case KVM_CAP_DEBUGREGS:
2132         case KVM_CAP_X86_ROBUST_SINGLESTEP:
2133         case KVM_CAP_XSAVE:
2134         case KVM_CAP_ASYNC_PF:
2135         case KVM_CAP_GET_TSC_KHZ:
2136                 r = 1;
2137                 break;
2138         case KVM_CAP_COALESCED_MMIO:
2139                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2140                 break;
2141         case KVM_CAP_VAPIC:
2142                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2143                 break;
2144         case KVM_CAP_NR_VCPUS:
2145                 r = KVM_SOFT_MAX_VCPUS;
2146                 break;
2147         case KVM_CAP_MAX_VCPUS:
2148                 r = KVM_MAX_VCPUS;
2149                 break;
2150         case KVM_CAP_NR_MEMSLOTS:
2151                 r = KVM_MEMORY_SLOTS;
2152                 break;
2153         case KVM_CAP_PV_MMU:    /* obsolete */
2154                 r = 0;
2155                 break;
2156         case KVM_CAP_IOMMU:
2157                 r = iommu_present(&pci_bus_type);
2158                 break;
2159         case KVM_CAP_MCE:
2160                 r = KVM_MAX_MCE_BANKS;
2161                 break;
2162         case KVM_CAP_XCRS:
2163                 r = cpu_has_xsave;
2164                 break;
2165         case KVM_CAP_TSC_CONTROL:
2166                 r = kvm_has_tsc_control;
2167                 break;
2168         case KVM_CAP_TSC_DEADLINE_TIMER:
2169                 r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2170                 break;
2171         default:
2172                 r = 0;
2173                 break;
2174         }
2175         return r;
2176
2177 }
2178
2179 long kvm_arch_dev_ioctl(struct file *filp,
2180                         unsigned int ioctl, unsigned long arg)
2181 {
2182         void __user *argp = (void __user *)arg;
2183         long r;
2184
2185         switch (ioctl) {
2186         case KVM_GET_MSR_INDEX_LIST: {
2187                 struct kvm_msr_list __user *user_msr_list = argp;
2188                 struct kvm_msr_list msr_list;
2189                 unsigned n;
2190
2191                 r = -EFAULT;
2192                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2193                         goto out;
2194                 n = msr_list.nmsrs;
2195                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2196                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2197                         goto out;
2198                 r = -E2BIG;
2199                 if (n < msr_list.nmsrs)
2200                         goto out;
2201                 r = -EFAULT;
2202                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2203                                  num_msrs_to_save * sizeof(u32)))
2204                         goto out;
2205                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2206                                  &emulated_msrs,
2207                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2208                         goto out;
2209                 r = 0;
2210                 break;
2211         }
2212         case KVM_GET_SUPPORTED_CPUID: {
2213                 struct kvm_cpuid2 __user *cpuid_arg = argp;
2214                 struct kvm_cpuid2 cpuid;
2215
2216                 r = -EFAULT;
2217                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2218                         goto out;
2219                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
2220                                                       cpuid_arg->entries);
2221                 if (r)
2222                         goto out;
2223
2224                 r = -EFAULT;
2225                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2226                         goto out;
2227                 r = 0;
2228                 break;
2229         }
2230         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2231                 u64 mce_cap;
2232
2233                 mce_cap = KVM_MCE_CAP_SUPPORTED;
2234                 r = -EFAULT;
2235                 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
2236                         goto out;
2237                 r = 0;
2238                 break;
2239         }
2240         default:
2241                 r = -EINVAL;
2242         }
2243 out:
2244         return r;
2245 }
2246
/*
 * Cross-CPU callback handed to smp_call_function_single() in
 * kvm_arch_vcpu_load(): runs wbinvd() on whichever CPU executes it.
 * The argument is ignored.
 */
static void wbinvd_ipi(void *unused)
{
        wbinvd();
}
2251
2252 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2253 {
2254         return vcpu->kvm->arch.iommu_domain &&
2255                 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
2256 }
2257
2258 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2259 {
2260         /* Address WBINVD may be executed by guest */
2261         if (need_emulate_wbinvd(vcpu)) {
2262                 if (kvm_x86_ops->has_wbinvd_exit())
2263                         cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2264                 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2265                         smp_call_function_single(vcpu->cpu,
2266                                         wbinvd_ipi, NULL, 1);
2267         }
2268
2269         kvm_x86_ops->vcpu_load(vcpu, cpu);
2270         if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2271                 /* Make sure TSC doesn't go backwards */
2272                 s64 tsc_delta;
2273                 u64 tsc;
2274
2275                 tsc = kvm_x86_ops->read_l1_tsc(vcpu);
2276                 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2277                              tsc - vcpu->arch.last_guest_tsc;
2278
2279                 if (tsc_delta < 0)
2280                         mark_tsc_unstable("KVM discovered backwards TSC");
2281                 if (check_tsc_unstable()) {
2282                         kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2283                         vcpu->arch.tsc_catchup = 1;
2284                 }
2285                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2286                 if (vcpu->cpu != cpu)
2287                         kvm_migrate_timers(vcpu);
2288                 vcpu->cpu = cpu;
2289         }
2290
2291         accumulate_steal_time(vcpu);
2292         kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2293 }
2294
2295 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2296 {
2297         kvm_x86_ops->vcpu_put(vcpu);
2298         kvm_put_guest_fpu(vcpu);
2299         vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
2300 }
2301
2302 static int is_efer_nx(void)
2303 {
2304         unsigned long long efer = 0;
2305
2306         rdmsrl_safe(MSR_EFER, &efer);
2307         return efer & EFER_NX;
2308 }
2309
2310 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2311 {
2312         int i;
2313         struct kvm_cpuid_entry2 *e, *entry;
2314
2315         entry = NULL;
2316         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2317                 e = &vcpu->arch.cpuid_entries[i];
2318                 if (e->function == 0x80000001) {
2319                         entry = e;
2320                         break;
2321                 }
2322         }
2323         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
2324                 entry->edx &= ~(1 << 20);
2325                 printk(KERN_INFO "kvm: guest NX capability removed\n");
2326         }
2327 }
2328
2329 /* when an old userspace process fills a new kernel module */
2330 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2331                                     struct kvm_cpuid *cpuid,
2332                                     struct kvm_cpuid_entry __user *entries)
2333 {
2334         int r, i;
2335         struct kvm_cpuid_entry *cpuid_entries;
2336
2337         r = -E2BIG;
2338         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2339                 goto out;
2340         r = -ENOMEM;
2341         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
2342         if (!cpuid_entries)
2343                 goto out;
2344         r = -EFAULT;
2345         if (copy_from_user(cpuid_entries, entries,
2346                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2347                 goto out_free;
2348         for (i = 0; i < cpuid->nent; i++) {
2349                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
2350                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
2351                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
2352                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
2353                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
2354                 vcpu->arch.cpuid_entries[i].index = 0;
2355                 vcpu->arch.cpuid_entries[i].flags = 0;
2356                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
2357                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
2358                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
2359         }
2360         vcpu->arch.cpuid_nent = cpuid->nent;
2361         cpuid_fix_nx_cap(vcpu);
2362         r = 0;
2363         kvm_apic_set_version(vcpu);
2364         kvm_x86_ops->cpuid_update(vcpu);
2365         update_cpuid(vcpu);
2366
2367 out_free:
2368         vfree(cpuid_entries);
2369 out:
2370         return r;
2371 }
2372
2373 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
2374                                      struct kvm_cpuid2 *cpuid,
2375                                      struct kvm_cpuid_entry2 __user *entries)
2376 {
2377         int r;
2378
2379         r = -E2BIG;
2380         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2381                 goto out;
2382         r = -EFAULT;
2383         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
2384                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
2385                 goto out;
2386         vcpu->arch.cpuid_nent = cpuid->nent;
2387         kvm_apic_set_version(vcpu);
2388         kvm_x86_ops->cpuid_update(vcpu);
2389         update_cpuid(vcpu);
2390         return 0;
2391
2392 out:
2393         return r;
2394 }
2395
2396 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
2397                                      struct kvm_cpuid2 *cpuid,
2398                                      struct kvm_cpuid_entry2 __user *entries)
2399 {
2400         int r;
2401
2402         r = -E2BIG;
2403         if (cpuid->nent < vcpu->arch.cpuid_nent)
2404                 goto out;
2405         r = -EFAULT;
2406         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
2407                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
2408                 goto out;
2409         return 0;
2410
2411 out:
2412         cpuid->nent = vcpu->arch.cpuid_nent;
2413         return r;
2414 }
2415
2416 static void cpuid_mask(u32 *word, int wordnum)
2417 {
2418         *word &= boot_cpu_data.x86_capability[wordnum];
2419 }
2420
2421 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2422                            u32 index)
2423 {
2424         entry->function = function;
2425         entry->index = index;
2426         cpuid_count(entry->function, entry->index,
2427                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
2428         entry->flags = 0;
2429 }
2430
2431 static bool supported_xcr0_bit(unsigned bit)
2432 {
2433         u64 mask = ((u64)1 << bit);
2434
2435         return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
2436 }
2437
2438 #define F(x) bit(X86_FEATURE_##x)
2439
2440 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2441                          u32 index, int *nent, int maxnent)
2442 {
2443         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
2444 #ifdef CONFIG_X86_64
2445         unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
2446                                 ? F(GBPAGES) : 0;
2447         unsigned f_lm = F(LM);
2448 #else
2449         unsigned f_gbpages = 0;
2450         unsigned f_lm = 0;
2451 #endif
2452         unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
2453
2454         /* cpuid 1.edx */
2455         const u32 kvm_supported_word0_x86_features =
2456                 F(FPU) | F(VME) | F(DE) | F(PSE) |
2457                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
2458                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
2459                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
2460                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
2461                 0 /* Reserved, DS, ACPI */ | F(MMX) |
2462                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
2463                 0 /* HTT, TM, Reserved, PBE */;
2464         /* cpuid 0x80000001.edx */
2465         const u32 kvm_supported_word1_x86_features =
2466                 F(FPU) | F(VME) | F(DE) | F(PSE) |
2467                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
2468                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
2469                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
2470                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
2471                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
2472                 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
2473                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
2474         /* cpuid 1.ecx */
2475         const u32 kvm_supported_word4_x86_features =
2476                 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
2477                 0 /* DS-CPL, VMX, SMX, EST */ |
2478                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
2479                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
2480                 0 /* Reserved, DCA */ | F(XMM4_1) |
2481                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
2482                 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2483                 F(F16C) | F(RDRAND);
2484         /* cpuid 0x80000001.ecx */
2485         const u32 kvm_supported_word6_x86_features =
2486                 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
2487                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
2488                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2489                 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2490
2491         /* cpuid 0xC0000001.edx */
2492         const u32 kvm_supported_word5_x86_features =
2493                 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2494                 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2495                 F(PMM) | F(PMM_EN);
2496
2497         /* cpuid 7.0.ebx */
2498         const u32 kvm_supported_word9_x86_features =
2499                 F(SMEP) | F(FSGSBASE) | F(ERMS);
2500
2501         /* all calls to cpuid_count() should be made on the same cpu */
2502         get_cpu();
2503         do_cpuid_1_ent(entry, function, index);
2504         ++*nent;
2505
2506         switch (function) {
2507         case 0:
2508                 entry->eax = min(entry->eax, (u32)0xd);
2509                 break;
2510         case 1:
2511                 entry->edx &= kvm_supported_word0_x86_features;
2512                 cpuid_mask(&entry->edx, 0);
2513                 entry->ecx &= kvm_supported_word4_x86_features;
2514                 cpuid_mask(&entry->ecx, 4);
2515                 /* we support x2apic emulation even if host does not support
2516                  * it since we emulate x2apic in software */
2517                 entry->ecx |= F(X2APIC);
2518                 break;
2519         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
2520          * may return different values. This forces us to get_cpu() before
2521          * issuing the first command, and also to emulate this annoying behavior
2522          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
2523         case 2: {
2524                 int t, times = entry->eax & 0xff;
2525
2526                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
2527                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2528                 for (t = 1; t < times && *nent < maxnent; ++t) {
2529                         do_cpuid_1_ent(&entry[t], function, 0);
2530                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
2531                         ++*nent;
2532                 }
2533                 break;
2534         }
2535         /* function 4 has additional index. */
2536         case 4: {
2537                 int i, cache_type;
2538
2539                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2540                 /* read more entries until cache_type is zero */
2541                 for (i = 1; *nent < maxnent; ++i) {
2542                         cache_type = entry[i - 1].eax & 0x1f;
2543                         if (!cache_type)
2544                                 break;
2545                         do_cpuid_1_ent(&entry[i], function, i);
2546                         entry[i].flags |=
2547                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2548                         ++*nent;
2549                 }
2550                 break;
2551         }
2552         case 7: {
2553                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2554                 /* Mask ebx against host capbability word 9 */
2555                 if (index == 0) {
2556                         entry->ebx &= kvm_supported_word9_x86_features;
2557                         cpuid_mask(&entry->ebx, 9);
2558                 } else
2559                         entry->ebx = 0;
2560                 entry->eax = 0;
2561                 entry->ecx = 0;
2562                 entry->edx = 0;
2563                 break;
2564         }
2565         case 9:
2566                 break;
2567         /* function 0xb has additional index. */
2568         case 0xb: {
2569                 int i, level_type;
2570
2571                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2572                 /* read more entries until level_type is zero */
2573                 for (i = 1; *nent < maxnent; ++i) {
2574                         level_type = entry[i - 1].ecx & 0xff00;
2575                         if (!level_type)
2576                                 break;
2577                         do_cpuid_1_ent(&entry[i], function, i);
2578                         entry[i].flags |=
2579                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2580                         ++*nent;
2581                 }
2582                 break;
2583         }
2584         case 0xd: {
2585                 int idx, i;
2586
2587                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2588                 for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
2589                         do_cpuid_1_ent(&entry[i], function, idx);
2590                         if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
2591                                 continue;
2592                         entry[i].flags |=
2593                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2594                         ++*nent;
2595                         ++i;
2596                 }
2597                 break;
2598         }
2599         case KVM_CPUID_SIGNATURE: {
2600                 char signature[12] = "KVMKVMKVM\0\0";
2601                 u32 *sigptr = (u32 *)signature;
2602                 entry->eax = 0;
2603                 entry->ebx = sigptr[0];
2604                 entry->ecx = sigptr[1];
2605                 entry->edx = sigptr[2];
2606                 break;
2607         }
2608         case KVM_CPUID_FEATURES:
2609                 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2610                              (1 << KVM_FEATURE_NOP_IO_DELAY) |
2611                              (1 << KVM_FEATURE_CLOCKSOURCE2) |
2612                              (1 << KVM_FEATURE_ASYNC_PF) |
2613                              (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2614
2615                 if (sched_info_on())
2616                         entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
2617
2618                 entry->ebx = 0;
2619                 entry->ecx = 0;
2620                 entry->edx = 0;
2621                 break;
2622         case 0x80000000:
2623                 entry->eax = min(entry->eax, 0x8000001a);
2624                 break;
2625         case 0x80000001:
2626                 entry->edx &= kvm_supported_word1_x86_features;
2627                 cpuid_mask(&entry->edx, 1);
2628                 entry->ecx &= kvm_supported_word6_x86_features;
2629                 cpuid_mask(&entry->ecx, 6);
2630                 break;
2631         case 0x80000008: {
2632                 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
2633                 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
2634                 unsigned phys_as = entry->eax & 0xff;
2635
2636                 if (!g_phys_as)
2637                         g_phys_as = phys_as;
2638                 entry->eax = g_phys_as | (virt_as << 8);
2639                 entry->ebx = entry->edx = 0;
2640                 break;
2641         }
2642         case 0x80000019:
2643                 entry->ecx = entry->edx = 0;
2644                 break;
2645         case 0x8000001a:
2646                 break;
2647         case 0x8000001d:
2648                 break;
2649         /*Add support for Centaur's CPUID instruction*/
2650         case 0xC0000000:
2651                 /*Just support up to 0xC0000004 now*/
2652                 entry->eax = min(entry->eax, 0xC0000004);
2653                 break;
2654         case 0xC0000001:
2655                 entry->edx &= kvm_supported_word5_x86_features;
2656                 cpuid_mask(&entry->edx, 5);
2657                 break;
2658         case 3: /* Processor serial number */
2659         case 5: /* MONITOR/MWAIT */
2660         case 6: /* Thermal management */
2661         case 0xA: /* Architectural Performance Monitoring */
2662         case 0x80000007: /* Advanced power management */
2663         case 0xC0000002:
2664         case 0xC0000003:
2665         case 0xC0000004:
2666         default:
2667                 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
2668                 break;
2669         }
2670
2671         kvm_x86_ops->set_supported_cpuid(function, entry);
2672
2673         put_cpu();
2674 }
2675
2676 #undef F
2677
2678 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2679                                      struct kvm_cpuid_entry2 __user *entries)
2680 {
2681         struct kvm_cpuid_entry2 *cpuid_entries;
2682         int limit, nent = 0, r = -E2BIG;
2683         u32 func;
2684
2685         if (cpuid->nent < 1)
2686                 goto out;
2687         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2688                 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
2689         r = -ENOMEM;
2690         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
2691         if (!cpuid_entries)
2692                 goto out;
2693
2694         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
2695         limit = cpuid_entries[0].eax;
2696         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
2697                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2698                              &nent, cpuid->nent);
2699         r = -E2BIG;
2700         if (nent >= cpuid->nent)
2701                 goto out_free;
2702
2703         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
2704         limit = cpuid_entries[nent - 1].eax;
2705         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
2706                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2707                              &nent, cpuid->nent);
2708
2709
2710
2711         r = -E2BIG;
2712         if (nent >= cpuid->nent)
2713                 goto out_free;
2714
2715         /* Add support for Centaur's CPUID instruction. */
2716         if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2717                 do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2718                                 &nent, cpuid->nent);
2719
2720                 r = -E2BIG;
2721                 if (nent >= cpuid->nent)
2722                         goto out_free;
2723
2724                 limit = cpuid_entries[nent - 1].eax;
2725                 for (func = 0xC0000001;
2726                         func <= limit && nent < cpuid->nent; ++func)
2727                         do_cpuid_ent(&cpuid_entries[nent], func, 0,
2728                                         &nent, cpuid->nent);
2729
2730                 r = -E2BIG;
2731                 if (nent >= cpuid->nent)
2732                         goto out_free;
2733         }
2734
2735         do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2736                      cpuid->nent);
2737
2738         r = -E2BIG;
2739         if (nent >= cpuid->nent)
2740                 goto out_free;
2741
2742         do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2743                      cpuid->nent);
2744
2745         r = -E2BIG;
2746         if (nent >= cpuid->nent)
2747                 goto out_free;
2748
2749         r = -EFAULT;
2750         if (copy_to_user(entries, cpuid_entries,
2751                          nent * sizeof(struct kvm_cpuid_entry2)))
2752                 goto out_free;
2753         cpuid->nent = nent;
2754         r = 0;
2755
2756 out_free:
2757         vfree(cpuid_entries);
2758 out:
2759         return r;
2760 }
2761
2762 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2763                                     struct kvm_lapic_state *s)
2764 {
2765         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2766
2767         return 0;
2768 }
2769
2770 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2771                                     struct kvm_lapic_state *s)
2772 {
2773         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2774         kvm_apic_post_state_restore(vcpu);
2775         update_cr8_intercept(vcpu);
2776
2777         return 0;
2778 }
2779
2780 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2781                                     struct kvm_interrupt *irq)
2782 {
2783         if (irq->irq < 0 || irq->irq >= 256)
2784                 return -EINVAL;
2785         if (irqchip_in_kernel(vcpu->kvm))
2786                 return -ENXIO;
2787
2788         kvm_queue_interrupt(vcpu, irq->irq, false);
2789         kvm_make_request(KVM_REQ_EVENT, vcpu);
2790
2791         return 0;
2792 }
2793
/*
 * Inject an NMI into @vcpu by calling kvm_inject_nmi().
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
        kvm_inject_nmi(vcpu);
        return 0;
}
2800
2801 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2802                                            struct kvm_tpr_access_ctl *tac)
2803 {
2804         if (tac->flags)
2805                 return -EINVAL;
2806         vcpu->arch.tpr_access_reporting = !!tac->enabled;
2807         return 0;
2808 }
2809
2810 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2811                                         u64 mcg_cap)
2812 {
2813         int r;
2814         unsigned bank_num = mcg_cap & 0xff, bank;
2815
2816         r = -EINVAL;
2817         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2818                 goto out;
2819         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2820                 goto out;
2821         r = 0;
2822         vcpu->arch.mcg_cap = mcg_cap;
2823         /* Init IA32_MCG_CTL to all 1s */
2824         if (mcg_cap & MCG_CTL_P)
2825                 vcpu->arch.mcg_ctl = ~(u64)0;
2826         /* Init IA32_MCi_CTL to all 1s */
2827         for (bank = 0; bank < bank_num; bank++)
2828                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2829 out:
2830         return r;
2831 }
2832
2833 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2834                                       struct kvm_x86_mce *mce)
2835 {
2836         u64 mcg_cap = vcpu->arch.mcg_cap;
2837         unsigned bank_num = mcg_cap & 0xff;
2838         u64 *banks = vcpu->arch.mce_banks;
2839
2840         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2841                 return -EINVAL;
2842         /*
2843          * if IA32_MCG_CTL is not all 1s, the uncorrected error
2844          * reporting is disabled
2845          */
2846         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2847             vcpu->arch.mcg_ctl != ~(u64)0)
2848                 return 0;
2849         banks += 4 * mce->bank;
2850         /*
2851          * if IA32_MCi_CTL is not all 1s, the uncorrected error
2852          * reporting is disabled for the bank
2853          */
2854         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2855                 return 0;
2856         if (mce->status & MCI_STATUS_UC) {
2857                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2858                     !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2859                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2860                         return 0;
2861                 }
2862                 if (banks[1] & MCI_STATUS_VAL)
2863                         mce->status |= MCI_STATUS_OVER;
2864                 banks[2] = mce->addr;
2865                 banks[3] = mce->misc;
2866                 vcpu->arch.mcg_status = mce->mcg_status;
2867                 banks[1] = mce->status;
2868                 kvm_queue_exception(vcpu, MC_VECTOR);
2869         } else if (!(banks[1] & MCI_STATUS_VAL)
2870                    || !(banks[1] & MCI_STATUS_UC)) {
2871                 if (banks[1] & MCI_STATUS_VAL)
2872                         mce->status |= MCI_STATUS_OVER;
2873                 banks[2] = mce->addr;
2874                 banks[3] = mce->misc;
2875                 banks[1] = mce->status;
2876         } else
2877                 banks[1] |= MCI_STATUS_OVER;
2878         return 0;
2879 }
2880
2881 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2882                                                struct kvm_vcpu_events *events)
2883 {
2884         process_nmi(vcpu);
2885         events->exception.injected =
2886                 vcpu->arch.exception.pending &&
2887                 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2888         events->exception.nr = vcpu->arch.exception.nr;
2889         events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2890         events->exception.pad = 0;
2891         events->exception.error_code = vcpu->arch.exception.error_code;
2892
2893         events->interrupt.injected =
2894                 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2895         events->interrupt.nr = vcpu->arch.interrupt.nr;
2896         events->interrupt.soft = 0;
2897         events->interrupt.shadow =
2898                 kvm_x86_ops->get_interrupt_shadow(vcpu,
2899                         KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2900
2901         events->nmi.injected = vcpu->arch.nmi_injected;
2902         events->nmi.pending = vcpu->arch.nmi_pending != 0;
2903         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2904         events->nmi.pad = 0;
2905
2906         events->sipi_vector = vcpu->arch.sipi_vector;
2907
2908         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2909                          | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2910                          | KVM_VCPUEVENT_VALID_SHADOW);
2911         memset(&events->reserved, 0, sizeof(events->reserved));
2912 }
2913
2914 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2915                                               struct kvm_vcpu_events *events)
2916 {
2917         if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2918                               | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2919                               | KVM_VCPUEVENT_VALID_SHADOW))
2920                 return -EINVAL;
2921
2922         process_nmi(vcpu);
2923         vcpu->arch.exception.pending = events->exception.injected;
2924         vcpu->arch.exception.nr = events->exception.nr;
2925         vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2926         vcpu->arch.exception.error_code = events->exception.error_code;
2927
2928         vcpu->arch.interrupt.pending = events->interrupt.injected;
2929         vcpu->arch.interrupt.nr = events->interrupt.nr;
2930         vcpu->arch.interrupt.soft = events->interrupt.soft;
2931         if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2932                 kvm_x86_ops->set_interrupt_shadow(vcpu,
2933                                                   events->interrupt.shadow);
2934
2935         vcpu->arch.nmi_injected = events->nmi.injected;
2936         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2937                 vcpu->arch.nmi_pending = events->nmi.pending;
2938         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2939
2940         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2941                 vcpu->arch.sipi_vector = events->sipi_vector;
2942
2943         kvm_make_request(KVM_REQ_EVENT, vcpu);
2944
2945         return 0;
2946 }
2947
2948 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2949                                              struct kvm_debugregs *dbgregs)
2950 {
2951         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2952         dbgregs->dr6 = vcpu->arch.dr6;
2953         dbgregs->dr7 = vcpu->arch.dr7;
2954         dbgregs->flags = 0;
2955         memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2956 }
2957
2958 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2959                                             struct kvm_debugregs *dbgregs)
2960 {
2961         if (dbgregs->flags)
2962                 return -EINVAL;
2963
2964         if (dbgregs->dr6 & ~0xffffffffull)
2965                 return -EINVAL;
2966         if (dbgregs->dr7 & ~0xffffffffull)
2967                 return -EINVAL;
2968
2969         memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2970         vcpu->arch.dr6 = dbgregs->dr6;
2971         vcpu->arch.dr7 = dbgregs->dr7;
2972
2973         return 0;
2974 }
2975
2976 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2977                                          struct kvm_xsave *guest_xsave)
2978 {
2979         if (cpu_has_xsave)
2980                 memcpy(guest_xsave->region,
2981                         &vcpu->arch.guest_fpu.state->xsave,
2982                         xstate_size);
2983         else {
2984                 memcpy(guest_xsave->region,
2985                         &vcpu->arch.guest_fpu.state->fxsave,
2986                         sizeof(struct i387_fxsave_struct));
2987                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2988                         XSTATE_FPSSE;
2989         }
2990 }
2991
2992 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2993                                         struct kvm_xsave *guest_xsave)
2994 {
2995         u64 xstate_bv =
2996                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2997
2998         if (cpu_has_xsave)
2999                 memcpy(&vcpu->arch.guest_fpu.state->xsave,
3000                         guest_xsave->region, xstate_size);
3001         else {
3002                 if (xstate_bv & ~XSTATE_FPSSE)
3003                         return -EINVAL;
3004                 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
3005                         guest_xsave->region, sizeof(struct i387_fxsave_struct));
3006         }
3007         return 0;
3008 }
3009
3010 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
3011                                         struct kvm_xcrs *guest_xcrs)
3012 {
3013         if (!cpu_has_xsave) {
3014                 guest_xcrs->nr_xcrs = 0;
3015                 return;
3016         }
3017
3018         guest_xcrs->nr_xcrs = 1;
3019         guest_xcrs->flags = 0;
3020         guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
3021         guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
3022 }
3023
3024 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3025                                        struct kvm_xcrs *guest_xcrs)
3026 {
3027         int i, r = 0;
3028
3029         if (!cpu_has_xsave)
3030                 return -EINVAL;
3031
3032         if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
3033                 return -EINVAL;
3034
3035         for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3036                 /* Only support XCR0 currently */
3037                 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
3038                         r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3039                                 guest_xcrs->xcrs[0].value);
3040                         break;
3041                 }
3042         if (r)
3043                 r = -EINVAL;
3044         return r;
3045 }
3046
3047 long kvm_arch_vcpu_ioctl(struct file *filp,
3048                          unsigned int ioctl, unsigned long arg)
3049 {
3050         struct kvm_vcpu *vcpu = filp->private_data;
3051         void __user *argp = (void __user *)arg;
3052         int r;
3053         union {
3054                 struct kvm_lapic_state *lapic;
3055                 struct kvm_xsave *xsave;
3056                 struct kvm_xcrs *xcrs;
3057                 void *buffer;
3058         } u;
3059
3060         u.buffer = NULL;
3061         switch (ioctl) {
3062         case KVM_GET_LAPIC: {
3063                 r = -EINVAL;
3064                 if (!vcpu->arch.apic)
3065                         goto out;
3066                 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3067
3068                 r = -ENOMEM;
3069                 if (!u.lapic)
3070                         goto out;
3071                 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3072                 if (r)
3073                         goto out;
3074                 r = -EFAULT;
3075                 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3076                         goto out;
3077                 r = 0;
3078                 break;
3079         }
3080         case KVM_SET_LAPIC: {
3081                 r = -EINVAL;
3082                 if (!vcpu->arch.apic)
3083                         goto out;
3084                 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3085                 r = -ENOMEM;
3086                 if (!u.lapic)
3087                         goto out;
3088                 r = -EFAULT;
3089                 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
3090                         goto out;
3091                 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3092                 if (r)
3093                         goto out;
3094                 r = 0;
3095                 break;
3096         }
3097         case KVM_INTERRUPT: {
3098                 struct kvm_interrupt irq;
3099
3100                 r = -EFAULT;
3101                 if (copy_from_user(&irq, argp, sizeof irq))
3102                         goto out;
3103                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3104                 if (r)
3105                         goto out;
3106                 r = 0;
3107                 break;
3108         }
3109         case KVM_NMI: {
3110                 r = kvm_vcpu_ioctl_nmi(vcpu);
3111                 if (r)
3112                         goto out;
3113                 r = 0;
3114                 break;
3115         }
3116         case KVM_SET_CPUID: {
3117                 struct kvm_cpuid __user *cpuid_arg = argp;
3118                 struct kvm_cpuid cpuid;
3119
3120                 r = -EFAULT;
3121                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3122                         goto out;
3123                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3124                 if (r)
3125                         goto out;
3126                 break;
3127         }
3128         case KVM_SET_CPUID2: {
3129                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3130                 struct kvm_cpuid2 cpuid;
3131
3132                 r = -EFAULT;
3133                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3134                         goto out;
3135                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3136                                               cpuid_arg->entries);
3137                 if (r)
3138                         goto out;
3139                 break;
3140         }
3141         case KVM_GET_CPUID2: {
3142                 struct kvm_cpuid2 __user *cpuid_arg = argp;
3143                 struct kvm_cpuid2 cpuid;
3144
3145                 r = -EFAULT;
3146                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3147                         goto out;
3148                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3149                                               cpuid_arg->entries);
3150                 if (r)
3151                         goto out;
3152                 r = -EFAULT;
3153                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3154                         goto out;
3155                 r = 0;
3156                 break;
3157         }
3158         case KVM_GET_MSRS:
3159                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
3160                 break;
3161         case KVM_SET_MSRS:
3162                 r = msr_io(vcpu, argp, do_set_msr, 0);
3163                 break;
3164         case KVM_TPR_ACCESS_REPORTING: {
3165                 struct kvm_tpr_access_ctl tac;
3166
3167                 r = -EFAULT;
3168                 if (copy_from_user(&tac, argp, sizeof tac))
3169                         goto out;
3170                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3171                 if (r)
3172                         goto out;
3173                 r = -EFAULT;
3174                 if (copy_to_user(argp, &tac, sizeof tac))
3175                         goto out;
3176                 r = 0;
3177                 break;
3178         };
3179         case KVM_SET_VAPIC_ADDR: {
3180                 struct kvm_vapic_addr va;
3181
3182                 r = -EINVAL;
3183                 if (!irqchip_in_kernel(vcpu->kvm))
3184                         goto out;
3185                 r = -EFAULT;
3186                 if (copy_from_user(&va, argp, sizeof va))
3187                         goto out;
3188                 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3189                 break;
3190         }
3191         case KVM_X86_SETUP_MCE: {
3192                 u64 mcg_cap;
3193
3194                 r = -EFAULT;
3195                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3196                         goto out;
3197                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3198                 break;
3199         }
3200         case KVM_X86_SET_MCE: {
3201                 struct kvm_x86_mce mce;
3202
3203                 r = -EFAULT;
3204                 if (copy_from_user(&mce, argp, sizeof mce))
3205                         goto out;
3206                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3207                 break;
3208         }
3209         case KVM_GET_VCPU_EVENTS: {
3210                 struct kvm_vcpu_events events;
3211
3212                 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3213
3214                 r = -EFAULT;
3215                 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3216                         break;