x86/iopl/64: Properly context-switch IOPL on Xen PV
[pandora-kernel.git] arch/x86/xen/enlighten.c
index 1f92865..b255312 100644
@@ -62,6 +62,7 @@
 #include <asm/reboot.h>
 #include <asm/stackprotector.h>
 #include <asm/hypervisor.h>
+#include <asm/pci_x86.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -128,6 +129,21 @@ static void xen_vcpu_setup(int cpu)
 
        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
+       /*
+        * This path is called twice on PVHVM - first during bootup via
+        * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
+        * hotplugged: cpu_up -> xen_hvm_cpu_notify.
+        * As we can only issue VCPUOP_register_vcpu_info once, let's
+        * not overwrite its result.
+        *
+        * For PV it is called during restore (xen_vcpu_restore) and bootup
+        * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
+        * use this function.
+        */
+       if (xen_hvm_domain()) {
+               if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
+                       return;
+       }
        if (cpu < MAX_VIRT_CPUS)
                per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 
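The one-shot registration that the comment refers to happens just below this guard in the full function; a minimal sketch of that call, paraphrased from the surrounding file (error handling trimmed):

	struct vcpu_register_vcpu_info info;
	struct vcpu_info *vcpup = &per_cpu(xen_vcpu_info, cpu);

	info.mfn = arbitrary_virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	/* The hypervisor accepts this at most once per VCPU, which is
	 * why the PVHVM guard above must not let us get here twice. */
	if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info) == 0)
		per_cpu(xen_vcpu, cpu) = vcpup;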
@@ -197,6 +213,9 @@ static void __init xen_banner(void)
               xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
 static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
 static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
 
@@ -217,6 +236,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
                maskedx = cpuid_leaf1_edx_mask;
                break;
 
+       case CPUID_THERM_POWER_LEAF:
+               /* Disable APERF/MPERF so the kernel won't try to use them. */
+               maskecx = ~(1 << APERFMPERF_PRESENT);
+               break;
+
        case 0xb:
                /* Suppress extended topology stuff */
                maskebx = 0;
@@ -297,6 +321,7 @@ static void set_aliased_prot(void *v, pgprot_t prot)
        pte_t pte;
        unsigned long pfn;
        struct page *page;
+       unsigned char dummy;
 
        ptep = lookup_address((unsigned long)v, &level);
        BUG_ON(ptep == NULL);
@@ -306,6 +331,32 @@ static void set_aliased_prot(void *v, pgprot_t prot)
 
        pte = pfn_pte(pfn, prot);
 
+       /*
+        * Careful: update_va_mapping() will fail if the virtual address
+        * we're poking isn't populated in the page tables.  We don't
+        * need to worry about the direct map (that's always in the page
+        * tables), but we need to be careful about vmap space.  In
+        * particular, the top level page table can lazily propagate
+        * entries between processes, so if we've switched mms since we
+        * vmapped the target in the first place, we might not have the
+        * top-level page table entry populated.
+        *
+        * We disable preemption because we want the same mm active when
+        * we probe the target and when we issue the hypercall.  We'll
+        * have the same nominal mm, but if we're a kernel thread, lazy
+        * mm dropping could change our pgd.
+        *
+        * Out of an abundance of caution, this uses __get_user() to fault
+        * in the target address, in case some obscure situation leaves
+        * the target address unreadable.
+        */
+
+       preempt_disable();
+
+       pagefault_disable();    /* Avoid warnings due to being atomic. */
+       __get_user(dummy, (unsigned char __user __force *)v);
+       pagefault_enable();
+
        if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
                BUG();
 
@@ -317,6 +368,8 @@ static void set_aliased_prot(void *v, pgprot_t prot)
                                BUG();
        } else
                kmap_flush_unused();
+
+       preempt_enable();
 }
 
 static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
@@ -324,6 +377,17 @@ static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
        const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
        int i;
 
+       /*
+        * We need to mark all the aliases of the LDT pages RO.  We
+        * don't need to call vm_flush_aliases(), though, since that's
+        * only responsible for flushing aliases out of the TLBs, not the
+        * page tables, and Xen will flush the TLB for us if needed.
+        *
+        * To avoid confusing future readers: none of this is necessary
+        * to load the LDT.  The hypervisor only checks this when the
+        * LDT is faulted in due to subsequent descriptor access.
+        */
+
        for(i = 0; i < entries; i += entries_per_page)
                set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
 }
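The matching teardown lives next to this function in the same file and flips the aliases back to read-write before the LDT memory is freed; roughly:

static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL);
}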
@@ -704,7 +768,7 @@ static void xen_load_sp0(struct tss_struct *tss,
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
-static void xen_set_iopl_mask(unsigned mask)
+void xen_set_iopl_mask(unsigned mask)
 {
        struct physdev_set_iopl set_iopl;
 
@@ -809,7 +873,16 @@ static void xen_write_cr4(unsigned long cr4)
 
        native_write_cr4(cr4);
 }
-
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+       return 0;
+}
+static inline void xen_write_cr8(unsigned long val)
+{
+       BUG_ON(val);
+}
+#endif
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
        int ret;
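On the cr8 stubs above: CR8 is the x86-64 alias of the local APIC task-priority register, which a PV guest never controls, so reads can safely claim TPR 0 and any attempt to write a nonzero value is a kernel bug. For contrast, the native helper these replace looks roughly like this (paraphrased):

static inline unsigned long native_read_cr8(void)
{
	unsigned long cr8;

	/* CR8 mirrors the local APIC TPR on 64-bit. */
	asm volatile("mov %%cr8, %0" : "=r" (cr8));
	return cr8;
}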
@@ -978,13 +1051,23 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
        .read_cr4_safe = native_read_cr4_safe,
        .write_cr4 = xen_write_cr4,
 
+#ifdef CONFIG_X86_64
+       .read_cr8 = xen_read_cr8,
+       .write_cr8 = xen_write_cr8,
+#endif
+
        .wbinvd = native_wbinvd,
 
        .read_msr = native_read_msr_safe,
+       .rdmsr_regs = native_rdmsr_safe_regs,
        .write_msr = xen_write_msr_safe,
+       .wrmsr_regs = native_wrmsr_safe_regs,
+
        .read_tsc = native_read_tsc,
        .read_pmc = native_read_pmc,
 
+       .read_tscp = native_read_tscp,
+
        .iret = xen_iret,
        .irq_enable_sysexit = xen_sysexit,
 #ifdef CONFIG_X86_64
@@ -1277,9 +1360,15 @@ asmlinkage void __init xen_start_kernel(void)
 
                /* Make sure ACS will be enabled */
                pci_request_acs();
-       }
-               
 
+               /* Avoid searching for BIOS MP tables */
+               x86_init.mpparse.find_smp_config = x86_init_noop;
+               x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+       }
+#ifdef CONFIG_PCI
+       /* PCI BIOS service won't work from a PV guest. */
+       pci_probe &= ~PCI_PROBE_BIOS;
+#endif
        xen_raw_console_write("about to get started...\n");
 
        xen_setup_runstate_info(0);
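Clearing PCI_PROBE_BIOS (the reason for the new asm/pci_x86.h include at the top of the file) has the same effect as booting with pci=nobios; for comparison, the command-line handler in arch/x86/pci/common.c masks the same flag (excerpt, paraphrased):

	/* pcibios_setup(), abbreviated: */
	if (!strcmp(str, "nobios")) {
		pci_probe &= ~PCI_PROBE_BIOS;
		return NULL;
	}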
@@ -1357,8 +1446,11 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
        switch (action) {
        case CPU_UP_PREPARE:
                xen_vcpu_setup(cpu);
-               if (xen_have_vector_callback)
+               if (xen_have_vector_callback) {
                        xen_init_lock_cpu(cpu);
+                       if (xen_feature(XENFEAT_hvm_safe_pvclock))
+                               xen_setup_timer(cpu);
+               }
                break;
        default:
                break;
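The new xen_feature() test is cheap enough for a hotplug notifier: the feature flags are fetched once at boot via the XENVER_get_features hypercall and cached in a bitmap, so the check is a plain array read. Paraphrased from include/xen/features.h:

extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];

static inline int xen_feature(int flag)
{
	return xen_features[flag];
}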