x86/iopl/64: Properly context-switch IOPL on Xen PV
[pandora-kernel.git] arch/x86/xen/enlighten.c
index 1f92865..b255312 100644
@@ -62,6 +62,7 @@
 #include <asm/reboot.h>
 #include <asm/stackprotector.h>
 #include <asm/hypervisor.h>
+#include <asm/pci_x86.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -128,6 +129,21 @@ static void xen_vcpu_setup(int cpu)
 
        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
+       /*
+        * This path is called twice on PVHVM - first during bootup via
+        * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
+        * hotplugged: cpu_up -> xen_hvm_cpu_notify.
+        * As we can only issue VCPUOP_register_vcpu_info once, let's
+        * not overwrite its result.
+        *
+        * For PV it is called during restore (xen_vcpu_restore) and bootup
+        * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
+        * use this function.
+        */
+       if (xen_hvm_domain()) {
+               if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
+                       return;
+       }
        if (cpu < MAX_VIRT_CPUS)
                per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 
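The one-shot registration that the comment refers to happens just below this guard in the full function; a minimal sketch of that call, paraphrased from the surrounding file (error handling trimmed):

	struct vcpu_register_vcpu_info info;
	struct vcpu_info *vcpup = &per_cpu(xen_vcpu_info, cpu);

	info.mfn = arbitrary_virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	/* The hypervisor accepts this at most once per VCPU, which is
	 * why the PVHVM guard above must not let us get here twice. */
	if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info) == 0)
		per_cpu(xen_vcpu, cpu) = vcpup;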
@@ -197,6 +213,9 @@ static void __init xen_banner(void)
               xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
 static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
 static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
 
@@ -217,6 +236,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
                maskedx = cpuid_leaf1_edx_mask;
                break;
 
+       case CPUID_THERM_POWER_LEAF:
+               /* Disable APERF/MPERF so the kernel won't try to use them. */
+               maskecx = ~(1 << APERFMPERF_PRESENT);
+               break;
+
        case 0xb:
                /* Suppress extended topology stuff */
                maskebx = 0;
@@ -297,6 +321,7 @@ static void set_aliased_prot(void *v, pgprot_t prot)
        pte_t pte;
        unsigned long pfn;
        struct page *page;
+       unsigned char dummy;
 
        ptep = lookup_address((unsigned long)v, &level);
        BUG_ON(ptep == NULL);
@@ -306,6 +331,32 @@ static void set_aliased_prot(void *v, pgprot_t prot)
 
        pte = pfn_pte(pfn, prot);
 
+       /*
+        * Careful: update_va_mapping() will fail if the virtual address
+        * we're poking isn't populated in the page tables.  We don't
+        * need to worry about the direct map (that's always in the page
+        * tables), but we need to be careful about vmap space.  In
+        * particular, the top level page table can lazily propagate
+        * entries between processes, so if we've switched mms since we
+        * vmapped the target in the first place, we might not have the
+        * top-level page table entry populated.
+        *
+        * We disable preemption because we want the same mm active when
+        * we probe the target and when we issue the hypercall.  We'll
+        * have the same nominal mm, but if we're a kernel thread, lazy
+        * mm dropping could change our pgd.
+        *
+        * Out of an abundance of caution, this uses __get_user() to fault
+        * in the target address, in case some obscure situation leaves
+        * the target address unreadable.
+        */
+
+       preempt_disable();
+
+       pagefault_disable();    /* Avoid warnings due to being atomic. */
+       __get_user(dummy, (unsigned char __user __force *)v);
+       pagefault_enable();
+
        if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
                BUG();
 
@@ -317,6 +368,8 @@ static void set_aliased_prot(void *v, pgprot_t prot)
                                BUG();
        } else
                kmap_flush_unused();
+
+       preempt_enable();
 }
 
 static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
@@ -324,6 +377,17 @@ static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
        const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
        int i;
 
+       /*
+        * We need to mark all the aliases of the LDT pages RO.  We
+        * don't need to call vm_flush_aliases(), though, since that's
+        * only responsible for flushing aliases out of the TLBs, not the
+        * page tables, and Xen will flush the TLB for us if needed.
+        *
+        * To avoid confusing future readers: none of this is necessary
+        * to load the LDT.  The hypervisor only checks this when the
+        * LDT is faulted in due to subsequent descriptor access.
+        */
+
        for(i = 0; i < entries; i += entries_per_page)
                set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
 }
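The matching teardown lives next to this function in the same file and flips the aliases back to read-write before the LDT memory is freed; roughly:

static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL);
}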
@@ -704,7 +768,7 @@ static void xen_load_sp0(struct tss_struct *tss,
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
-static void xen_set_iopl_mask(unsigned mask)
+void xen_set_iopl_mask(unsigned mask)
 {
        struct physdev_set_iopl set_iopl;
 
@@ -809,7 +873,16 @@ static void xen_write_cr4(unsigned long cr4)
 
        native_write_cr4(cr4);
 }
-
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+       return 0;
+}
+static inline void xen_write_cr8(unsigned long val)
+{
+       BUG_ON(val);
+}
+#endif
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
        int ret;
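On the cr8 stubs above: CR8 is the x86-64 alias of the local APIC task-priority register, which a PV guest never controls, so reads can safely claim TPR 0 and any attempt to write a nonzero value is a kernel bug. For contrast, the native helper these replace looks roughly like this (paraphrased):

static inline unsigned long native_read_cr8(void)
{
	unsigned long cr8;

	/* CR8 mirrors the local APIC TPR on 64-bit. */
	asm volatile("mov %%cr8, %0" : "=r" (cr8));
	return cr8;
}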
@@ -978,13 +1051,23 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
        .read_cr4_safe = native_read_cr4_safe,
        .write_cr4 = xen_write_cr4,
 
+#ifdef CONFIG_X86_64
+       .read_cr8 = xen_read_cr8,
+       .write_cr8 = xen_write_cr8,
+#endif
+
        .wbinvd = native_wbinvd,
 
        .read_msr = native_read_msr_safe,
+       .rdmsr_regs = native_rdmsr_safe_regs,
        .write_msr = xen_write_msr_safe,
+       .wrmsr_regs = native_wrmsr_safe_regs,
+
        .read_tsc = native_read_tsc,
        .read_pmc = native_read_pmc,
 
+       .read_tscp = native_read_tscp,
+
        .iret = xen_iret,
        .irq_enable_sysexit = xen_sysexit,
 #ifdef CONFIG_X86_64
@@ -1277,9 +1360,15 @@ asmlinkage void __init xen_start_kernel(void)
 
                /* Make sure ACS will be enabled */
                pci_request_acs();
-       }
-               
 
+               /* Avoid searching for BIOS MP tables */
+               x86_init.mpparse.find_smp_config = x86_init_noop;
+               x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+       }
+#ifdef CONFIG_PCI
+       /* PCI BIOS service won't work from a PV guest. */
+       pci_probe &= ~PCI_PROBE_BIOS;
+#endif
        xen_raw_console_write("about to get started...\n");
 
        xen_setup_runstate_info(0);
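Clearing PCI_PROBE_BIOS (the reason for the new asm/pci_x86.h include at the top of the file) has the same effect as booting with pci=nobios; for comparison, the command-line handler in arch/x86/pci/common.c masks the same flag (excerpt, paraphrased):

	/* pcibios_setup(), abbreviated: */
	if (!strcmp(str, "nobios")) {
		pci_probe &= ~PCI_PROBE_BIOS;
		return NULL;
	}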
@@ -1357,8 +1446,11 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
        switch (action) {
        case CPU_UP_PREPARE:
                xen_vcpu_setup(cpu);
-               if (xen_have_vector_callback)
+               if (xen_have_vector_callback) {
                        xen_init_lock_cpu(cpu);
+                       if (xen_feature(XENFEAT_hvm_safe_pvclock))
+                               xen_setup_timer(cpu);
+               }
                break;
        default:
                break;
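The new xen_feature() test is cheap enough for a hotplug notifier: the feature flags are fetched once at boot via the XENVER_get_features hypercall and cached in a bitmap, so the check is a plain array read. Paraphrased from include/xen/features.h:

extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];

static inline int xen_feature(int flag)
{
	return xen_features[flag];
}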