Merge branch 'linus' into xen-64bit
authorIngo Molnar <mingo@elte.hu>
Mon, 21 Jul 2008 13:06:09 +0000 (15:06 +0200)
committerIngo Molnar <mingo@elte.hu>
Mon, 21 Jul 2008 13:06:09 +0000 (15:06 +0200)
48 files changed:
arch/x86/ia32/ia32entry.S
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/amd_64.c
arch/x86/kernel/cpu/common_64.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/head64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/irq_32.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/process_64.c
arch/x86/kernel/setup.c
arch/x86/kernel/smpboot.c
arch/x86/mm/init_32.c
arch/x86/vdso/Makefile
arch/x86/vdso/vdso32-setup.c
arch/x86/vdso/vdso32.S
arch/x86/xen/Kconfig
arch/x86/xen/Makefile
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/mmu.h
arch/x86/xen/multicalls.c
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/xen-asm_32.S [moved from arch/x86/xen/xen-asm.S with 100% similarity]
arch/x86/xen/xen-asm_64.S [new file with mode: 0644]
arch/x86/xen/xen-head.S
arch/x86/xen/xen-ops.h
drivers/net/xen-netfront.c
drivers/xen/manage.c
include/asm-x86/paravirt.h
include/asm-x86/percpu.h
include/asm-x86/pgtable.h
include/asm-x86/pgtable_32.h
include/asm-x86/pgtable_64.h
include/asm-x86/setup.h
include/asm-x86/smp.h
include/asm-x86/vdso.h
include/asm-x86/xen/hypercall.h
include/asm-x86/xen/interface.h
include/asm-x86/xen/interface_32.h [new file with mode: 0644]
include/asm-x86/xen/interface_64.h [new file with mode: 0644]
include/asm-x86/xen/page.h
include/xen/hvc-console.h
include/xen/interface/callback.h
include/xen/xen-ops.h
kernel/power/Kconfig

index 20371d0..0ae1e77 100644 (file)
@@ -321,6 +321,7 @@ ENTRY(ia32_syscall)
        /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
        /*CFI_REL_OFFSET        cs,CS-RIP*/
        CFI_REL_OFFSET  rip,RIP-RIP
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        SWAPGS
        /*
         * No need to follow this irqs on/off section: the syscall
index bacf5de..aa89387 100644 (file)
@@ -18,6 +18,8 @@
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
 
+#include <xen/interface/xen.h>
+
 #define __NO_STUBS 1
 #undef __SYSCALL
 #undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
        OFFSET(BP_loadflags, boot_params, hdr.loadflags);
        OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
        OFFSET(BP_version, boot_params, hdr.version);
+
+       BLANK();
+       DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+       BLANK();
+       OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+       OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
        return 0;
 }
index 7c36fb8..d1692b2 100644 (file)
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
        /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
        if (c->x86_power & (1<<8))
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+       set_cpu_cap(c, X86_FEATURE_SYSCALL32);
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
index 7b8cc72..736f50f 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/i387.h>
 #include <asm/msr.h>
 #include <asm/io.h>
+#include <asm/linkage.h>
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
@@ -316,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
                c->x86_phys_bits = eax & 0xff;
        }
 
-       /* Assume all 64-bit CPUs support 32-bit syscall */
-       set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-
        if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
            cpu_devs[c->x86_vendor]->c_early_init)
                cpu_devs[c->x86_vendor]->c_early_init(c);
@@ -517,8 +515,7 @@ void pda_init(int cpu)
 }
 
 char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-                          DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
+                          DEBUG_STKSZ] __page_aligned_bss;
 
 extern asmlinkage void ignore_sysret(void);
 
index ae63e58..80d5663 100644 (file)
@@ -1189,6 +1189,7 @@ END(device_not_available)
        /* runs on exception stack */
 KPROBE_ENTRY(debug)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $0
        CFI_ADJUST_CFA_OFFSET 8         
        paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1199,7 @@ KPROBE_END(debug)
        /* runs on exception stack */   
 KPROBE_ENTRY(nmi)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $-1
        CFI_ADJUST_CFA_OFFSET 8
        paranoidentry do_nmi, 0, 0
@@ -1211,6 +1213,7 @@ KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $0
        CFI_ADJUST_CFA_OFFSET 8
        paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun)
        /* runs on exception stack */
 ENTRY(double_fault)
        XCPT_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        paranoidentry do_double_fault
        jmp paranoid_exit1
        CFI_ENDPROC
@@ -1253,6 +1257,7 @@ END(segment_not_present)
        /* runs on exception stack */
 ENTRY(stack_segment)
        XCPT_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        paranoidentry do_stack_segment
        jmp paranoid_exit1
        CFI_ENDPROC
@@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug)
        /* runs on exception stack */
 ENTRY(machine_check)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $0
        CFI_ADJUST_CFA_OFFSET 8 
        paranoidentry do_machine_check
@@ -1312,3 +1318,103 @@ KPROBE_ENTRY(ignore_sysret)
        sysret
        CFI_ENDPROC
 ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+       zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+       CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+   see the correct pointer to the pt_regs */
+       movq %rdi, %rsp            # we don't return, adjust the stack frame
+       CFI_ENDPROC
+       CFI_DEFAULT_STACK
+11:    incl %gs:pda_irqcount
+       movq %rsp,%rbp
+       CFI_DEF_CFA_REGISTER rbp
+       cmovzq %gs:pda_irqstackptr,%rsp
+       pushq %rbp                      # backlink for old unwinder
+       call xen_evtchn_do_upcall
+       popq %rsp
+       CFI_DEF_CFA_REGISTER rsp
+       decl %gs:pda_irqcount
+       jmp  error_exit
+       CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+       framesz = (RIP-0x30)    /* workaround buggy gas */
+       _frame framesz
+       CFI_REL_OFFSET rcx, 0
+       CFI_REL_OFFSET r11, 8
+       movw %ds,%cx
+       cmpw %cx,0x10(%rsp)
+       CFI_REMEMBER_STATE
+       jne 1f
+       movw %es,%cx
+       cmpw %cx,0x18(%rsp)
+       jne 1f
+       movw %fs,%cx
+       cmpw %cx,0x20(%rsp)
+       jne 1f
+       movw %gs,%cx
+       cmpw %cx,0x28(%rsp)
+       jne 1f
+       /* All segments match their saved values => Category 2 (Bad IRET). */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %r11
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %rcx
+       CFI_ADJUST_CFA_OFFSET 8
+       jmp general_protection
+       CFI_RESTORE_STATE
+1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8
+       SAVE_ALL
+       jmp error_exit
+       CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
index c978198..1b318e9 100644 (file)
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
 static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
 #endif
 
+void __init x86_64_init_pda(void)
+{
+       _cpu_pda = __cpu_pda;
+       cpu_pda(0) = &_boot_cpu_pda;
+       pda_init(0);
+}
+
 static void __init zap_identity_mappings(void)
 {
        pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
        early_printk("Kernel alive\n");
 
-       _cpu_pda = __cpu_pda;
-       cpu_pda(0) = &_boot_cpu_pda;
-       pda_init(0);
+       x86_64_init_pda();
 
        early_printk("Kernel really alive\n");
 
index b07ac7b..db3280a 100644 (file)
@@ -407,6 +407,7 @@ ENTRY(phys_base)
        /* This must match the first entry in level2_kernel_pgt */
        .quad   0x0000000000000000
 
+#include "../../x86/xen/xen-head.S"
        
        .section .bss, "aw", @nobits
        .align L1_CACHE_BYTES
index 47a6f6f..1cf8c1f 100644 (file)
@@ -83,11 +83,8 @@ union irq_ctx {
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
 static void call_on_stack(void *func, void *stack)
 {
index e0f571d..2963ab5 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
 #include <asm/time.h>
 #include <asm/pgalloc.h>
 #include <asm/irq.h>
@@ -373,6 +374,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
        .pagetable_setup_start = native_pagetable_setup_start,
        .pagetable_setup_done = native_pagetable_setup_done,
+#else
+       .pagetable_setup_start = paravirt_nop,
+       .pagetable_setup_done = paravirt_nop,
 #endif
 
        .read_cr2 = native_read_cr2,
index a8e5362..e8a8e1b 100644 (file)
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
-       struct thread_struct *prev = &prev_p->thread,
-                                *next = &next_p->thread;
+       struct thread_struct *prev = &prev_p->thread;
+       struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
        /* 
         * Switch FS and GS.
+        *
+        * Segment register != 0 always requires a reload.  Also
+        * reload when it has changed.  When prev process used 64bit
+        * base always reload to avoid an information leak.
         */
-       { 
-               /* segment register != 0 always requires a reload. 
-                  also reload when it has changed. 
-                  when prev process used 64bit base always reload
-                  to avoid an information leak. */
-               if (unlikely(fsindex | next->fsindex | prev->fs)) {
-                       loadsegment(fs, next->fsindex);
-                       /* check if the user used a selector != 0
-                        * if yes clear 64bit base, since overloaded base
-                         * is always mapped to the Null selector
-                         */
-                       if (fsindex)
+       if (unlikely(fsindex | next->fsindex | prev->fs)) {
+               loadsegment(fs, next->fsindex);
+               /* 
+                * Check if the user used a selector != 0; if yes
+                *  clear 64bit base, since overloaded base is always
+                *  mapped to the Null selector
+                */
+               if (fsindex)
                        prev->fs = 0;                           
-               }
-               /* when next process has a 64bit base use it */
-               if (next->fs) 
-                       wrmsrl(MSR_FS_BASE, next->fs); 
-               prev->fsindex = fsindex;
-
-               if (unlikely(gsindex | next->gsindex | prev->gs)) {
-                       load_gs_index(next->gsindex);
-                       if (gsindex)
+       }
+       /* when next process has a 64bit base use it */
+       if (next->fs)
+               wrmsrl(MSR_FS_BASE, next->fs);
+       prev->fsindex = fsindex;
+
+       if (unlikely(gsindex | next->gsindex | prev->gs)) {
+               load_gs_index(next->gsindex);
+               if (gsindex)
                        prev->gs = 0;                           
-               }
-               if (next->gs)
-                       wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
-               prev->gsindex = gsindex;
        }
+       if (next->gs)
+               wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+       prev->gsindex = gsindex;
 
        /* Must be after DS reload */
        unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        write_pda(pcurrent, next_p); 
 
        write_pda(kernelstack,
-       (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+                 (unsigned long)task_stack_page(next_p) +
+                 THREAD_SIZE - PDA_STACKOFFSET);
 #ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
index 531b55b..c9010f8 100644 (file)
@@ -824,7 +824,10 @@ void __init setup_arch(char **cmdline_p)
        vmi_init();
 #endif
 
+       paravirt_pagetable_setup_start(swapper_pg_dir);
        paging_init();
+       paravirt_pagetable_setup_done(swapper_pg_dir);
+       paravirt_post_allocator_init();
 
 #ifdef CONFIG_X86_64
        map_vsyscall();
index 687376a..1deb3b6 100644 (file)
@@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
  *
  * Must be called after the _cpu_pda pointer table is initialized.
  */
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
 {
        struct x8664_pda *oldpda, *newpda;
        unsigned long size = sizeof(struct x8664_pda);
index 9689a51..7113acd 100644 (file)
@@ -868,8 +868,6 @@ void __init paging_init(void)
         */
        sparse_init();
        zone_sizes_init();
-
-       paravirt_post_allocator_init();
 }
 
 /*
index b7ad9f8..4d6ef0a 100644 (file)
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
 obj-$(VDSO32-y)                        += vdso32-syms.lds
-vdso32.so-$(CONFIG_X86_32)     += int80
+vdso32.so-$(VDSO32-y)          += int80
 vdso32.so-$(CONFIG_COMPAT)     += syscall
 vdso32.so-$(VDSO32-y)          += sysenter
 
index 0bce542..513f330 100644 (file)
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
        }
 }
 
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
 static struct page *vdso32_pages[1];
 
 #ifdef CONFIG_X86_64
 
 #define        vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define        vdso32_syscall()        (boot_cpu_has(X86_FEATURE_SYSCALL32))
 
 /* May not be __init: called during resume */
 void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
 #else  /* CONFIG_X86_32 */
 
 #define vdso32_sysenter()      (boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall()       (0)
 
 void enable_sep_cpu(void)
 {
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
        gate_vma_init();
 #endif
 
-       if (!vdso32_sysenter()) {
-               vsyscall = &vdso32_default_start;
-               vsyscall_len = &vdso32_default_end - &vdso32_default_start;
-       } else {
+       if (vdso32_syscall()) {
+               vsyscall = &vdso32_syscall_start;
+               vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+       } else if (vdso32_sysenter()){
                vsyscall = &vdso32_sysenter_start;
                vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+       } else {
+               vsyscall = &vdso32_int80_start;
+               vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
        }
 
        memcpy(syscall_page, vsyscall, vsyscall_len);
index 1e36f72..2ce5f82 100644 (file)
@@ -2,14 +2,17 @@
 
 __INITDATA
 
-       .globl vdso32_default_start, vdso32_default_end
-vdso32_default_start:
-#ifdef CONFIG_X86_32
+       .globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
        .incbin "arch/x86/vdso/vdso32-int80.so"
-#else
+vdso32_int80_end:
+
+       .globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+#ifdef CONFIG_COMPAT
        .incbin "arch/x86/vdso/vdso32-syscall.so"
 #endif
-vdso32_default_end:
+vdso32_syscall_end:
 
        .globl vdso32_sysenter_start, vdso32_sysenter_end
 vdso32_sysenter_start:
index c2cc995..3815e42 100644 (file)
@@ -6,8 +6,8 @@ config XEN
        bool "Xen guest support"
        select PARAVIRT
        select PARAVIRT_CLOCK
-       depends on X86_32
-       depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+       depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+       depends on X86_CMPXCHG && X86_TSC
        help
          This is the Linux Xen port.  Enabling this will allow the
          kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
 
 config XEN_MAX_DOMAIN_MEMORY
        int "Maximum allowed size of a domain in gigabytes"
-       default 8
+       default 8 if X86_32
+       default 32 if X86_64
        depends on XEN
        help
          The pseudo-physical to machine address array is sized
          according to the maximum possible memory size of a Xen
          domain.  This array uses 1 page per gigabyte, so there's no
-         need to be too stingy here.
\ No newline at end of file
+         need to be too stingy here.
+
+config XEN_SAVE_RESTORE
+       bool
+       depends on PM
+       default y
\ No newline at end of file
index 2ba2d16..59c1e53 100644 (file)
@@ -1,4 +1,4 @@
 obj-y          := enlighten.o setup.o multicalls.o mmu.o \
-                       time.o xen-asm.o grant-table.o suspend.o
+                       time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)      += smp.o
index bb50845..3da6acb 100644 (file)
@@ -33,6 +33,7 @@
 #include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
 #include <asm/page.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
-#include <asm/pgalloc.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -56,6 +57,18 @@ EXPORT_SYMBOL_GPL(hypercall_page);
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
 /*
  * Note about cr3 (pagetable base) values:
  *
@@ -363,14 +376,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
 
 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-       xen_mc_batch();
-
-       load_TLS_descriptor(t, cpu, 0);
-       load_TLS_descriptor(t, cpu, 1);
-       load_TLS_descriptor(t, cpu, 2);
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-
        /*
         * XXX sleazy hack: If we're being called in a lazy-cpu zone,
         * it means we're in a context switch, and %gs has just been
@@ -379,10 +384,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
         * Either way, it has been saved, and the new value will get
         * loaded properly.  This will go away as soon as Xen has been
         * modified to not save/restore %gs for normal hypercalls.
+        *
+        * On x86_64, this hack is not used for %gs, because gs points
+        * to KERNEL_GS_BASE (and uses it for PDA references), so we
+        * must not zero %gs on x86_64
+        *
+        * For x86_64, we need to zero %fs, otherwise we may get an
+        * exception between the new %fs descriptor being loaded and
+        * %fs being effectively cleared at __switch_to().
         */
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
                loadsegment(gs, 0);
+#else
+               loadsegment(fs, 0);
+#endif
+       }
+
+       xen_mc_batch();
+
+       load_TLS_descriptor(t, cpu, 0);
+       load_TLS_descriptor(t, cpu, 1);
+       load_TLS_descriptor(t, cpu, 2);
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+       if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+               BUG();
 }
+#endif
 
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
                                const void *ptr)
@@ -400,23 +434,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
        preempt_enable();
 }
 
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
                            struct trap_info *info)
 {
-       u8 type, dpl;
-
-       type = (high >> 8) & 0x1f;
-       dpl = (high >> 13) & 3;
-
-       if (type != 0xf && type != 0xe)
+       if (val->type != 0xf && val->type != 0xe)
                return 0;
 
        info->vector = vector;
-       info->address = (high & 0xffff0000) | (low & 0x0000ffff);
-       info->cs = low >> 16;
-       info->flags = dpl;
+       info->address = gate_offset(*val);
+       info->cs = gate_segment(*val);
+       info->flags = val->dpl;
        /* interrupt gates clear IF */
-       if (type == 0xe)
+       if (val->type == 0xe)
                info->flags |= 4;
 
        return 1;
@@ -443,11 +472,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 
        if (p >= start && (p + 8) <= end) {
                struct trap_info info[2];
-               u32 *desc = (u32 *)g;
 
                info[1].address = 0;
 
-               if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+               if (cvt_gate_to_trap(entrynum, g, &info[0]))
                        if (HYPERVISOR_set_trap_table(info))
                                BUG();
        }
@@ -460,13 +488,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
 {
        unsigned in, out, count;
 
-       count = (desc->size+1) / 8;
+       count = (desc->size+1) / sizeof(gate_desc);
        BUG_ON(count > 256);
 
        for (in = out = 0; in < count; in++) {
-               const u32 *entry = (u32 *)(desc->address + in * 8);
+               gate_desc *entry = (gate_desc*)(desc->address) + in;
 
-               if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+               if (cvt_gate_to_trap(in, entry, &traps[out]))
                        out++;
        }
        traps[out].address = 0;
@@ -695,33 +723,89 @@ static void set_current_cr3(void *v)
        x86_write_percpu(xen_current_cr3, (unsigned long)v);
 }
 
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
        struct mmuext_op *op;
        struct multicall_space mcs;
-       unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       unsigned long mfn;
 
-       BUG_ON(preemptible());
+       if (cr3)
+               mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       else
+               mfn = 0;
 
-       mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+       WARN_ON(mfn == 0 && kernel);
 
-       /* Update while interrupts are disabled, so its atomic with
-          respect to ipis */
-       x86_write_percpu(xen_cr3, cr3);
+       mcs = __xen_mc_entry(sizeof(*op));
 
        op = mcs.args;
-       op->cmd = MMUEXT_NEW_BASEPTR;
+       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
        op->arg1.mfn = mfn;
 
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
-       /* Update xen_update_cr3 once the batch has actually
-          been submitted. */
-       xen_mc_callback(set_current_cr3, (void *)cr3);
+       if (kernel) {
+               x86_write_percpu(xen_cr3, cr3);
+
+               /* Update xen_current_cr3 once the batch has actually
+                  been submitted. */
+               xen_mc_callback(set_current_cr3, (void *)cr3);
+       }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       xen_mc_batch();  /* disables interrupts */
+
+       /* Update while interrupts are disabled, so its atomic with
+          respect to ipis */
+       x86_write_percpu(xen_cr3, cr3);
+
+       __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+               if (user_pgd)
+                       __xen_write_cr3(false, __pa(user_pgd));
+               else
+                       __xen_write_cr3(false, 0);
+       }
+#endif
 
        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+       int ret;
+
+       ret = 0;
+
+       switch(msr) {
+#ifdef CONFIG_X86_64
+               unsigned which;
+               u64 base;
+
+       case MSR_FS_BASE:               which = SEGBASE_FS; goto set;
+       case MSR_KERNEL_GS_BASE:        which = SEGBASE_GS_USER; goto set;
+       case MSR_GS_BASE:               which = SEGBASE_GS_KERNEL; goto set;
+
+       set:
+               base = ((u64)high << 32) | low;
+               if (HYPERVISOR_set_segment_base(which, base) != 0)
+                       ret = -EFAULT;
+               break;
+#endif
+       default:
+               ret = native_write_msr_safe(msr, low, high);
+       }
+
+       return ret;
+}
+
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -778,6 +862,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
        xen_alloc_ptpage(mm, pfn, PT_PMD);
 }
 
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+       pgd_t *pgd = mm->pgd;
+       int ret = 0;
+
+       BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+       {
+               struct page *page = virt_to_page(pgd);
+               pgd_t *user_pgd;
+
+               BUG_ON(page->private != 0);
+
+               ret = -ENOMEM;
+
+               user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+               page->private = (unsigned long)user_pgd;
+
+               if (user_pgd != NULL) {
+                       user_pgd[pgd_index(VSYSCALL_START)] =
+                               __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+                       ret = 0;
+               }
+
+               BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+       }
+#endif
+
+       return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+       pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+       if (user_pgd)
+               free_page((unsigned long)user_pgd);
+#endif
+}
+
 /* This should never happen until we're OK to use struct page */
 static void xen_release_ptpage(u32 pfn, unsigned level)
 {
@@ -803,6 +929,18 @@ static void xen_release_pmd(u32 pfn)
        xen_release_ptpage(pfn, PT_PMD);
 }
 
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+{
+       xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(u32 pfn)
+{
+       xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
 #ifdef CONFIG_HIGHPTE
 static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 {
@@ -841,68 +979,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
-       pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
-       int i;
-
-       /* special set_pte for pagetable initialization */
-       pv_mmu_ops.set_pte = xen_set_pte_init;
-
-       init_mm.pgd = base;
-       /*
-        * copy top-level of Xen-supplied pagetable into place.  This
-        * is a stand-in while we copy the pmd pages.
-        */
-       memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
-       /*
-        * For PAE, need to allocate new pmds, rather than
-        * share Xen's, since Xen doesn't like pmd's being
-        * shared between address spaces.
-        */
-       for (i = 0; i < PTRS_PER_PGD; i++) {
-               if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
-                       pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
-                       memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
-                              PAGE_SIZE);
-
-                       make_lowmem_page_readonly(pmd);
-
-                       set_pgd(&base[i], __pgd(1 + __pa(pmd)));
-               } else
-                       pgd_clear(&base[i]);
-       }
-
-       /* make sure zero_page is mapped RO so we can use it in pagetables */
-       make_lowmem_page_readonly(empty_zero_page);
-       make_lowmem_page_readonly(base);
-       /*
-        * Switch to new pagetable.  This is done before
-        * pagetable_init has done anything so that the new pages
-        * added to the table can be prepared properly for Xen.
-        */
-       xen_write_cr3(__pa(base));
-
-       /* Unpin initial Xen pagetable */
-       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
-                         PFN_DOWN(__pa(xen_start_info->pt_base)));
 }
 
 void xen_setup_shared_info(void)
 {
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-               unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
-               /*
-                * Create a mapping for the shared info page.
-                * Should be set_fixmap(), but shared_info is a machine
-                * address with no corresponding pseudo-phys address.
-                */
-               set_pte_mfn(addr,
-                           PFN_DOWN(xen_start_info->shared_info),
-                           PAGE_KERNEL);
-
-               HYPERVISOR_shared_info = (struct shared_info *)addr;
+               set_fixmap(FIX_PARAVIRT_BOOTMAP,
+                          xen_start_info->shared_info);
+
+               HYPERVISOR_shared_info =
+                       (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
        } else
                HYPERVISOR_shared_info =
                        (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1003,32 @@ void xen_setup_shared_info(void)
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
 {
-       /* This will work as long as patching hasn't happened yet
-          (which it hasn't) */
-       pv_mmu_ops.alloc_pte = xen_alloc_pte;
-       pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-       pv_mmu_ops.release_pte = xen_release_pte;
-       pv_mmu_ops.release_pmd = xen_release_pmd;
-       pv_mmu_ops.set_pte = xen_set_pte;
-
        xen_setup_shared_info();
-
-       /* Actually pin the pagetable down, but we can't set PG_pinned
-          yet because the page structures don't exist yet. */
-       pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
 }
 
 static __init void xen_post_allocator_init(void)
 {
+       pv_mmu_ops.set_pte = xen_set_pte;
        pv_mmu_ops.set_pmd = xen_set_pmd;
        pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+       pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+       /* This will work as long as patching hasn't happened yet
+          (which it hasn't) */
+       pv_mmu_ops.alloc_pte = xen_alloc_pte;
+       pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+       pv_mmu_ops.release_pte = xen_release_pte;
+       pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+       pv_mmu_ops.alloc_pud = xen_alloc_pud;
+       pv_mmu_ops.release_pud = xen_release_pud;
+#endif
 
+#ifdef CONFIG_X86_64
+       SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
        xen_mark_init_mm_pinned();
 }
 
@@ -950,6 +1042,7 @@ void xen_setup_vcpu_info_placement(void)
 
        /* xen_vcpu_setup managed to place the vcpu_info within the
           percpu area for all cpus, so make use of it */
+#ifdef CONFIG_X86_32
        if (have_vcpu_info_placement) {
                printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
@@ -959,6 +1052,7 @@ void xen_setup_vcpu_info_placement(void)
                pv_irq_ops.irq_enable = xen_irq_enable_direct;
                pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
        }
+#endif
 }
 
 static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1073,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
        goto patch_site
 
        switch (type) {
+#ifdef CONFIG_X86_32
                SITE(pv_irq_ops, irq_enable);
                SITE(pv_irq_ops, irq_disable);
                SITE(pv_irq_ops, save_fl);
                SITE(pv_irq_ops, restore_fl);
+#endif /* CONFIG_X86_32 */
 #undef SITE
 
        patch_site:
@@ -1025,8 +1121,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
 #ifdef CONFIG_X86_F00F_BUG
        case FIX_F00F_IDT:
 #endif
+#ifdef CONFIG_X86_32
        case FIX_WP_TEST:
        case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+       case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+       case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
 #ifdef CONFIG_X86_LOCAL_APIC
        case FIX_APIC_BASE:     /* maps dummy local APIC */
 #endif
@@ -1039,6 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
        }
 
        __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+       /* Replicate changes to map the vsyscall page into the user
+          pagetable vsyscall mapping. */
+       if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+               unsigned long vaddr = __fix_to_virt(idx);
+               set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+       }
+#endif
 }
 
 static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1196,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .wbinvd = native_wbinvd,
 
        .read_msr = native_read_msr_safe,
-       .write_msr = native_write_msr_safe,
+       .write_msr = xen_write_msr_safe,
        .read_tsc = native_read_tsc,
        .read_pmc = native_read_pmc,
 
        .iret = xen_iret,
        .irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+       .usergs_sysret32 = xen_sysret32,
+       .usergs_sysret64 = xen_sysret64,
+#endif
 
        .load_tr_desc = paravirt_nop,
        .set_ldt = xen_set_ldt,
        .load_gdt = xen_load_gdt,
        .load_idt = xen_load_idt,
        .load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+       .load_gs_index = xen_load_gs_index,
+#endif
 
        .store_gdt = native_store_gdt,
        .store_idt = native_store_idt,
@@ -1109,14 +1228,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .set_iopl_mask = xen_set_iopl_mask,
        .io_delay = xen_io_delay,
 
+       /* Xen takes care of %gs when switching to usermode for us */
+       .swapgs = paravirt_nop,
+
        .lazy_mode = {
                .enter = paravirt_enter_lazy_cpu,
                .leave = xen_leave_lazy,
        },
 };
 
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+       int i;
+
+       /* Create identity vector->irq map */
+       for(i = 0; i < NR_VECTORS; i++) {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       per_cpu(vector_irq, cpu)[i] = i;
+       }
+#endif /* CONFIG_X86_64 */
+
+       xen_init_IRQ();
+}
+
 static const struct pv_irq_ops xen_irq_ops __initdata = {
-       .init_IRQ = xen_init_IRQ,
+       .init_IRQ = __xen_init_IRQ,
        .save_fl = xen_save_fl,
        .restore_fl = xen_restore_fl,
        .irq_disable = xen_irq_disable,
@@ -1124,7 +1263,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
        .safe_halt = xen_safe_halt,
        .halt = xen_halt,
 #ifdef CONFIG_X86_64
-       .adjust_exception_frame = paravirt_nop,
+       .adjust_exception_frame = xen_adjust_exception_frame,
 #endif
 };
 
@@ -1157,8 +1296,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,
 
-       .pgd_alloc = __paravirt_pgd_alloc,
-       .pgd_free = paravirt_nop,
+       .pgd_alloc = xen_pgd_alloc,
+       .pgd_free = xen_pgd_free,
 
        .alloc_pte = xen_alloc_pte_init,
        .release_pte = xen_release_pte_init,
@@ -1170,7 +1309,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .kmap_atomic_pte = xen_kmap_atomic_pte,
 #endif
 
-       .set_pte = NULL,        /* see xen_pagetable_setup_* */
+#ifdef CONFIG_X86_64
+       .set_pte = xen_set_pte,
+#else
+       .set_pte = xen_set_pte_init,
+#endif
        .set_pte_at = xen_set_pte_at,
        .set_pmd = xen_set_pmd_hyper,
 
@@ -1184,15 +1327,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .make_pte = xen_make_pte,
        .make_pgd = xen_make_pgd,
 
+#ifdef CONFIG_X86_PAE
        .set_pte_atomic = xen_set_pte_atomic,
        .set_pte_present = xen_set_pte_at,
-       .set_pud = xen_set_pud_hyper,
        .pte_clear = xen_pte_clear,
        .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+       .set_pud = xen_set_pud_hyper,
 
        .make_pmd = xen_make_pmd,
        .pmd_val = xen_pmd_val,
 
+#if PAGETABLE_LEVELS == 4
+       .pud_val = xen_pud_val,
+       .make_pud = xen_make_pud,
+       .set_pgd = xen_set_pgd_hyper,
+
+       .alloc_pud = xen_alloc_pte_init,
+       .release_pud = xen_release_pte_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
        .activate_mm = xen_activate_mm,
        .dup_mmap = xen_dup_mmap,
        .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1359,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .set_fixmap = xen_set_fixmap,
 };
 
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
-       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
-       .smp_prepare_cpus = xen_smp_prepare_cpus,
-       .cpu_up = xen_cpu_up,
-       .smp_cpus_done = xen_smp_cpus_done,
-
-       .smp_send_stop = xen_smp_send_stop,
-       .smp_send_reschedule = xen_smp_send_reschedule,
-
-       .send_call_func_ipi = xen_smp_send_call_function_ipi,
-       .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-};
-#endif /* CONFIG_SMP */
-
 static void xen_reboot(int reason)
 {
        struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1403,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
 
 static void __init xen_reserve_top(void)
 {
+#ifdef CONFIG_X86_32
        unsigned long top = HYPERVISOR_VIRT_START;
        struct xen_platform_parameters pp;
 
@@ -1271,7 +1411,247 @@ static void __init xen_reserve_top(void)
                top = pp.virt_start;
 
        reserve_top_address(-top + 2 * PAGE_SIZE);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+       return (void *)(paddr + __START_KERNEL_map);
+#else
+       return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+       phys_addr_t paddr;
+
+       maddr &= PTE_MASK;
+       paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+       return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+       return __ka(m2p(maddr));
+}
+
+#ifdef CONFIG_X86_64
+static void walk(pgd_t *pgd, unsigned long addr)
+{
+       unsigned l4idx = pgd_index(addr);
+       unsigned l3idx = pud_index(addr);
+       unsigned l2idx = pmd_index(addr);
+       unsigned l1idx = pte_index(addr);
+       pgd_t l4;
+       pud_t l3;
+       pmd_t l2;
+       pte_t l1;
+
+       xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
+                      pgd, addr, l4idx, l3idx, l2idx, l1idx);
+
+       l4 = pgd[l4idx];
+       xen_raw_printk("  l4: %016lx\n", l4.pgd);
+       xen_raw_printk("      %016lx\n", pgd_val(l4));
+
+       l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
+       xen_raw_printk("  l3: %016lx\n", l3.pud);
+       xen_raw_printk("      %016lx\n", pud_val(l3));
+
+       l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
+       xen_raw_printk("  l2: %016lx\n", l2.pmd);
+       xen_raw_printk("      %016lx\n", pmd_val(l2));
+
+       l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
+       xen_raw_printk("  l1: %016lx\n", l1.pte);
+       xen_raw_printk("      %016lx\n", pte_val(l1));
+}
+#endif
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+       unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+       pte_t pte = pfn_pte(pfn, prot);
+
+       xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
+                      addr, pfn, get_phys_to_machine(pfn),
+                      pgprot_val(prot), pte.pte);
+
+       if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+               BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+       unsigned pmdidx, pteidx;
+       unsigned ident_pte;
+       unsigned long pfn;
+
+       ident_pte = 0;
+       pfn = 0;
+       for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+               pte_t *pte_page;
+
+               /* Reuse or allocate a page of ptes */
+               if (pmd_present(pmd[pmdidx]))
+                       pte_page = m2v(pmd[pmdidx].pmd);
+               else {
+                       /* Check for free pte pages */
+                       if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+                               break;
+
+                       pte_page = &level1_ident_pgt[ident_pte];
+                       ident_pte += PTRS_PER_PTE;
+
+                       pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+               }
+
+               /* Install mappings */
+               for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+                       pte_t pte;
+
+                       if (pfn > max_pfn_mapped)
+                               max_pfn_mapped = pfn;
+
+                       if (!pte_none(pte_page[pteidx]))
+                               continue;
+
+                       pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+                       pte_page[pteidx] = pte;
+               }
+       }
+
+       for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+               set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+       set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+       pte_t *pte = v;
+       int i;
+
+       /* All levels are converted the same way, so just treat them
+          as ptes. */
+       for(i = 0; i < PTRS_PER_PTE; i++)
+               pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the inital kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+       pud_t *l3;
+       pmd_t *l2;
+
+       /* Zap identity mapping */
+       init_level4_pgt[0] = __pgd(0);
+
+       /* Pre-constructed entries are in pfn, so convert to mfn */
+       convert_pfn_mfn(init_level4_pgt);
+       convert_pfn_mfn(level3_ident_pgt);
+       convert_pfn_mfn(level3_kernel_pgt);
+
+       l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+       l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+       memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+       memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+       l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+       memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       /* Set up identity map */
+       xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+       /* Make pagetable pieces RO */
+       set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+       /* Pin down new L4 */
+       pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+                         PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+       /* Unpin Xen-provided one */
+       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+       /* Switch over */
+       pgd = init_level4_pgt;
+
+       /*
+        * At this stage there can be no user pgd, and no page
+        * structure to attach it to, so make sure we just set kernel
+        * pgd.
+        */
+       xen_mc_batch();
+       __xen_write_cr3(true, __pa(pgd));
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+       reserve_early(__pa(xen_start_info->pt_base),
+                     __pa(xen_start_info->pt_base +
+                          xen_start_info->nr_pt_frames * PAGE_SIZE),
+                     "XEN PAGETABLES");
+
+       return pgd;
+}
+#else  /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+       pmd_t *kernel_pmd;
+
+       init_pg_tables_start = __pa(pgd);
+       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+       max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+       memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+       memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+       set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+                       __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+       set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+       xen_write_cr3(__pa(swapper_pg_dir));
+
+       pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+       return swapper_pg_dir;
 }
+#endif /* CONFIG_X86_64 */
 
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
@@ -1301,53 +1681,56 @@ asmlinkage void __init xen_start_kernel(void)
 
        machine_ops = xen_machine_ops;
 
-#ifdef CONFIG_SMP
-       smp_ops = xen_smp_ops;
+#ifdef CONFIG_X86_64
+       /* Disable until direct per-cpu data access. */
+       have_vcpu_info_placement = 0;
+       x86_64_init_pda();
 #endif
 
+       xen_smp_init();
+
        /* Get mfn list */
        if (!xen_feature(XENFEAT_auto_translated_physmap))
                xen_build_dynamic_phys_to_machine();
 
        pgd = (pgd_t *)xen_start_info->pt_base;
 
-       init_pg_tables_start = __pa(pgd);
-       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-       max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT;
-
-       init_mm.pgd = pgd; /* use the Xen pagetables to start */
-
-       /* keep using Xen gdt for now; no urgent need to change it */
-
-       x86_write_percpu(xen_cr3, __pa(pgd));
-       x86_write_percpu(xen_current_cr3, __pa(pgd));
+       /* Prevent unwanted bits from being set in PTEs. */
+       __supported_pte_mask &= ~_PAGE_GLOBAL;
+       if (!is_initial_xendomain())
+               __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
        /* Don't do the full vcpu_info placement stuff until we have a
           possible map and a non-dummy shared_info. */
        per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
+       xen_raw_console_write("mapping kernel into physical memory\n");
+       pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+
+       init_mm.pgd = pgd;
+
+       /* keep using Xen gdt for now; no urgent need to change it */
+
        pv_info.kernel_rpl = 1;
        if (xen_feature(XENFEAT_supervisor_mode_kernel))
                pv_info.kernel_rpl = 0;
 
-       /* Prevent unwanted bits from being set in PTEs. */
-       __supported_pte_mask &= ~_PAGE_GLOBAL;
-       if (!is_initial_xendomain())
-               __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
        /* set the limit of our address space */
        xen_reserve_top();
 
+#ifdef CONFIG_X86_32
        /* set up basic CPUID stuff */
        cpu_detect(&new_cpu_data);
        new_cpu_data.hard_math = 1;
        new_cpu_data.x86_capability[0] = cpuid_edx(1);
+#endif
 
        /* Poke various useful things into boot_params */
        boot_params.hdr.type_of_loader = (9 << 4) | 0;
        boot_params.hdr.ramdisk_image = xen_start_info->mod_start
                ? __pa(xen_start_info->mod_start) : 0;
        boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+       boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
        if (!is_initial_xendomain()) {
                add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1738,21 @@ asmlinkage void __init xen_start_kernel(void)
                add_preferred_console("hvc", 0, NULL);
        }
 
+       xen_raw_console_write("about to get started...\n");
+
+#if 0
+       xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
+                      &boot_params, __pa_symbol(&boot_params),
+                      __va(__pa_symbol(&boot_params)));
+
+       walk(pgd, &boot_params);
+       walk(pgd, __va(__pa(&boot_params)));
+#endif
+
        /* Start the world */
+#ifdef CONFIG_X86_32
        i386_start_kernel();
+#else
+       x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+#endif
 }
index ff0aa74..a44d56e 100644 (file)
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/fixmap.h>
 #include <asm/mmu_context.h>
 #include <asm/paravirt.h>
+#include <asm/linkage.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 #include "multicalls.h"
 #include "mmu.h"
 
+/*
+ * Just beyond the highest usermode address.  STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT     ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+
 #define P2M_ENTRIES_PER_PAGE   (PAGE_SIZE / sizeof(unsigned long))
 #define TOP_ENTRIES            (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 
 /* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
-       __attribute__((section(".data.page_aligned"))) =
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
                { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
 
  /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES]
-       __attribute__((section(".data.page_aligned"))) =
+static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
                { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
 
 /* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES]
-       __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
 
-static unsigned long p2m_top_mfn_list[
-                       PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
-       __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+       __page_aligned_bss;
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
        p2m_top[topidx][idx] = mfn;
 }
 
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
+       unsigned long address = (unsigned long)vaddr;
        unsigned int level;
        pte_t *pte = lookup_address(address, &level);
        unsigned offset = address & ~PAGE_MASK;
 
        BUG_ON(pte == NULL);
 
-       return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+       return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 }
 
 void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
        xen_mc_batch();
 
-       u.ptr = virt_to_machine(ptr).maddr;
+       /* ptr may be ioremapped for 64-bit pagetable setup */
+       u.ptr = arbitrary_virt_to_machine(ptr).maddr;
        u.val = pmd_val_ma(val);
        extend_mmu_update(&u);
 
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
  */
 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 {
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       if (pgd_none(*pgd)) {
-               BUG();
-               return;
-       }
-       pud = pud_offset(pgd, vaddr);
-       if (pud_none(*pud)) {
-               BUG();
-               return;
-       }
-       pmd = pmd_offset(pud, vaddr);
-       if (pmd_none(*pmd)) {
-               BUG();
-               return;
-       }
-       pte = pte_offset_kernel(pmd, vaddr);
-       /* <mfn,flags> stored as-is, to permit clearing entries */
-       xen_set_pte(pte, mfn_pte(mfn, flags));
-
-       /*
-        * It's enough to flush this one mapping.
-        * (PGE mappings get flushed as well)
-        */
-       __flush_tlb_one(vaddr);
+       set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 }
 
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
        xen_mc_batch();
 
-       u.ptr = virt_to_machine(ptr).maddr;
+       /* ptr may be ioremapped for 64-bit pagetable setup */
+       u.ptr = arbitrary_virt_to_machine(ptr).maddr;
        u.val = pud_val_ma(val);
        extend_mmu_update(&u);
 
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
+#ifdef CONFIG_X86_PAE
        ptep->pte_high = pte.pte_high;
        smp_wmb();
        ptep->pte_low = pte.pte_low;
+#else
+       *ptep = pte;
+#endif
 }
 
+#ifdef CONFIG_X86_PAE
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-       set_64bit((u64 *)ptep, pte_val_ma(pte));
+       set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
 {
        set_pmd(pmdp, __pmd(0));
 }
+#endif /* CONFIG_X86_PAE */
 
 pmd_t xen_make_pmd(pmdval_t pmd)
 {
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
        return native_make_pmd(pmd);
 }
 
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud)
+{
+       return pte_mfn_to_pfn(pud.pud);
+}
+
+pud_t xen_make_pud(pudval_t pud)
+{
+       pud = pte_pfn_to_mfn(pud);
+
+       return native_make_pud(pud);
+}
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+       pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+       unsigned offset = pgd - pgd_page;
+       pgd_t *user_ptr = NULL;
+
+       if (offset < pgd_index(USER_LIMIT)) {
+               struct page *page = virt_to_page(pgd_page);
+               user_ptr = (pgd_t *)page->private;
+               if (user_ptr)
+                       user_ptr += offset;
+       }
+
+       return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+       struct mmu_update u;
+
+       u.ptr = virt_to_machine(ptr).maddr;
+       u.val = pgd_val_ma(val);
+       extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure.  This implies:
+ *  1. The only existing pagetable is the kernel's
+ *  2. It is always pinned
+ *  3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+       preempt_disable();
+
+       xen_mc_batch();
+
+       __xen_set_pgd_hyper(ptr, val);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+       preempt_enable();
+}
+
+void xen_set_pgd(pgd_t *ptr, pgd_t val)
+{
+       pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
+       /* If page is not pinned, we can just update the entry
+          directly */
+       if (!page_pinned(ptr)) {
+               *ptr = val;
+               if (user_ptr) {
+                       WARN_ON(page_pinned(user_ptr));
+                       *user_ptr = val;
+               }
+               return;
+       }
+
+       /* If it's pinned, then we can at least batch the kernel and
+          user updates together. */
+       xen_mc_batch();
+
+       __xen_set_pgd_hyper(ptr, val);
+       if (user_ptr)
+               __xen_set_pgd_hyper(user_ptr, val);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
 /*
-  (Yet another) pagetable walker.  This one is intended for pinning a
-  pagetable.  This means that it walks a pagetable and calls the
-  callback function on each page it finds making up the page table,
-  at every level.  It walks the entire pagetable, but it only bothers
-  pinning pte pages which are below pte_limit.  In the normal case
-  this will be TASK_SIZE, but at boot we need to pin up to
-  FIXADDR_TOP.  But the important bit is that we don't pin beyond
-  there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * (Yet another) pagetable walker.  This one is intended for pinning a
+ * pagetable.  This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level.  It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit.  In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                    unsigned long limit)
 {
-       pgd_t *pgd = pgd_base;
        int flush = 0;
-       unsigned long addr = 0;
-       unsigned long pgd_next;
+       unsigned hole_low, hole_high;
+       unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+       unsigned pgdidx, pudidx, pmdidx;
 
-       BUG_ON(limit > FIXADDR_TOP);
+       /* The limit is the last byte to be touched */
+       limit--;
+       BUG_ON(limit >= FIXADDR_TOP);
 
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;
 
-       for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+       /*
+        * 64-bit has a great big hole in the middle of the address
+        * space, which contains the Xen mappings.  On 32-bit these
+        * will end up making a zero-sized hole and so is a no-op.
+        */
+       hole_low = pgd_index(USER_LIMIT);
+       hole_high = pgd_index(PAGE_OFFSET);
+
+       pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+       pudidx_limit = pud_index(limit);
+#else
+       pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+       pmdidx_limit = pmd_index(limit);
+#else
+       pmdidx_limit = 0;
+#endif
+
+       flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+       for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                pud_t *pud;
-               unsigned long pud_limit, pud_next;
 
-               pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+               if (pgdidx >= hole_low && pgdidx < hole_high)
+                       continue;
 
-               if (!pgd_val(*pgd))
+               if (!pgd_val(pgd[pgdidx]))
                        continue;
 
-               pud = pud_offset(pgd, 0);
+               pud = pud_offset(&pgd[pgdidx], 0);
 
                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(virt_to_page(pud), PT_PUD);
 
-               for (; addr != pud_limit; pud++, addr = pud_next) {
+               for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                        pmd_t *pmd;
-                       unsigned long pmd_limit;
 
-                       pud_next = pud_addr_end(addr, pud_limit);
-
-                       if (pud_next < limit)
-                               pmd_limit = pud_next;
-                       else
-                               pmd_limit = limit;
+                       if (pgdidx == pgdidx_limit &&
+                           pudidx > pudidx_limit)
+                               goto out;
 
-                       if (pud_none(*pud))
+                       if (pud_none(pud[pudidx]))
                                continue;
 
-                       pmd = pmd_offset(pud, 0);
+                       pmd = pmd_offset(&pud[pudidx], 0);
 
                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
-                       for (; addr != pmd_limit; pmd++) {
-                               addr += (PAGE_SIZE * PTRS_PER_PTE);
-                               if ((pmd_limit-1) < (addr-1)) {
-                                       addr = pmd_limit;
-                                       break;
-                               }
+                       for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+                               struct page *pte;
+
+                               if (pgdidx == pgdidx_limit &&
+                                   pudidx == pudidx_limit &&
+                                   pmdidx > pmdidx_limit)
+                                       goto out;
 
-                               if (pmd_none(*pmd))
+                               if (pmd_none(pmd[pmdidx]))
                                        continue;
 
-                               flush |= (*func)(pmd_page(*pmd), PT_PTE);
+                               pte = pmd_page(pmd[pmdidx]);
+                               flush |= (*func)(pte, PT_PTE);
                        }
                }
        }
-
-       flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
 
        return flush;
 }
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
 {
        xen_mc_batch();
 
-       if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+       if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
                /* re-enable interrupts for kmap_flush_unused */
                xen_mc_issue(0);
                kmap_flush_unused();
                xen_mc_batch();
        }
 
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+               xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+               if (user_pgd) {
+                       pin_page(virt_to_page(user_pgd), PT_PGD);
+                       xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+               }
+       }
+#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_PAE
+       /* Need to make sure unshared kernel PMD is pinnable */
+       pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
        xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
        xen_mc_issue(0);
 }
 
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
        spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-/* The init_mm pagetable is really pinned as soon as its created, but
-   that's before we have page structures to store the bits.  So do all
-   the book-keeping now. */
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits.  So do all
+ * the book-keeping now.
+ */
 static __init int mark_pinned(struct page *page, enum pt_level level)
 {
        SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
-       pgd_walk(pgd, unpin_page, TASK_SIZE);
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+               if (user_pgd) {
+                       xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+                       unpin_page(virt_to_page(user_pgd), PT_PGD);
+               }
+       }
+#endif
+
+#ifdef CONFIG_X86_PAE
+       /* Need to make sure unshared kernel PMD is unpinned */
+       pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
+       pgd_walk(pgd, unpin_page, USER_LIMIT);
 
        xen_mc_issue(0);
 }
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
        list_for_each_entry(page, &pgd_list, lru) {
                if (PageSavePinned(page)) {
                        BUG_ON(!PagePinned(page));
-                       printk("unpinning pinned %p\n", page_address(page));
                        xen_pgd_unpin((pgd_t *)page_address(page));
                        ClearPageSavePinned(page);
                }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static void drop_other_mm_ref(void *info)
 {
        struct mm_struct *mm = info;
+       struct mm_struct *active_mm;
+
+#ifdef CONFIG_X86_64
+       active_mm = read_pda(active_mm);
+#else
+       active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
+#endif
 
-       if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+       if (active_mm == mm)
                leave_mm(smp_processor_id());
 
        /* If this cpu still has a stale cr3 reference, then make sure
index 297bf9f..0f59bd0 100644 (file)
@@ -10,18 +10,6 @@ enum pt_level {
        PT_PTE
 };
 
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
 
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
 void xen_set_pte(pte_t *ptep, pte_t pteval);
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval);
+
+#ifdef CONFIG_X86_PAE
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+#endif /* CONFIG_X86_PAE */
+
 void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pud(pud_t *ptr, pud_t val);
 void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud);
+pud_t xen_make_pud(pudval_t pudval);
+void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
+void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
+#endif
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
index 3c63c4d..9efd1c6 100644 (file)
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
                if (ret) {
                        printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
                               ret, smp_processor_id());
+                       dump_stack();
                        for (i = 0; i < b->mcidx; i++) {
                                printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
                                       i+1, b->mcidx,
index e0a3959..b6acc3a 100644 (file)
@@ -83,30 +83,72 @@ static void xen_idle(void)
 
 /*
  * Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
  */
 static void __init fiddle_vdso(void)
 {
-       extern const char vdso32_default_start;
-       u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
+#ifdef CONFIG_X86_32
+       u32 *mask;
+       mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+       mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+       *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
 }
 
-void xen_enable_sysenter(void)
+static __cpuinit int register_callback(unsigned type, const void *func)
 {
-       int cpu = smp_processor_id();
-       extern void xen_sysenter_target(void);
-       /* Mask events on entry, even though they get enabled immediately */
-       static struct callback_register sysenter = {
-               .type = CALLBACKTYPE_sysenter,
-               .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+       struct callback_register callback = {
+               .type = type,
+               .address = XEN_CALLBACK(__KERNEL_CS, func),
                .flags = CALLBACKF_mask_events,
        };
 
-       if (!boot_cpu_has(X86_FEATURE_SEP) ||
-           HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
-               clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+       return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void __cpuinit xen_enable_sysenter(void)
+{
+       extern void xen_sysenter_target(void);
+       int ret;
+       unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+       sysenter_feature = X86_FEATURE_SEP;
+#else
+       sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+       if (!boot_cpu_has(sysenter_feature))
+               return;
+
+       ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+       if(ret != 0)
+               setup_clear_cpu_cap(sysenter_feature);
+}
+
+void __cpuinit xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+       int ret;
+       extern void xen_syscall_target(void);
+       extern void xen_syscall32_target(void);
+
+       ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+       if (ret != 0) {
+               printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+               /* Pretty fatal; 64-bit userspace has no other
+                  mechanism for syscalls. */
        }
+
+       if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+               ret = register_callback(CALLBACKTYPE_syscall32,
+                                       xen_syscall32_target);
+               if (ret != 0)
+                       setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+       }
+#endif /* CONFIG_X86_64 */
 }
 
 void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
        if (!xen_feature(XENFEAT_auto_translated_physmap))
                HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
 
-       HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
-                                __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+       if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+           register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+               BUG();
 
        xen_enable_sysenter();
+       xen_enable_syscall();
 
        set_iopl.iopl = 1;
        rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
 
        pm_idle = xen_idle;
 
-#ifdef CONFIG_SMP
-       /* fill cpus_possible with all available cpus */
-       xen_fill_possible_map();
-#endif
-
        paravirt_disable_iospace();
 
        fiddle_vdso();
index 233156f..f702199 100644 (file)
@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
        int cpu = smp_processor_id();
 
        cpu_init();
+       preempt_disable();
+
        xen_enable_sysenter();
+       xen_enable_syscall();
 
-       preempt_disable();
-       per_cpu(cpu_state, cpu) = CPU_ONLINE;
+       cpu = smp_processor_id();
+       smp_store_cpu_info(cpu);
+       cpu_data(cpu).x86_max_cores = 1;
+       set_cpu_sibling_map(cpu);
 
        xen_setup_cpu_clockevents();
 
+       cpu_set(cpu, cpu_online_map);
+       x86_write_percpu(cpu_state, CPU_ONLINE);
+       wmb();
+
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
 
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
        return rc;
 }
 
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
 {
        int i, rc;
 
        for (i = 0; i < NR_CPUS; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-               if (rc >= 0)
+               if (rc >= 0) {
+                       num_processors++;
                        cpu_set(i, cpu_possible_map);
+               }
        }
 }
 
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_smp_prepare_boot_cpu(void)
 {
-       int cpu;
-
        BUG_ON(smp_processor_id() != 0);
        native_smp_prepare_boot_cpu();
 
        /* We've switched to the "real" per-cpu gdt, so make sure the
           old memory can be recycled */
-       make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
-       for_each_possible_cpu(cpu) {
-               cpus_clear(per_cpu(cpu_sibling_map, cpu));
-               /*
-                * cpu_core_map lives in a per cpu area that is cleared
-                * when the per cpu array is allocated.
-                *
-                * cpus_clear(per_cpu(cpu_core_map, cpu));
-                */
-       }
+       make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
 
        xen_setup_vcpu_info_placement();
 }
 
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
        unsigned cpu;
 
-       for_each_possible_cpu(cpu) {
-               cpus_clear(per_cpu(cpu_sibling_map, cpu));
-               /*
-                * cpu_core_ map will be zeroed when the per
-                * cpu area is allocated.
-                *
-                * cpus_clear(per_cpu(cpu_core_map, cpu));
-                */
-       }
-
        smp_store_cpu_info(0);
+       cpu_data(0).x86_max_cores = 1;
        set_cpu_sibling_map(0);
 
        if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
        struct vcpu_guest_context *ctxt;
-       struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+       struct desc_struct *gdt;
 
        if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
                return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        if (ctxt == NULL)
                return -ENOMEM;
 
+       gdt = get_cpu_gdt_table(cpu);
+
        ctxt->flags = VGCF_IN_KERNEL;
        ctxt->user_regs.ds = __USER_DS;
        ctxt->user_regs.es = __USER_DS;
-       ctxt->user_regs.fs = __KERNEL_PERCPU;
-       ctxt->user_regs.gs = 0;
        ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+       ctxt->user_regs.fs = __KERNEL_PERCPU;
+#endif
        ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
        ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 
        ctxt->ldt_ents = 0;
 
-       BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
-       make_lowmem_page_readonly(gdt->gdt);
+       BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+       make_lowmem_page_readonly(gdt);
 
-       ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
-       ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
+       ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+       ctxt->gdt_ents      = GDT_ENTRIES;
 
        ctxt->user_regs.cs = __KERNEL_CS;
        ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->kernel_ss = __KERNEL_DS;
        ctxt->kernel_sp = idle->thread.sp0;
 
+#ifdef CONFIG_X86_32
        ctxt->event_callback_cs     = __KERNEL_CS;
-       ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
        ctxt->failsafe_callback_cs  = __KERNEL_CS;
+#endif
+       ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
        ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
 
        per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        return 0;
 }
 
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
 {
        struct task_struct *idle = idle_task(cpu);
        int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
                return rc;
 #endif
 
+#ifdef CONFIG_X86_64
+       /* Allocate node local memory for AP pdas */
+       WARN_ON(cpu == 0);
+       if (cpu > 0) {
+               rc = get_local_pda(cpu);
+               if (rc)
+                       return rc;
+       }
+#endif
+
+#ifdef CONFIG_X86_32
        init_gdt(cpu);
        per_cpu(current_task, cpu) = idle;
        irq_ctx_init(cpu);
+#else
+       cpu_pda(cpu)->pcurrent = idle;
+       clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
        xen_setup_timer(cpu);
 
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+
        /* make sure interrupts start blocked */
        per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
 
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
        if (rc)
                return rc;
 
-       smp_store_cpu_info(cpu);
-       set_cpu_sibling_map(cpu);
-       /* This must be done before setting cpu_online_map */
-       wmb();
-
-       cpu_set(cpu, cpu_online_map);
-
        rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
        BUG_ON(rc);
 
+       while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+               HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+               barrier();
+       }
+
        return 0;
 }
 
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
 {
 }
 
@@ -335,12 +345,12 @@ static void stop_self(void *v)
        BUG();
 }
 
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
 {
        smp_call_function(stop_self, NULL, 0);
 }
 
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
 {
        xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
                xen_send_IPI_one(cpu, vector);
 }
 
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
 {
        int cpu;
 
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
        }
 }
 
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
 {
        xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
        irq_enter();
        generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
        __get_cpu_var(irq_stat).irq_call_count++;
+#else
+       add_pda(irq_call_count, 1);
+#endif
        irq_exit();
 
        return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
        irq_enter();
        generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
        __get_cpu_var(irq_stat).irq_call_count++;
+#else
+       add_pda(irq_call_count, 1);
+#endif
        irq_exit();
 
        return IRQ_HANDLED;
 }
+
+static const struct smp_ops xen_smp_ops __initdata = {
+       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+       .smp_prepare_cpus = xen_smp_prepare_cpus,
+       .cpu_up = xen_cpu_up,
+       .smp_cpus_done = xen_smp_cpus_done,
+
+       .smp_send_stop = xen_smp_send_stop,
+       .smp_send_reschedule = xen_smp_send_reschedule,
+
+       .send_call_func_ipi = xen_smp_send_call_function_ipi,
+       .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+       smp_ops = xen_smp_ops;
+       xen_fill_possible_map();
+}
index 251669a..2a234db 100644 (file)
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
                xen_cpu_initialized_map = cpu_online_map;
 #endif
                xen_vcpu_restore();
-               xen_timer_resume();
        }
 
 }
 
+void xen_arch_resume(void)
+{
+       /* nothing */
+}
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644 (file)
index 0000000..4038cbf
--- /dev/null
@@ -0,0 +1,271 @@
+/*
+       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+       The inline versions are the same as the direct-use versions, with the
+       pre- and post-amble chopped off.
+
+       This code is encoded for size rather than absolute efficiency,
+       with a view to being able to inline as much as possible.
+
+       We only bother with direct forms (ie, vcpu in pda) of the operations
+       here; the indirect forms are better handled in C, since they're
+       generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)    .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+       Enable events.  This clears the event mask and tests the pending
+       event status with one and operation.  If there are pending
+       events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+       /* Unmask events */
+       movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* Test for pending */
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+       jz 1f
+
+2:     call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+       ret
+       ENDPROC(xen_irq_enable_direct)
+       RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+       Disabling events is simply a matter of making the event mask
+       non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+       movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+       ret
+       ENDPROC(xen_irq_disable_direct)
+       RELOC(xen_irq_disable_direct, 0)
+
+/*
+       (xen_)save_fl is used to get the current interrupt enable status.
+       Callers expect the status to be in X86_EFLAGS_IF, and other bits
+       may be set in the return value.  We take advantage of this by
+       making sure that X86_EFLAGS_IF has the right value (and other bits
+       in that byte are 0), but other bits in the return value are
+       undefined.  We need to toggle the state of the bit, because
+       Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+       setz %ah
+       addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+       ret
+       ENDPROC(xen_save_fl_direct)
+       RELOC(xen_save_fl_direct, 0)
+
+/*
+       In principle the caller should be passing us a value return
+       from xen_save_fl_direct, but for robustness sake we test only
+       the X86_EFLAGS_IF flag rather than the whole byte. After
+       setting the interrupt mask state, it checks for unmasked
+       pending events and enters the hypervisor to get them delivered
+       if so.
+ */
+ENTRY(xen_restore_fl_direct)
+       testb $X86_EFLAGS_IF>>8, %ah
+       setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+       jz 1f
+2:     call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+       ret
+       ENDPROC(xen_restore_fl_direct)
+       RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+       Force an event check by making a hypercall,
+       but preserve regs before making the call.
+ */
+check_events:
+       push %rax
+       push %rcx
+       push %rdx
+       push %rsi
+       push %rdi
+       push %r8
+       push %r9
+       push %r10
+       push %r11
+       call force_evtchn_callback
+       pop %r11
+       pop %r10
+       pop %r9
+       pop %r8
+       pop %rdi
+       pop %rsi
+       pop %rdx
+       pop %rcx
+       pop %rax
+       ret
+#endif
+
+ENTRY(xen_adjust_exception_frame)
+       mov 8+0(%rsp),%rcx
+       mov 8+8(%rsp),%r11
+       ret $16
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+       Xen64 iret frame:
+
+       ss
+       rsp
+       rflags
+       cs
+       rip             <-- standard iret frame
+
+       flags
+
+       rcx             }
+       r11             }<-- pushed by hypercall page
+rsp -> rax             }
+ */
+ENTRY(xen_iret)
+       pushq $0
+1:     jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+       sysexit is not used for 64-bit processes, so it's
+       only ever used to return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+       pushq $__USER32_DS
+       pushq %rcx
+       pushq $X86_EFLAGS_IF
+       pushq $__USER32_CS
+       pushq %rdx
+
+       pushq $VGCF_in_syscall
+1:     jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+       /* We're already on the usermode stack at this point, but still
+          with the kernel gs, so we can easily switch back */
+       movq %rsp, %gs:pda_oldrsp
+       movq %gs:pda_kernelstack,%rsp
+
+       pushq $__USER_DS
+       pushq %gs:pda_oldrsp
+       pushq %r11
+       pushq $__USER_CS
+       pushq %rcx
+
+       pushq $VGCF_in_syscall
+1:     jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+       /* We're already on the usermode stack at this point, but still
+          with the kernel gs, so we can easily switch back */
+       movq %rsp, %gs:pda_oldrsp
+       movq %gs:pda_kernelstack, %rsp
+
+       pushq $__USER32_DS
+       pushq %gs:pda_oldrsp
+       pushq %r11
+       pushq $__USER32_CS
+       pushq %rcx
+
+       pushq $VGCF_in_syscall
+1:     jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+       Xen handles syscall callbacks much like ordinary exceptions,
+       which means we have:
+        - kernel gs
+        - kernel rsp
+        - an iret-like stack frame on the stack (including rcx and r11):
+               ss
+               rsp
+               rflags
+               cs
+               rip
+               r11
+       rsp->   rcx
+
+       In all the entrypoints, we undo all that to make it look
+       like a CPU-generated syscall/sysenter and jump to the normal
+       entrypoint.
+ */
+
+.macro undo_xen_syscall
+       mov 0*8(%rsp),%rcx
+       mov 1*8(%rsp),%r11
+       mov 5*8(%rsp),%rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+       undo_xen_syscall
+       jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+       undo_xen_syscall
+       jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+       undo_xen_syscall
+       jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+       lea 16(%rsp), %rsp      /* strip %rcx,%r11 */
+       mov $-ENOSYS, %rax
+       pushq $VGCF_in_syscall
+       jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
index 7c0cf63..63d49a5 100644 (file)
@@ -5,15 +5,24 @@
 
 #include <linux/elfnote.h>
 #include <linux/init.h>
+
 #include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
 #include <xen/interface/elfnote.h>
 #include <asm/xen/interface.h>
 
        __INIT
 ENTRY(startup_xen)
-       movl %esi,xen_start_info
        cld
-       movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+       mov %esi,xen_start_info
+       mov $init_thread_union+THREAD_SIZE,%esp
+#else
+       mov %rsi,xen_start_info
+       mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
        jmp xen_start_kernel
 
        __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
 .pushsection .text
        .align PAGE_SIZE_asm
 ENTRY(hypercall_page)
-       .skip 0x1000
+       .skip PAGE_SIZE_asm
 .popsection
 
        ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
        ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
        ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
-       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
-       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
-       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
+#ifdef CONFIG_X86_32
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
        ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
        ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
        ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
        ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
                .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
        ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
-       ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long __HYPERVISOR_VIRT_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
 
 #endif /*CONFIG_XEN */
index 6f4b104..dd3c231 100644 (file)
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
 
 void xen_mark_init_mm_pinned(void);
 
-void __init xen_fill_possible_map(void);
-
 void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
 
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
 
 extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
 
 
 /* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
+/* These are not functions, and cannot be called normally */
 void xen_iret(void);
 void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
 
 #endif /* XEN_OPS_H */
index ef671d1..902bbe7 100644 (file)
@@ -92,7 +92,7 @@ struct netfront_info {
         */
        union skb_entry {
                struct sk_buff *skb;
-               unsigned link;
+               unsigned long link;
        } tx_skbs[NET_TX_RING_SIZE];
        grant_ref_t gref_tx_head;
        grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@@ -125,6 +125,17 @@ struct netfront_rx_info {
        struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
 };
 
+static void skb_entry_set_link(union skb_entry *list, unsigned short id)
+{
+       list->link = id;
+}
+
+static int skb_entry_is_link(const union skb_entry *list)
+{
+       BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
+       return ((unsigned long)list->skb < PAGE_OFFSET);
+}
+
 /*
  * Access macros for acquiring freeing slots in tx_skbs[].
  */
@@ -132,7 +143,7 @@ struct netfront_rx_info {
 static void add_id_to_freelist(unsigned *head, union skb_entry *list,
                               unsigned short id)
 {
-       list[id].link = *head;
+       skb_entry_set_link(&list[id], *head);
        *head = id;
 }
 
@@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np)
 
        for (i = 0; i < NET_TX_RING_SIZE; i++) {
                /* Skip over entries which are actually freelist references */
-               if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
+               if (skb_entry_is_link(&np->tx_skbs[i]))
                        continue;
 
                skb = np->tx_skbs[i].skb;
@@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev
        /* Initialise tx_skbs as a free chain containing every entry. */
        np->tx_skb_freelist = 0;
        for (i = 0; i < NET_TX_RING_SIZE; i++) {
-               np->tx_skbs[i].link = i+1;
+               skb_entry_set_link(&np->tx_skbs[i], i+1);
                np->grant_tx_ref[i] = GRANT_INVALID_REF;
        }
 
index 5b546e3..a5bc91a 100644 (file)
@@ -63,11 +63,12 @@ static int xen_suspend(void *data)
        gnttab_resume();
        xen_mm_unpin_all();
 
-       device_power_up();
+       device_power_up(PMSG_RESUME);
 
        if (!*cancelled) {
                xen_irq_resume();
                xen_console_resume();
+               xen_timer_resume();
        }
 
        return 0;
@@ -107,12 +108,13 @@ static void do_suspend(void)
                goto out;
        }
 
-       if (!cancelled)
+       if (!cancelled) {
+               xen_arch_resume();
                xenbus_resume();
-       else
+       else
                xenbus_suspend_cancel();
 
-       device_resume();
+       device_resume(PMSG_RESUME);
 
        /* Make sure timer events get retriggered on all CPUs */
        clock_was_set();
index ef5e8ec..eef8095 100644 (file)
@@ -1396,8 +1396,8 @@ extern struct paravirt_patch_site __parainstructions[],
  * caller saved registers but the argument parameter */
 #define PV_SAVE_REGS "pushq %%rdi;"
 #define PV_RESTORE_REGS "popq %%rdi;"
-#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
-#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
 #define PV_FLAGS_ARG "D"
 #endif
 
@@ -1489,8 +1489,26 @@ static inline unsigned long __raw_local_irq_save(void)
 
 
 #ifdef CONFIG_X86_64
-#define PV_SAVE_REGS   pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
-#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
+#define PV_SAVE_REGS                           \
+       push %rax;                              \
+       push %rcx;                              \
+       push %rdx;                              \
+       push %rsi;                              \
+       push %rdi;                              \
+       push %r8;                               \
+       push %r9;                               \
+       push %r10;                              \
+       push %r11
+#define PV_RESTORE_REGS                                \
+       pop %r11;                               \
+       pop %r10;                               \
+       pop %r9;                                \
+       pop %r8;                                \
+       pop %rdi;                               \
+       pop %rsi;                               \
+       pop %rdx;                               \
+       pop %rcx;                               \
+       pop %rax
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
 #define PARA_INDIRECT(addr)    *addr(%rip)
index 912a3a1..4e91ee1 100644 (file)
 
 DECLARE_PER_CPU(struct x8664_pda, pda);
 
+/*
+ * These are supposed to be implemented as a single instruction which
+ * operates on the per-cpu data base segment.  x86-64 doesn't have
+ * that yet, so this is a fairly inefficient workaround for the
+ * meantime.  The single instruction is atomic with respect to
+ * preemption and interrupts, so we need to explicitly disable
+ * interrupts here to achieve the same effect.  However, because it
+ * can be used from within interrupt-disable/enable, we can't actually
+ * disable interrupts; disabling preemption is enough.
+ */
+#define x86_read_percpu(var)                                           \
+       ({                                                              \
+               typeof(per_cpu_var(var)) __tmp;                         \
+               preempt_disable();                                      \
+               __tmp = __get_cpu_var(var);                             \
+               preempt_enable();                                       \
+               __tmp;                                                  \
+       })
+
+#define x86_write_percpu(var, val)                                     \
+       do {                                                            \
+               preempt_disable();                                      \
+               __get_cpu_var(var) = (val);                             \
+               preempt_enable();                                       \
+       } while(0)
+
 #else /* CONFIG_X86_64 */
 
 #ifdef __ASSEMBLY__
index 49cbd76..96aa76e 100644 (file)
@@ -302,6 +302,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
+#ifdef CONFIG_X86_32
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void native_pagetable_setup_start(pgd_t *base) {}
+static inline void native_pagetable_setup_done(pgd_t *base) {}
+#endif
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -333,6 +341,16 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
 #define pte_update(mm, addr, ptep)              do { } while (0)
 #define pte_update_defer(mm, addr, ptep)        do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+       native_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+       native_pagetable_setup_done(base);
+}
 #endif /* CONFIG_PARAVIRT */
 
 #endif /* __ASSEMBLY__ */
index ec871c4..0611abf 100644 (file)
@@ -171,21 +171,6 @@ do {                                               \
  */
 #define update_mmu_cache(vma, address, pte) do { } while (0)
 
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
-
-#ifndef CONFIG_PARAVIRT
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
-       native_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
-       native_pagetable_setup_done(base);
-}
-#endif /* !CONFIG_PARAVIRT */
-
 #endif /* !__ASSEMBLY__ */
 
 /*
index fa7208b..805d312 100644 (file)
@@ -16,6 +16,8 @@
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
 extern pgd_t init_level4_pgt[];
 
 #define swapper_pg_dir init_level4_pgt
index 90ab222..6594926 100644 (file)
@@ -76,6 +76,7 @@ extern unsigned long init_pg_tables_start;
 extern unsigned long init_pg_tables_end;
 
 #else
+void __init x86_64_init_pda(void);
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
index c2784b3..3c877f7 100644 (file)
@@ -25,6 +25,8 @@ extern cpumask_t cpu_callin_map;
 extern void (*mtrr_hook)(void);
 extern void zap_low_mappings(void);
 
+extern int __cpuinit get_local_pda(int cpu);
+
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 extern cpumask_t cpu_initialized;
index 86e085e..8e18fb8 100644 (file)
@@ -36,4 +36,12 @@ extern const char VDSO32_PRELINK[];
 extern void __user __kernel_sigreturn;
 extern void __user __kernel_rt_sigreturn;
 
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_int80_start, vdso32_int80_end;
+extern const char vdso32_syscall_start, vdso32_syscall_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+
 #endif /* asm-x86/vdso.h */
index 2a4f9b4..91cb7fd 100644 (file)
 #include <xen/interface/sched.h>
 #include <xen/interface/physdev.h>
 
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ *    The two architectures put their arguments in different sets of
+ *    registers.
+ *
+ * - Work around asm syntax quirks
+ *    It isn't possible to specify one of the rNN registers in a
+ *    constraint, so we use explicit register variables to get the
+ *    args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ *    Even unused parameters can be clobbered by the hypervisor, so we
+ *    need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ *    This is the tricky part.  Because x86_32 has such a constrained
+ *    register set, gcc versions below 4.3 have trouble generating
+ *    code when all the arg registers and memory are trashed by the
+ *    asm.  There are syntactically simpler ways of achieving the
+ *    semantics below, but they cause the compiler to crash.
+ *
+ *    The only combination I found which works is:
+ *     - assign the __argX variables first
+ *     - list all actually used parameters as "+r" (__argX)
+ *     - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as as macro language.  Sorry.  (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
+
 extern struct { char _entry[32]; } hypercall_page[];
 
+#define __HYPERCALL            "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x)                                           \
+       [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG     "eax"
+#define __HYPERCALL_ARG1REG    "ebx"
+#define __HYPERCALL_ARG2REG    "ecx"
+#define __HYPERCALL_ARG3REG    "edx"
+#define __HYPERCALL_ARG4REG    "esi"
+#define __HYPERCALL_ARG5REG    "edi"
+#else
+#define __HYPERCALL_RETREG     "rax"
+#define __HYPERCALL_ARG1REG    "rdi"
+#define __HYPERCALL_ARG2REG    "rsi"
+#define __HYPERCALL_ARG3REG    "rdx"
+#define __HYPERCALL_ARG4REG    "r10"
+#define __HYPERCALL_ARG5REG    "r8"
+#endif
+
+#define __HYPERCALL_DECLS                                              \
+       register unsigned long __res  asm(__HYPERCALL_RETREG);          \
+       register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+       register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+       register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+       register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+       register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM     "=r" (__res)
+#define __HYPERCALL_1PARAM     __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM     __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM     __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM     __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM     __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1)                                           \
+       __HYPERCALL_0ARG()              __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2)                                                \
+       __HYPERCALL_1ARG(a1)            __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3)                                     \
+       __HYPERCALL_2ARG(a1,a2)         __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4)                                  \
+       __HYPERCALL_3ARG(a1,a2,a3)      __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5)                               \
+       __HYPERCALL_4ARG(a1,a2,a3,a4)   __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5   "memory"
+#define __HYPERCALL_CLOBBER4   __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3   __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2   __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1   __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0   __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
+
 #define _hypercall0(type, name)                                                \
 ({                                                                     \
-       long __res;                                                     \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res)                                          \
-               : [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_0ARG();                                             \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_0PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER0);                          \
        (type)__res;                                                    \
 })
 
 #define _hypercall1(type, name, a1)                                    \
 ({                                                                     \
-       long __res, __ign1;                                             \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1)                           \
-               : "1" ((long)(a1)),                                     \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_1ARG(a1);                                           \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_1PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER1);                          \
        (type)__res;                                                    \
 })
 
 #define _hypercall2(type, name, a1, a2)                                        \
 ({                                                                     \
-       long __res, __ign1, __ign2;                                     \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2)            \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_2ARG(a1, a2);                                       \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_2PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER2);                          \
        (type)__res;                                                    \
 })
 
 #define _hypercall3(type, name, a1, a2, a3)                            \
 ({                                                                     \
-       long __res, __ign1, __ign2, __ign3;                             \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),           \
-               "=d" (__ign3)                                           \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 "3" ((long)(a3)),                                     \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_3ARG(a1, a2, a3);                                   \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_3PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER3);                          \
        (type)__res;                                                    \
 })
 
 #define _hypercall4(type, name, a1, a2, a3, a4)                                \
 ({                                                                     \
-       long __res, __ign1, __ign2, __ign3, __ign4;                     \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),           \
-               "=d" (__ign3), "=S" (__ign4)                            \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 "3" ((long)(a3)), "4" ((long)(a4)),                   \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_4ARG(a1, a2, a3, a4);                               \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_4PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER4);                          \
        (type)__res;                                                    \
 })
 
 #define _hypercall5(type, name, a1, a2, a3, a4, a5)                    \
 ({                                                                     \
-       long __res, __ign1, __ign2, __ign3, __ign4, __ign5;             \
-       asm volatile (                                                  \
-               "call %[call]"                                          \
-               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),           \
-               "=d" (__ign3), "=S" (__ign4), "=D" (__ign5)             \
-               : "1" ((long)(a1)), "2" ((long)(a2)),                   \
-                 "3" ((long)(a3)), "4" ((long)(a4)),                   \
-                 "5" ((long)(a5)),                                     \
-                 [call] "m" (hypercall_page[__HYPERVISOR_##name])      \
-               : "memory" );                                           \
+       __HYPERCALL_DECLS;                                              \
+       __HYPERCALL_5ARG(a1, a2, a3, a4, a5);                           \
+       asm volatile (__HYPERCALL                                       \
+                     : __HYPERCALL_5PARAM                              \
+                     : __HYPERCALL_ENTRY(name)                         \
+                     : __HYPERCALL_CLOBBER5);                          \
        (type)__res;                                                    \
 })
 
@@ -152,6 +226,7 @@ HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
        return _hypercall2(int, stack_switch, ss, esp);
 }
 
+#ifdef CONFIG_X86_32
 static inline int
 HYPERVISOR_set_callbacks(unsigned long event_selector,
                         unsigned long event_address,
@@ -162,6 +237,17 @@ HYPERVISOR_set_callbacks(unsigned long event_selector,
                           event_selector, event_address,
                           failsafe_selector, failsafe_address);
 }
+#else  /* CONFIG_X86_64 */
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_address,
+                       unsigned long failsafe_address,
+                       unsigned long syscall_address)
+{
+       return _hypercall3(int, set_callbacks,
+                          event_address, failsafe_address,
+                          syscall_address);
+}
+#endif  /* CONFIG_X86_{32,64} */
 
 static inline int
 HYPERVISOR_callback_op(int cmd, void *arg)
@@ -223,12 +309,12 @@ static inline int
 HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
                             unsigned long flags)
 {
-       unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
-       pte_hi = new_val.pte_high;
-#endif
-       return _hypercall4(int, update_va_mapping, va,
-                          new_val.pte_low, pte_hi, flags);
+       if (sizeof(new_val) == sizeof(long))
+               return _hypercall3(int, update_va_mapping, va,
+                                  new_val.pte, flags);
+       else
+               return _hypercall4(int, update_va_mapping, va,
+                                  new_val.pte, new_val.pte >> 32, flags);
 }
 
 static inline int
@@ -281,12 +367,13 @@ static inline int
 HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
                                         unsigned long flags, domid_t domid)
 {
-       unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
-       pte_hi = new_val.pte_high;
-#endif
-       return _hypercall5(int, update_va_mapping_otherdomain, va,
-                          new_val.pte_low, pte_hi, flags, domid);
+       if (sizeof(new_val) == sizeof(long))
+               return _hypercall4(int, update_va_mapping_otherdomain, va,
+                                  new_val.pte, flags, domid);
+       else
+               return _hypercall5(int, update_va_mapping_otherdomain, va,
+                                  new_val.pte, new_val.pte >> 32,
+                                  flags, domid);
 }
 
 static inline int
@@ -301,6 +388,14 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
        return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
 }
 
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+       return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
 static inline int
 HYPERVISOR_suspend(unsigned long srec)
 {
@@ -327,14 +422,14 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
 {
        mcl->op = __HYPERVISOR_update_va_mapping;
        mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = new_val.pte_high;
-#else
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = 0;
-#endif
-       mcl->args[3] = flags;
+       if (sizeof(new_val) == sizeof(long)) {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = flags;
+       } else {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = new_val.pte >> 32;
+               mcl->args[3] = flags;
+       }
 }
 
 static inline void
@@ -354,15 +449,16 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
 {
        mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = new_val.pte_high;
-#else
-       mcl->args[1] = new_val.pte_low;
-       mcl->args[2] = 0;
-#endif
-       mcl->args[3] = flags;
-       mcl->args[4] = domid;
+       if (sizeof(new_val) == sizeof(long)) {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = flags;
+               mcl->args[3] = domid;
+       } else {
+               mcl->args[1] = new_val.pte;
+               mcl->args[2] = new_val.pte >> 32;
+               mcl->args[3] = flags;
+               mcl->args[4] = domid;
+       }
 }
 
 static inline void
@@ -370,10 +466,15 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
                        struct desc_struct desc)
 {
        mcl->op = __HYPERVISOR_update_descriptor;
-       mcl->args[0] = maddr;
-       mcl->args[1] = maddr >> 32;
-       mcl->args[2] = desc.a;
-       mcl->args[3] = desc.b;
+       if (sizeof(maddr) == sizeof(long)) {
+               mcl->args[0] = maddr;
+               mcl->args[1] = *(unsigned long *)&desc;
+       } else {
+               mcl->args[0] = maddr;
+               mcl->args[1] = maddr >> 32;
+               mcl->args[2] = desc.a;
+               mcl->args[3] = desc.b;
+       }
 }
 
 static inline void
index 6227000..9d810f2 100644 (file)
@@ -1,13 +1,13 @@
 /******************************************************************************
  * arch-x86_32.h
  *
- * Guest OS interface to x86 32-bit Xen.
+ * Guest OS interface to x86 Xen.
  *
  * Copyright (c) 2004, K A Fraser
  */
 
-#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
-#define __XEN_PUBLIC_ARCH_X86_32_H__
+#ifndef __ASM_X86_XEN_INTERFACE_H
+#define __ASM_X86_XEN_INTERFACE_H
 
 #ifdef __XEN__
 #define __DEFINE_GUEST_HANDLE(name, type) \
@@ -57,6 +57,17 @@ DEFINE_GUEST_HANDLE(long);
 DEFINE_GUEST_HANDLE(void);
 #endif
 
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
 /*
  * SEGMENT DESCRIPTOR TABLES
  */
@@ -70,59 +81,22 @@ DEFINE_GUEST_HANDLE(void);
 #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
 #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
 
-/*
- * These flat segments are in the Xen-private section of every GDT. Since these
- * are also present in the initial GDT, many OSes will be able to avoid
- * installing their own GDT.
- */
-#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
-#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
-#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
-#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
-#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
-#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
-
-#define FLAT_KERNEL_CS FLAT_RING1_CS
-#define FLAT_KERNEL_DS FLAT_RING1_DS
-#define FLAT_KERNEL_SS FLAT_RING1_SS
-#define FLAT_USER_CS    FLAT_RING3_CS
-#define FLAT_USER_DS    FLAT_RING3_DS
-#define FLAT_USER_SS    FLAT_RING3_SS
-
-/* And the trap vector is... */
-#define TRAP_INSTR "int $0x82"
-
-/*
- * Virtual addresses beyond this are not modifiable by guest OSes. The
- * machine->physical mapping table starts at this address, read-only.
- */
-#ifdef CONFIG_X86_PAE
-#define __HYPERVISOR_VIRT_START 0xF5800000
-#else
-#define __HYPERVISOR_VIRT_START 0xFC000000
-#endif
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#endif
-
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
-
-/* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 32
-
-#ifndef __ASSEMBLY__
-
 /*
  * Send an array of these to HYPERVISOR_set_trap_table()
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ *  Level == 0: Noone may enter
+ *  Level == 1: Kernel may enter
+ *  Level == 2: Kernel may enter
+ *  Level == 3: Everyone may enter
  */
 #define TI_GET_DPL(_ti)                ((_ti)->flags & 3)
 #define TI_GET_IF(_ti)         ((_ti)->flags & 4)
 #define TI_SET_DPL(_ti, _dpl)  ((_ti)->flags |= (_dpl))
 #define TI_SET_IF(_ti, _if)    ((_ti)->flags |= ((!!(_if))<<2))
 
+#ifndef __ASSEMBLY__
 struct trap_info {
     uint8_t       vector;  /* exception vector                              */
     uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
@@ -131,32 +105,21 @@ struct trap_info {
 };
 DEFINE_GUEST_HANDLE_STRUCT(trap_info);
 
-struct cpu_user_regs {
-    uint32_t ebx;
-    uint32_t ecx;
-    uint32_t edx;
-    uint32_t esi;
-    uint32_t edi;
-    uint32_t ebp;
-    uint32_t eax;
-    uint16_t error_code;    /* private */
-    uint16_t entry_vector;  /* private */
-    uint32_t eip;
-    uint16_t cs;
-    uint8_t  saved_upcall_mask;
-    uint8_t  _pad0;
-    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
-    uint32_t esp;
-    uint16_t ss, _pad1;
-    uint16_t es, _pad2;
-    uint16_t ds, _pad3;
-    uint16_t fs, _pad4;
-    uint16_t gs, _pad5;
+struct arch_shared_info {
+    unsigned long max_pfn;                  /* max pfn that appears in table */
+    /* Frame containing list of mfns containing list of mfns containing p2m. */
+    unsigned long pfn_to_mfn_frame_list_list;
+    unsigned long nmi_reason;
 };
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+#endif /* !__ASSEMBLY__ */
 
-typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+#ifdef CONFIG_X86_32
+#include "interface_32.h"
+#else
+#include "interface_64.h"
+#endif
 
+#ifndef __ASSEMBLY__
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -173,33 +136,29 @@ struct vcpu_guest_context {
     unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
     unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
     unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
+    /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
     unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
     unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
+#ifdef __i386__
     unsigned long event_callback_cs;        /* CS:EIP of event callback     */
     unsigned long event_callback_eip;
     unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
     unsigned long failsafe_callback_eip;
+#else
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_eip;
+    unsigned long syscall_callback_eip;
+#endif
     unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+    /* Segment base addresses. */
+    uint64_t      fs_base;
+    uint64_t      gs_base_kernel;
+    uint64_t      gs_base_user;
+#endif
 };
 DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
-
-struct arch_shared_info {
-    unsigned long max_pfn;                  /* max pfn that appears in table */
-    /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
-    unsigned long nmi_reason;
-};
-
-struct arch_vcpu_info {
-    unsigned long cr2;
-    unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
-};
-
-struct xen_callback {
-       unsigned long cs;
-       unsigned long eip;
-};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLY__ */
 
 /*
  * Prefix forces emulation of some non-trapping instructions.
@@ -213,4 +172,4 @@ struct xen_callback {
 #define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
 #endif
 
-#endif
+#endif /* __ASM_X86_XEN_INTERFACE_H */
diff --git a/include/asm-x86/xen/interface_32.h b/include/asm-x86/xen/interface_32.h
new file mode 100644 (file)
index 0000000..d8ac41d
--- /dev/null
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __ASM_X86_XEN_INTERFACE_32_H
+#define __ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS    FLAT_RING3_CS
+#define FLAT_USER_DS    FLAT_RING3_DS
+#define FLAT_USER_SS    FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+    uint32_t ebx;
+    uint32_t ecx;
+    uint32_t edx;
+    uint32_t esi;
+    uint32_t edi;
+    uint32_t ebp;
+    uint32_t eax;
+    uint16_t error_code;    /* private */
+    uint16_t entry_vector;  /* private */
+    uint32_t eip;
+    uint16_t cs;
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad0;
+    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
+    uint32_t esp;
+    uint16_t ss, _pad1;
+    uint16_t es, _pad2;
+    uint16_t ds, _pad3;
+    uint16_t fs, _pad4;
+    uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+       unsigned long cs;
+       unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip)                              \
+       ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+#endif /* __ASM_X86_XEN_INTERFACE_32_H */
diff --git a/include/asm-x86/xen/interface_64.h b/include/asm-x86/xen/interface_64.h
new file mode 100644 (file)
index 0000000..842266c
--- /dev/null
@@ -0,0 +1,159 @@
+#ifndef __ASM_X86_XEN_INTERFACE_64_H
+#define __ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000  /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS   FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS   FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS   FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START  0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END    0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ *  @which == SEGBASE_*  ;  @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS          0
+#define SEGBASE_GS_USER     1
+#define SEGBASE_GS_KERNEL   2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ *   RING0 -> RING3 kernel mode.
+ *   RING1 -> RING3 kernel mode.
+ *   RING2 -> RING3 kernel mode.
+ *   RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to iteself
+ * directly with
+ *      orb   $3,1*8(%rsp)
+ *      iretq
+ * If flags contains VGCF_in_syscall:
+ *   Restore RAX, RIP, RFLAGS, RSP.
+ *   Discard R11, RCX, CS, SS.
+ * Otherwise:
+ *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall  (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL  VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+    /* Top of stack (%rsp at point of hypercall). */
+    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+    /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+    uint64_t r ## name, e ## name; \
+    uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+    uint64_t r15;
+    uint64_t r14;
+    uint64_t r13;
+    uint64_t r12;
+    __DECL_REG(bp);
+    __DECL_REG(bx);
+    uint64_t r11;
+    uint64_t r10;
+    uint64_t r9;
+    uint64_t r8;
+    __DECL_REG(ax);
+    __DECL_REG(cx);
+    __DECL_REG(dx);
+    __DECL_REG(si);
+    __DECL_REG(di);
+    uint32_t error_code;    /* private */
+    uint32_t entry_vector;  /* private */
+    __DECL_REG(ip);
+    uint16_t cs, _pad0[1];
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad1[3];
+    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
+    __DECL_REG(sp);
+    uint16_t ss, _pad2[3];
+    uint16_t es, _pad3[3];
+    uint16_t ds, _pad4[3];
+    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
+    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip)                              \
+       ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* __ASM_X86_XEN_INTERFACE_64_H */
index 377c045..05e678a 100644 (file)
@@ -148,13 +148,17 @@ static inline pte_t __pte_ma(pteval_t x)
 }
 
 #define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
 #define pud_val_ma(v) ((v).pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
 #define __pmd_ma(x)    ((pmd_t) { (x) } )
 
 #define pgd_val_ma(x)  ((x).pgd)
 
 
-xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+xmaddr_t arbitrary_virt_to_machine(void *address);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
index 98b79bc..c3adde3 100644 (file)
@@ -5,11 +5,12 @@ extern struct console xenboot_console;
 
 #ifdef CONFIG_HVC_XEN
 void xen_console_resume(void);
+void xen_raw_console_write(const char *str);
+void xen_raw_printk(const char *fmt, ...);
 #else
 static inline void xen_console_resume(void) { }
+static inline void xen_raw_console_write(const char *str) { }
+static inline void xen_raw_printk(const char *fmt, ...) { }
 #endif
 
-void xen_raw_console_write(const char *str);
-void xen_raw_printk(const char *fmt, ...);
-
 #endif /* XEN_HVC_CONSOLE_H */
index 4aadcba..2ae3cd2 100644 (file)
@@ -82,9 +82,9 @@
  */
 #define CALLBACKOP_register                0
 struct callback_register {
-    uint16_t type;
-    uint16_t flags;
-    struct xen_callback address;
+       uint16_t type;
+       uint16_t flags;
+       xen_callback_t address;
 };
 
 /*
index a706d6a..883a21b 100644 (file)
@@ -11,4 +11,7 @@ void xen_post_suspend(int suspend_cancelled);
 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
 
+void xen_timer_resume(void);
+void xen_arch_resume(void);
+
 #endif /* INCLUDE_XEN_OPS_H */
index b45da40..59dfdf1 100644 (file)
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
 
 config PM_SLEEP
        bool
-       depends on SUSPEND || HIBERNATION
+       depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
        default y
 
 config SUSPEND