Merge branch 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip...
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 21 Jul 2008 17:34:25 +0000 (10:34 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 21 Jul 2008 17:34:25 +0000 (10:34 -0700)
* 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (160 commits)
  x86: remove extra calling to get ext cpuid level
  x86: use setup_clear_cpu_cap() when disabling the lapic
  KVM: fix exception entry / build bug, on 64-bit
  x86: add unknown_nmi_panic kernel parameter
  x86, VisWS: turn into generic arch, eliminate leftover files
  x86: add ->pre_time_init to x86_quirks
  x86: extend and use x86_quirks to clean up NUMAQ code
  x86: introduce x86_quirks
  x86: improve debug printout: add target bootmem range in early_res_to_bootmem()
  Subject: devmem, x86: fix rename of CONFIG_NONPROMISC_DEVMEM
  x86: remove arch_get_ram_range
  x86: Add a debugfs interface to dump PAT memtype
  x86: Add a arch directory for x86 under debugfs
  x86: i386: reduce boot fixmap space
  i386/xen: add proper unwind annotations to xen_sysenter_target
  x86: reduce force_mwait visibility
  x86: reduce forbid_dac's visibility
  x86: fix two modpost warnings
  x86: check function status in EDD boot code
  x86_64: ia32_signal.c: remove signal number conversion
  ...

152 files changed:
Documentation/kernel-parameters.txt
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Kconfig.debug
arch/x86/boot/edd.c
arch/x86/boot/pm.c
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/ia32/ia32_signal.c
arch/x86/ia32/ia32entry.S
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/sleep.c
arch/x86/kernel/amd_iommu.c
arch/x86/kernel/amd_iommu_init.c
arch/x86/kernel/aperture_64.c
arch/x86/kernel/apic_32.c
arch/x86/kernel/apic_64.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/bios_uv.c [new file with mode: 0644]
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/amd_64.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common_64.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/p4.c
arch/x86/kernel/e820.c
arch/x86/kernel/early-quirks.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/genx2apic_uv_x.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/io_apic_32.c
arch/x86/kernel/io_apic_64.c
arch/x86/kernel/io_delay.c
arch/x86/kernel/ipi.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/kdebugfs.c
arch/x86/kernel/kprobes.c
arch/x86/kernel/module_64.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/nmi.c
arch/x86/kernel/numaq_32.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/pci-calgary_64.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kernel/pci-nommu.c
arch/x86/kernel/pci-swiotlb_64.c
arch/x86/kernel/process.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal_32.c
arch/x86/kernel/signal_64.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/smpcommon_32.c [deleted file]
arch/x86/kernel/step.c
arch/x86/kernel/time_32.c
arch/x86/kernel/traps_32.c
arch/x86/kernel/traps_64.c
arch/x86/kernel/visws_quirks.c
arch/x86/kernel/vmi_32.c
arch/x86/lguest/boot.c
arch/x86/mach-default/setup.c
arch/x86/mm/Makefile
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/memtest.c [new file with mode: 0644]
arch/x86/mm/pat.c
arch/x86/pci/Makefile
arch/x86/pci/legacy.c
arch/x86/pci/numaq_32.c [moved from arch/x86/pci/numa.c with 97% similarity]
arch/x86/pci/pci.h
arch/x86/pci/visws.c
arch/x86/vdso/Makefile
arch/x86/vdso/vdso32-setup.c
arch/x86/vdso/vdso32.S
arch/x86/vdso/vma.c
arch/x86/xen/Kconfig
arch/x86/xen/Makefile
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/mmu.h
arch/x86/xen/multicalls.c
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/xen-asm_32.S [moved from arch/x86/xen/xen-asm.S with 100% similarity]
arch/x86/xen/xen-asm_64.S [new file with mode: 0644]
arch/x86/xen/xen-head.S
arch/x86/xen/xen-ops.h
drivers/char/mem.c
drivers/net/xen-netfront.c
drivers/pci/intel-iommu.c
drivers/xen/events.c
drivers/xen/manage.c
include/asm-x86/amd_iommu_types.h
include/asm-x86/apic.h
include/asm-x86/arch_hooks.h
include/asm-x86/bitops.h
include/asm-x86/calling.h
include/asm-x86/cpufeature.h
include/asm-x86/dma-mapping.h
include/asm-x86/e820.h
include/asm-x86/fixmap_32.h
include/asm-x86/ftrace.h
include/asm-x86/gart.h
include/asm-x86/iommu.h
include/asm-x86/kvm_host.h
include/asm-x86/mach-bigsmp/mach_apic.h
include/asm-x86/mach-default/mach_apic.h
include/asm-x86/mach-es7000/mach_apic.h
include/asm-x86/mach-generic/mach_mpspec.h
include/asm-x86/mach-summit/mach_apic.h
include/asm-x86/mach-visws/entry_arch.h [deleted file]
include/asm-x86/mach-visws/mach_apic.h [deleted file]
include/asm-x86/mach-visws/mach_apicdef.h [deleted file]
include/asm-x86/mach-visws/setup_arch.h [deleted file]
include/asm-x86/mach-visws/smpboot_hooks.h [deleted file]
include/asm-x86/paravirt.h
include/asm-x86/percpu.h
include/asm-x86/pgtable.h
include/asm-x86/pgtable_32.h
include/asm-x86/pgtable_64.h
include/asm-x86/processor.h
include/asm-x86/ptrace-abi.h
include/asm-x86/segment.h
include/asm-x86/setup.h
include/asm-x86/signal.h
include/asm-x86/smp.h
include/asm-x86/spinlock.h
include/asm-x86/spinlock_types.h
include/asm-x86/swiotlb.h
include/asm-x86/thread_info.h
include/asm-x86/traps.h [new file with mode: 0644]
include/asm-x86/uv/bios.h [new file with mode: 0644]
include/asm-x86/vdso.h
include/asm-x86/xen/events.h
include/asm-x86/xen/hypercall.h
include/asm-x86/xen/interface.h
include/asm-x86/xen/interface_32.h [new file with mode: 0644]
include/asm-x86/xen/interface_64.h [new file with mode: 0644]
include/asm-x86/xen/page.h
include/linux/debugfs.h
include/xen/events.h
include/xen/hvc-console.h
include/xen/interface/callback.h
include/xen/xen-ops.h
kernel/power/Kconfig

index 09ad745..25e88cf 100644 (file)
@@ -1206,7 +1206,7 @@ and is between 256 and 4096 characters. It is defined in the file
                                 or
                                 memmap=0x10000$0x18690000
 
-       memtest=        [KNL,X86_64] Enable memtest
+       memtest=        [KNL,X86] Enable memtest
                        Format: <integer>
                        range: 0,4 : pattern number
                        default : 0 <disable>
@@ -2158,6 +2158,10 @@ and is between 256 and 4096 characters. It is defined in the file
                        Note that genuine overcurrent events won't be
                        reported either.
 
+       unknown_nmi_panic
+                       [X86-32,X86-64]
+                       Set unknown_nmi_panic=1 early on boot.
+
        usbcore.autosuspend=
                        [USB] The autosuspend time delay (in seconds) used
                        for newly-detected USB devices (default 2).  This
index 96e0c2e..03980cb 100644 (file)
@@ -447,7 +447,6 @@ config PARAVIRT_DEBUG
 
 config MEMTEST
        bool "Memtest"
-       depends on X86_64
        help
          This option adds a kernel parameter 'memtest', which allows memtest
          to be set.
index abff1b8..54b8c02 100644 (file)
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
        def_bool y
        depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
 
-config X86_GOOD_APIC
-       def_bool y
-       depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
-
 config X86_INTEL_USERCOPY
        def_bool y
        depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
index ae36bfa..85a87d2 100644 (file)
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
 
 source "lib/Kconfig.debug"
 
-config NONPROMISC_DEVMEM
+config STRICT_DEVMEM
        bool "Filter access to /dev/mem"
        help
-         If this option is left off, you allow userspace access to all
+         If this option is left on, you allow userspace (root) access to all
          of memory, including kernel and userspace memory. Accidental
          access to this is obviously disastrous, but specific access can
-         be used by people debugging the kernel.
+         be used by people debugging the kernel. Note that with PAT support
+         enabled, even in this case there are restrictions on /dev/mem
+         use due to the cache aliasing requirements.
 
          If this option is switched on, the /dev/mem file only allows
          userspace access to PCI space and the BIOS code and data regions.
@@ -287,7 +289,6 @@ config CPA_DEBUG
 
 config OPTIMIZE_INLINING
        bool "Allow gcc to uninline functions marked 'inline'"
-       depends on BROKEN
        help
          This option determines if the kernel forces gcc to inline the functions
          developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -298,5 +299,7 @@ config OPTIMIZE_INLINING
          become the default in the future, until then this option is there to
          test gcc for this.
 
+         If unsure, say N.
+
 endmenu
 
index 03399d6..d93cbc6 100644 (file)
@@ -167,9 +167,8 @@ void query_edd(void)
                 * Scan the BIOS-supported hard disks and query EDD
                 * information...
                 */
-               get_edd_info(devno, &ei);
-
-               if (boot_params.eddbuf_entries < EDDMAXNR) {
+               if (!get_edd_info(devno, &ei)
+                   && boot_params.eddbuf_entries < EDDMAXNR) {
                        memcpy(edp, &ei, sizeof ei);
                        edp++;
                        boot_params.eddbuf_entries++;
index 328956f..85a1cd8 100644 (file)
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
 /*
  * Set up the GDT
  */
-#define GDT_ENTRY(flags, base, limit)          \
-       (((u64)(base & 0xff000000) << 32) |     \
-        ((u64)flags << 40) |                   \
-        ((u64)(limit & 0x00ff0000) << 32) |    \
-        ((u64)(base & 0x00ffffff) << 16) |     \
-        ((u64)(limit & 0x0000ffff)))
 
 struct gdt_ptr {
        u16 len;
index 9bc34e2..4d73f53 100644 (file)
@@ -2047,7 +2047,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 # CONFIG_SAMPLES is not set
 # CONFIG_KGDB is not set
 CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_NONPROMISC_DEVMEM is not set
+# CONFIG_STRICT_DEVMEM is not set
 CONFIG_EARLY_PRINTK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
index ae5124e..a404524 100644 (file)
@@ -2012,7 +2012,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 # CONFIG_SAMPLES is not set
 # CONFIG_KGDB is not set
 CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_NONPROMISC_DEVMEM is not set
+# CONFIG_STRICT_DEVMEM is not set
 CONFIG_EARLY_PRINTK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
index cb3856a..20af4c7 100644 (file)
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
+#define FIX_EFLAGS     (X86_EFLAGS_AC | X86_EFLAGS_OF | \
+                        X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+                        X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+                        X86_EFLAGS_CF)
+
 asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
        regs->ss |= 3;
 
        err |= __get_user(tmpflags, &sc->flags);
-       regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+       regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
        /* disable syscall checks */
        regs->orig_ax = -1;
 
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                        compat_sigset_t *set, struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
-       struct exec_domain *ed = current_thread_info()->exec_domain;
        void __user *restorer;
        int err = 0;
 
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
                goto give_sigsegv;
 
-       err |= __put_user((ed && ed->signal_invmap && sig < 32
-                          ? ed->signal_invmap[sig] : sig), &frame->sig);
+       err |= __put_user(sig, &frame->sig);
        err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
        err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
        err |= copy_siginfo_to_user32(&frame->info, info);
index 20371d0..23d146c 100644 (file)
        movq    %rax,R8(%rsp)
        .endm
 
+       /*
+        * Reload arg registers from stack in case ptrace changed them.
+        * We don't reload %eax because syscall_trace_enter() returned
+        * the value it wants us to use in the table lookup.
+        */
        .macro LOAD_ARGS32 offset
        movl \offset(%rsp),%r11d
        movl \offset+8(%rsp),%r10d
@@ -46,7 +51,6 @@
        movl \offset+48(%rsp),%edx
        movl \offset+56(%rsp),%esi
        movl \offset+64(%rsp),%edi
-       movl \offset+72(%rsp),%eax
        .endm
        
        .macro CFI_STARTPROC32 simple
@@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target)
        .previous       
        GET_THREAD_INFO(%r10)
        orl    $TS_COMPAT,TI_status(%r10)
-       testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-                TI_flags(%r10)
+       testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
        CFI_REMEMBER_STATE
        jnz  sysenter_tracesys
-sysenter_do_call:      
        cmpl    $(IA32_NR_syscalls-1),%eax
        ja      ia32_badsys
+sysenter_do_call:
        IA32_ARG_FIXUP 1
        call    *ia32_sys_call_table(,%rax,8)
        movq    %rax,RAX-ARGOFFSET(%rsp)
@@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target)
        .previous       
        GET_THREAD_INFO(%r10)
        orl   $TS_COMPAT,TI_status(%r10)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-               TI_flags(%r10)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
        CFI_REMEMBER_STATE
        jnz   cstar_tracesys
 cstar_do_call: 
@@ -321,6 +323,7 @@ ENTRY(ia32_syscall)
        /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
        /*CFI_REL_OFFSET        cs,CS-RIP*/
        CFI_REL_OFFSET  rip,RIP-RIP
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        SWAPGS
        /*
         * No need to follow this irqs on/off section: the syscall
@@ -336,8 +339,7 @@ ENTRY(ia32_syscall)
        SAVE_ARGS 0,0,1
        GET_THREAD_INFO(%r10)
        orl   $TS_COMPAT,TI_status(%r10)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-               TI_flags(%r10)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
        jnz ia32_tracesys
 ia32_do_syscall:       
        cmpl $(IA32_NR_syscalls-1),%eax
index da14061..3db651f 100644 (file)
@@ -7,9 +7,10 @@ extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt.o = -pg
 endif
 
 #
@@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC)          += olpc.o
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
         obj-y                          += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+       obj-y                           += bios_uv.o
         obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
         obj-$(CONFIG_AUDIT)            += audit_64.o
 
index 868de3d..a3ddad1 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/bootmem.h>
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
+#include <asm/segment.h>
 
 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
 static char temp_stack[10240];
 #endif
 
-/* XXX: this macro should move to asm-x86/segment.h and be shared with the
-   boot code... */
-#define GDT_ENTRY(flags, base, limit)          \
-       (((u64)(base & 0xff000000) << 32) |     \
-        ((u64)flags << 40) |                   \
-        ((u64)(limit & 0x00ff0000) << 32) |    \
-        ((u64)(base & 0x00ffffff) << 16) |     \
-        ((u64)(limit & 0x0000ffff)))
-
 /**
  * acpi_save_state_mem - save kernel state
  *
index f2766d8..c25210e 100644 (file)
@@ -23,7 +23,7 @@
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
 #include <asm/proto.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 
 #define to_pages(addr, size) \
         (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
 
+#define EXIT_LOOP_COUNT 10000000
+
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
-struct command {
+/*
+ * general struct to manage commands send to an IOMMU
+ */
+struct iommu_cmd {
        u32 data[4];
 };
 
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                             struct unity_map_entry *e);
 
+/* returns !0 if the IOMMU is caching non-present entries in its TLB */
 static int iommu_has_npcache(struct amd_iommu *iommu)
 {
        return iommu->cap & IOMMU_CAP_NPCACHE;
 }
 
-static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+/*
+ * Writes the command to the IOMMUs command buffer and informs the
+ * hardware about the new command. Must be called with iommu->lock held.
+ */
+static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
        u32 tail, head;
        u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
        return 0;
 }
 
-static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/*
+ * General queuing function for commands. Takes iommu->lock and calls
+ * __iommu_queue_command().
+ */
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
        unsigned long flags;
        int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
        return ret;
 }
 
+/*
+ * This function is called whenever we need to ensure that the IOMMU has
+ * completed execution of all commands we sent. It sends a
+ * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
+ * us about that by writing a value to a physical address we pass with
+ * the command.
+ */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
        int ret;
-       struct command cmd;
+       struct iommu_cmd cmd;
        volatile u64 ready = 0;
        unsigned long ready_phys = virt_to_phys(&ready);
+       unsigned long i = 0;
 
        memset(&cmd, 0, sizeof(cmd));
        cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
-       cmd.data[1] = HIGH_U32(ready_phys);
+       cmd.data[1] = upper_32_bits(ready_phys);
        cmd.data[2] = 1; /* value written to 'ready' */
        CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
 
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
        if (ret)
                return ret;
 
-       while (!ready)
+       while (!ready && (i < EXIT_LOOP_COUNT)) {
+               ++i;
                cpu_relax();
+       }
+
+       if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
+               printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
 
        return 0;
 }
 
+/*
+ * Command send function for invalidating a device table entry
+ */
 static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 {
-       struct command cmd;
+       struct iommu_cmd cmd;
 
        BUG_ON(iommu == NULL);
 
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
        return iommu_queue_command(iommu, &cmd);
 }
 
+/*
+ * Generic command send function for invalidaing TLB entries
+ */
 static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
                u64 address, u16 domid, int pde, int s)
 {
-       struct command cmd;
+       struct iommu_cmd cmd;
 
        memset(&cmd, 0, sizeof(cmd));
        address &= PAGE_MASK;
        CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
        cmd.data[1] |= domid;
        cmd.data[2] = LOW_U32(address);
-       cmd.data[3] = HIGH_U32(address);
-       if (s)
+       cmd.data[3] = upper_32_bits(address);
+       if (s) /* size bit - we flush more than one 4kb page */
                cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-       if (pde)
+       if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
                cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 
        iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
        return iommu_queue_command(iommu, &cmd);
 }
 
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
 static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
                u64 address, size_t size)
 {
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
        return 0;
 }
 
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * Generic mapping functions. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
 static int iommu_map(struct protection_domain *dom,
                     unsigned long bus_addr,
                     unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
        return 0;
 }
 
+/*
+ * This function checks if a specific unity mapping entry is needed for
+ * this specific IOMMU.
+ */
 static int iommu_for_unity_map(struct amd_iommu *iommu,
                               struct unity_map_entry *entry)
 {
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
        return 0;
 }
 
+/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default domain DMA of that IOMMU if necessary.
+ */
 static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 {
        struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
        return 0;
 }
 
+/*
+ * This function actually applies the mapping to the page table of the
+ * dma_ops domain.
+ */
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                             struct unity_map_entry *e)
 {
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
        return 0;
 }
 
+/*
+ * Inits the unity mappings required for a specific device
+ */
 static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
                                          u16 devid)
 {
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
        return 0;
 }
 
+/****************************************************************************
+ *
+ * The next functions belong to the address allocator for the dma_ops
+ * interface functions. They work like the allocators in the other IOMMU
+ * drivers. Its basically a bitmap which marks the allocated pages in
+ * the aperture. Maybe it could be enhanced in the future to a more
+ * efficient allocator.
+ *
+ ****************************************************************************/
 static unsigned long dma_mask_to_pages(unsigned long mask)
 {
        return (mask >> PAGE_SHIFT) +
                (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
 }
 
+/*
+ * The address allocator core function.
+ *
+ * called with domain->lock held
+ */
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
                                             struct dma_ops_domain *dom,
                                             unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
        return address;
 }
 
+/*
+ * The address free function.
+ *
+ * called with domain->lock held
+ */
 static void dma_ops_free_addresses(struct dma_ops_domain *dom,
                                   unsigned long address,
                                   unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
        iommu_area_free(dom->bitmap, address, pages);
 }
 
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device get its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
 static u16 domain_id_alloc(void)
 {
        unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
        return id;
 }
 
+/*
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges.
+ */
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
                                      unsigned long start_page,
                                      unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
        free_page((unsigned long)p1);
 }
 
+/*
+ * Free a domain, only used if something went wrong in the
+ * allocation path and we need to free an already allocated page table
+ */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
        if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
        kfree(dom);
 }
 
+/*
+ * Allocates a new protection domain usable for the dma_ops functions.
+ * It also intializes the page table and the address allocator data
+ * structures required for the dma_ops interface
+ */
 static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
                                                   unsigned order)
 {
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
        dma_dom->bitmap[0] = 1;
        dma_dom->next_bit = 0;
 
+       /* Intialize the exclusion range if necessary */
        if (iommu->exclusion_start &&
            iommu->exclusion_start < dma_dom->aperture_size) {
                unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
                dma_ops_reserve_addresses(dma_dom, startpage, pages);
        }
 
+       /*
+        * At the last step, build the page tables so we don't need to
+        * allocate page table pages in the dma_ops mapping/unmapping
+        * path.
+        */
        num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
        dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
                        GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
        return NULL;
 }
 
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
 static struct protection_domain *domain_for_device(u16 devid)
 {
        struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
        return dom;
 }
 
+/*
+ * If a device is not yet associated with a domain, this function does
+ * assigns it visible for the hardware
+ */
 static void set_device_domain(struct amd_iommu *iommu,
                              struct protection_domain *domain,
                              u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
        iommu->need_sync = 1;
 }
 
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+/*
+ * In the dma_ops path we only have the struct device. This function
+ * finds the corresponding IOMMU, the protection domain and the
+ * requestor id for a given device.
+ * If the device is not yet associated with a domain this is also done
+ * in this function.
+ */
 static int get_device_resources(struct device *dev,
                                struct amd_iommu **iommu,
                                struct protection_domain **domain,
@@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev,
        BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
 
        pcidev = to_pci_dev(dev);
-       _bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+       _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
 
+       /* device not translated by any IOMMU in the system? */
        if (_bdf >= amd_iommu_last_bdf) {
                *iommu = NULL;
                *domain = NULL;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
        return 1;
 }
 
+/*
+ * This is the generic map function. It maps one 4kb page at paddr to
+ * the given address in the DMA address space for the domain.
+ */
 static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
                                     struct dma_ops_domain *dom,
                                     unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
        return (dma_addr_t)address;
 }
 
+/*
+ * The generic unmapping function for on page in the DMA address space.
+ */
 static void dma_ops_domain_unmap(struct amd_iommu *iommu,
                                 struct dma_ops_domain *dom,
                                 unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
        *pte = 0ULL;
 }
 
+/*
+ * This function contains common code for mapping of a physically
+ * contiguous memory region into DMA address space. It is uses by all
+ * mapping functions provided by this IOMMU driver.
+ * Must be called with the domain lock held.
+ */
 static dma_addr_t __map_single(struct device *dev,
                               struct amd_iommu *iommu,
                               struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
        return address;
 }
 
+/*
+ * Does the reverse of the __map_single function. Must be called with
+ * the domain lock held too
+ */
 static void __unmap_single(struct amd_iommu *iommu,
                           struct dma_ops_domain *dma_dom,
                           dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
        dma_ops_free_addresses(dma_dom, dma_addr, pages);
 }
 
+/*
+ * The exported map_single function for dma_ops.
+ */
 static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
                             size_t size, int dir)
 {
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
        get_device_resources(dev, &iommu, &domain, &devid);
 
        if (iommu == NULL || domain == NULL)
+               /* device not handled by any AMD IOMMU */
                return (dma_addr_t)paddr;
 
        spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
        return addr;
 }
 
+/*
+ * The exported unmap_single function for dma_ops.
+ */
 static void unmap_single(struct device *dev, dma_addr_t dma_addr,
                         size_t size, int dir)
 {
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
        u16 devid;
 
        if (!get_device_resources(dev, &iommu, &domain, &devid))
+               /* device not handled by any AMD IOMMU */
                return;
 
        spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
        spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * This is a special map_sg function which is used if we should map a
+ * device which is not handled by an AMD IOMMU in the system.
+ */
 static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
                           int nelems, int dir)
 {
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
        return nelems;
 }
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static int map_sg(struct device *dev, struct scatterlist *sglist,
                  int nelems, int dir)
 {
@@ -775,6 +953,10 @@ unmap:
        goto out;
 }
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
                     int nelems, int dir)
 {
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
        spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * The exported alloc_coherent function for dma_ops.
+ */
 static void *alloc_coherent(struct device *dev, size_t size,
                            dma_addr_t *dma_addr, gfp_t flag)
 {
@@ -851,6 +1036,11 @@ out:
        return virt_addr;
 }
 
+/*
+ * The exported free_coherent function for dma_ops.
+ * FIXME: fix the generic x86 DMA layer so that it actually calls that
+ *        function.
+ */
 static void free_coherent(struct device *dev, size_t size,
                          void *virt_addr, dma_addr_t dma_addr)
 {
@@ -879,6 +1069,8 @@ free_mem:
 }
 
 /*
+ * The function for pre-allocating protection domains.
+ *
  * If the driver core informs the DMA layer if a driver grabs a device
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
        .unmap_sg = unmap_sg,
 };
 
+/*
+ * The function which clues the AMD IOMMU driver into dma_ops.
+ */
 int __init amd_iommu_init_dma_ops(void)
 {
        struct amd_iommu *iommu;
        int order = amd_iommu_aperture_order;
        int ret;
 
+       /*
+        * first allocate a default protection domain for every IOMMU we
+        * found in the system. Devices not assigned to any other
+        * protection domain will be assigned to the default one.
+        */
        list_for_each_entry(iommu, &amd_iommu_list, list) {
                iommu->default_dom = dma_ops_domain_alloc(iommu, order);
                if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
                        goto free_domains;
        }
 
+       /*
+        * If device isolation is enabled, pre-allocate the protection
+        * domains for each device.
+        */
        if (amd_iommu_isolate)
                prealloc_protection_domains();
 
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
        gart_iommu_aperture = 0;
 #endif
 
+       /* Make the driver finally visible to the drivers */
        dma_ops = &amd_iommu_dma_ops;
 
        return 0;
index 2a13e43..c9d8ff2 100644 (file)
 #include <asm/pci-direct.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 
 /*
  * definitions for the ACPI scanning code
  */
-#define UPDATE_LAST_BDF(x) do {\
-       if ((x) > amd_iommu_last_bdf) \
-               amd_iommu_last_bdf = (x); \
-       } while (0);
-
-#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
 #define PCI_BUS(x) (((x) >> 8) & 0xff)
 #define IVRS_HEADER_LENGTH 48
-#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
 
 #define ACPI_IVHD_TYPE                  0x10
 #define ACPI_IVMD_TYPE_ALL              0x20
 #define ACPI_DEVFLAG_LINT1              0x80
 #define ACPI_DEVFLAG_ATSDIS             0x10000000
 
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entrys.
+ */
 struct ivhd_header {
        u8 type;
        u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
        u32 reserved;
 } __attribute__((packed));
 
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
 struct ivhd_entry {
        u8 type;
        u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
        u32 ext;
 } __attribute__((packed));
 
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
 struct ivmd_header {
        u8 type;
        u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
 
 static int __initdata amd_iommu_detected;
 
-u16 amd_iommu_last_bdf;
-struct list_head amd_iommu_unity_map;
-unsigned amd_iommu_aperture_order = 26;
-int amd_iommu_isolate;
+u16 amd_iommu_last_bdf;                        /* largest PCI device id we have
+                                          to handle */
+LIST_HEAD(amd_iommu_unity_map);                /* a list of required unity mappings
+                                          we find in ACPI */
+unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+int amd_iommu_isolate;                 /* if 1, device isolation is enabled */
+
+LIST_HEAD(amd_iommu_list);             /* list of all AMD IOMMUs in the
+                                          system */
 
-struct list_head amd_iommu_list;
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
 struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
 u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
 struct amd_iommu **amd_iommu_rlookup_table;
+
+/*
+ * The pd table (protection domain table) is used to find the protection domain
+ * data structure a device belongs to. Indexed with the PCI device id too.
+ */
 struct protection_domain **amd_iommu_pd_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
 unsigned long *amd_iommu_pd_alloc_bitmap;
 
-static u32 dev_table_size;
-static u32 alias_table_size;
-static u32 rlookup_table_size;
+static u32 dev_table_size;     /* size of the device table */
+static u32 alias_table_size;   /* size of the alias table */
+static u32 rlookup_table_size; /* size if the rlookup table */
 
+static inline void update_last_devid(u16 devid)
+{
+       if (devid > amd_iommu_last_bdf)
+               amd_iommu_last_bdf = devid;
+}
+
+static inline unsigned long tbl_size(int entry_size)
+{
+       unsigned shift = PAGE_SHIFT +
+                        get_order(amd_iommu_last_bdf * entry_size);
+
+       return 1UL << shift;
+}
+
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function set the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated
+ */
 static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
 {
        u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
                        &entry, sizeof(entry));
 }
 
+/* Programs the physical address of the device table into the IOMMU hardware */
 static void __init iommu_set_device_table(struct amd_iommu *iommu)
 {
        u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
                        &entry, sizeof(entry));
 }
 
+/* Generic functions to enable/disable certain features of the IOMMU. */
 static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 {
        u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
        writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
 }
 
+/* Function to enable the hardware */
 void __init iommu_enable(struct amd_iommu *iommu)
 {
        printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
        iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
 static u8 * __init iommu_map_mmio_space(u64 address)
 {
        u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
        release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
 }
 
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function reads the last device id the IOMMU has to handle from the PCI
+ * capability header for this IOMMU
+ */
 static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
 {
        u32 cap;
 
        cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-       UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+       update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
 
        return 0;
 }
 
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
 static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 {
        u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
                case IVHD_DEV_RANGE_END:
                case IVHD_DEV_ALIAS:
                case IVHD_DEV_EXT_SELECT:
-                       UPDATE_LAST_BDF(dev->devid);
+                       /* all the above subfield types refer to device ids */
+                       update_last_devid(dev->devid);
                        break;
                default:
                        break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
        return 0;
 }
 
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
 static int __init find_last_devid_acpi(struct acpi_table_header *table)
 {
        int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
        return 0;
 }
 
+/****************************************************************************
+ *
+ * The following functions belong the the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
 static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 {
-       u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+       u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                        get_order(CMD_BUFFER_SIZE));
-       u64 entry = 0;
+       u64 entry;
 
        if (cmd_buf == NULL)
                return NULL;
 
        iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 
-       memset(cmd_buf, 0, CMD_BUFFER_SIZE);
-
        entry = (u64)virt_to_phys(cmd_buf);
        entry |= MMIO_CMD_SIZE_512;
        memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
-       if (iommu->cmd_buf)
-               free_pages((unsigned long)iommu->cmd_buf,
-                               get_order(CMD_BUFFER_SIZE));
+       free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
 }
 
+/* sets a specific bit in the device table entry. */
 static void set_dev_entry_bit(u16 devid, u8 bit)
 {
        int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
        amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
 }
 
-static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+       amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+                                          u16 devid, u32 flags, u32 ext_flags)
 {
        if (flags & ACPI_DEVFLAG_INITPASS)
                set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
                set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
        if (flags & ACPI_DEVFLAG_LINT1)
                set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-}
 
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-       amd_iommu_rlookup_table[devid] = iommu;
+       set_iommu_for_device(iommu, devid);
 }
 
+/*
+ * Reads the device exclusion range from ACPI and initialize IOMMU with
+ * it
+ */
 static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 {
        struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
                return;
 
        if (iommu) {
+               /*
+                * We only can configure exclusion ranges per IOMMU, not
+                * per device. But we can enable the exclusion range per
+                * device. This is done here
+                */
                set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
                iommu->exclusion_start = m->range_start;
                iommu->exclusion_length = m->range_length;
        }
 }
 
+/*
+ * This function reads some important data from the IOMMU PCI space and
+ * initializes the driver data structure with it. It reads the hardware
+ * capabilities and the first/last device entries
+ */
 static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 {
        int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
        iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
 
        range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-       iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
-       iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+       iommu->first_device = calc_devid(MMIO_GET_BUS(range),
+                                        MMIO_GET_FD(range));
+       iommu->last_device = calc_devid(MMIO_GET_BUS(range),
+                                       MMIO_GET_LD(range));
 }
 
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
 static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                                        struct ivhd_header *h)
 {
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
        u8 *end = p, flags = 0;
        u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
        u32 ext_flags = 0;
-       bool alias = 0;
+       bool alias = false;
        struct ivhd_entry *e;
 
        /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                case IVHD_DEV_ALL:
                        for (dev_i = iommu->first_device;
                                        dev_i <= iommu->last_device; ++dev_i)
-                               set_dev_entry_from_acpi(dev_i, e->flags, 0);
+                               set_dev_entry_from_acpi(iommu, dev_i,
+                                                       e->flags, 0);
                        break;
                case IVHD_DEV_SELECT:
                        devid = e->devid;
-                       set_dev_entry_from_acpi(devid, e->flags, 0);
+                       set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
                        break;
                case IVHD_DEV_SELECT_RANGE_START:
                        devid_start = e->devid;
                        flags = e->flags;
                        ext_flags = 0;
-                       alias = 0;
+                       alias = false;
                        break;
                case IVHD_DEV_ALIAS:
                        devid = e->devid;
                        devid_to = e->ext >> 8;
-                       set_dev_entry_from_acpi(devid, e->flags, 0);
+                       set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
                        amd_iommu_alias_table[devid] = devid_to;
                        break;
                case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                        flags = e->flags;
                        devid_to = e->ext >> 8;
                        ext_flags = 0;
-                       alias = 1;
+                       alias = true;
                        break;
                case IVHD_DEV_EXT_SELECT:
                        devid = e->devid;
-                       set_dev_entry_from_acpi(devid, e->flags, e->ext);
+                       set_dev_entry_from_acpi(iommu, devid, e->flags,
+                                               e->ext);
                        break;
                case IVHD_DEV_EXT_SELECT_RANGE:
                        devid_start = e->devid;
                        flags = e->flags;
                        ext_flags = e->ext;
-                       alias = 0;
+                       alias = false;
                        break;
                case IVHD_DEV_RANGE_END:
                        devid = e->devid;
                        for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
                                if (alias)
                                        amd_iommu_alias_table[dev_i] = devid_to;
-                               set_dev_entry_from_acpi(
+                               set_dev_entry_from_acpi(iommu,
                                                amd_iommu_alias_table[dev_i],
                                                flags, ext_flags);
                        }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
        }
 }
 
+/* Initializes the device->iommu mapping for the driver */
 static int __init init_iommu_devices(struct amd_iommu *iommu)
 {
        u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
        }
 }
 
+/*
+ * This function clues the initialization function for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
 static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 {
        spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
        return 0;
 }
 
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
 static int __init init_iommu_all(struct acpi_table_header *table)
 {
        u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
        struct amd_iommu *iommu;
        int ret;
 
-       INIT_LIST_HEAD(&amd_iommu_list);
-
        end += table->length;
        p += IVRS_HEADER_LENGTH;
 
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
        return 0;
 }
 
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping reanges).
+ *
+ ****************************************************************************/
+
 static void __init free_unity_maps(void)
 {
        struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
        }
 }
 
+/* called when we find an exclusion range definition in ACPI */
 static int __init init_exclusion_range(struct ivmd_header *m)
 {
        int i;
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
        return 0;
 }
 
+/* called for unity map ACPI definition */
 static int __init init_unity_map_range(struct ivmd_header *m)
 {
        struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
        return 0;
 }
 
+/* iterates over all memory definitions we find in the ACPI table */
 static int __init init_memory_definitions(struct acpi_table_header *table)
 {
        u8 *p = (u8 *)table, *end = (u8 *)table;
        struct ivmd_header *m;
 
-       INIT_LIST_HEAD(&amd_iommu_unity_map);
-
        end += table->length;
        p += IVRS_HEADER_LENGTH;
 
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
        return 0;
 }
 
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized
+ */
 static void __init enable_iommus(void)
 {
        struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
        .cls = &amd_iommu_sysdev_class,
 };
 
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * three times:
+ *
+ *     1 pass) Find the highest PCI device id the driver has to handle.
+ *             Upon this information the size of the data structures is
+ *             determined that needs to be allocated.
+ *
+ *     2 pass) Initialize the data structures just allocated with the
+ *             information in the ACPI table about available AMD IOMMUs
+ *             in the system. It also maps the PCI devices in the
+ *             system to specific IOMMUs
+ *
+ *     3 pass) After the basic data structures are allocated and
+ *             initialized we update them with information about memory
+ *             remapping requirements parsed out of the ACPI table in
+ *             this last pass.
+ *
+ * After that the hardware is initialized and ready to go. In the last
+ * step we do some Linux specific things like registering the driver in
+ * the dma_ops interface and initializing the suspend/resume support
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
 int __init amd_iommu_init(void)
 {
        int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
        if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
                return -ENODEV;
 
-       dev_table_size     = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
-       alias_table_size   = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
-       rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+       dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
+       alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+       rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
 
        ret = -ENOMEM;
 
        /* Device table - directly used by all IOMMUs */
-       amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+       amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                      get_order(dev_table_size));
        if (amd_iommu_dev_table == NULL)
                goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
         * Protection Domain table - maps devices to protection domains
         * This table has the same size as the rlookup_table
         */
-       amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+       amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                     get_order(rlookup_table_size));
        if (amd_iommu_pd_table == NULL)
                goto free;
 
-       amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+       amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+                                           GFP_KERNEL | __GFP_ZERO,
                                            get_order(MAX_DOMAIN_ID/8));
        if (amd_iommu_pd_alloc_bitmap == NULL)
                goto free;
 
        /*
-        * memory is allocated now; initialize the device table with all zeroes
-        * and let all alias entries point to itself
+        * let all alias entries point to itself
         */
-       memset(amd_iommu_dev_table, 0, dev_table_size);
        for (i = 0; i < amd_iommu_last_bdf; ++i)
                amd_iommu_alias_table[i] = i;
 
-       memset(amd_iommu_pd_table, 0, rlookup_table_size);
-       memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
-
        /*
         * never allocate domain 0 because its used as the non-allocated and
         * error value placeholder
@@ -795,24 +981,19 @@ out:
        return ret;
 
 free:
-       if (amd_iommu_pd_alloc_bitmap)
-               free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+       free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
 
-       if (amd_iommu_pd_table)
-               free_pages((unsigned long)amd_iommu_pd_table,
-                               get_order(rlookup_table_size));
+       free_pages((unsigned long)amd_iommu_pd_table,
+                  get_order(rlookup_table_size));
 
-       if (amd_iommu_rlookup_table)
-               free_pages((unsigned long)amd_iommu_rlookup_table,
-                               get_order(rlookup_table_size));
+       free_pages((unsigned long)amd_iommu_rlookup_table,
+                  get_order(rlookup_table_size));
 
-       if (amd_iommu_alias_table)
-               free_pages((unsigned long)amd_iommu_alias_table,
-                               get_order(alias_table_size));
+       free_pages((unsigned long)amd_iommu_alias_table,
+                  get_order(alias_table_size));
 
-       if (amd_iommu_dev_table)
-               free_pages((unsigned long)amd_iommu_dev_table,
-                               get_order(dev_table_size));
+       free_pages((unsigned long)amd_iommu_dev_table,
+                  get_order(dev_table_size));
 
        free_iommu_all();
 
@@ -821,6 +1002,13 @@ free:
        goto out;
 }
 
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
 static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 {
        return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 
 void __init amd_iommu_detect(void)
 {
-       if (swiotlb || no_iommu || iommu_detected)
+       if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
                return;
 
        if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
        }
 }
 
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
 static int __init parse_amd_iommu_options(char *str)
 {
        for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
 
 static int __init parse_amd_iommu_size_options(char *str)
 {
-       for (; *str; ++str) {
-               if (strcmp(str, "32M") == 0)
-                       amd_iommu_aperture_order = 25;
-               if (strcmp(str, "64M") == 0)
-                       amd_iommu_aperture_order = 26;
-               if (strcmp(str, "128M") == 0)
-                       amd_iommu_aperture_order = 27;
-               if (strcmp(str, "256M") == 0)
-                       amd_iommu_aperture_order = 28;
-               if (strcmp(str, "512M") == 0)
-                       amd_iommu_aperture_order = 29;
-               if (strcmp(str, "1G") == 0)
-                       amd_iommu_aperture_order = 30;
-       }
+       unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
+
+       if ((order > 24) && (order < 31))
+               amd_iommu_aperture_order = order;
 
        return 1;
 }
index 9f90780..44e2182 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <asm/e820.h>
 #include <asm/io.h>
+#include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
index a437d02..d6c8983 100644 (file)
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 /*
  * Debug level, exported for io_apic.c
  */
-int apic_verbosity;
+unsigned int apic_verbosity;
 
 int pic_mode;
 
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
        /* Level triggered for 82489DX */
        if (!lapic_is_integrated())
                v |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT0, v);
+       apic_write(APIC_LVT0, v);
 }
 
 /**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
  * this function twice on the boot CPU, once with a bogus timeout
  * value, second time for real. The other (noncalibrating) CPUs
  * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
  */
 static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 {
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
        if (!irqen)
                lvtt_value |= APIC_LVT_MASKED;
 
-       apic_write_around(APIC_LVTT, lvtt_value);
+       apic_write(APIC_LVTT, lvtt_value);
 
        /*
         * Divide PICLK by 16
         */
        tmp_value = apic_read(APIC_TDCR);
-       apic_write_around(APIC_TDCR, (tmp_value
-                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-                               | APIC_TDR_DIV_16);
+       apic_write(APIC_TDCR,
+                  (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+                  APIC_TDR_DIV_16);
 
        if (!oneshot)
-               apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+               apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
 }
 
 /*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 static int lapic_next_event(unsigned long delta,
                            struct clock_event_device *evt)
 {
-       apic_write_around(APIC_TMICT, delta);
+       apic_write(APIC_TMICT, delta);
        return 0;
 }
 
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
        case CLOCK_EVT_MODE_SHUTDOWN:
                v = apic_read(APIC_LVTT);
                v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-               apic_write_around(APIC_LVTT, v);
+               apic_write(APIC_LVTT, v);
                break;
        case CLOCK_EVT_MODE_RESUME:
                /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
        }
 }
 
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
 {
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);
        const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
        long delta, deltapm;
        int pm_referenced = 0;
 
-       /*
-        * The local apic timer can be disabled via the kernel
-        * commandline or from the CPU detection code. Register the lapic
-        * timer as a dummy clock event source on SMP systems, so the
-        * broadcast mechanism is used. On UP systems simply ignore it.
-        */
-       if (local_apic_timer_disabled) {
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1) {
-                       lapic_clockevent.mult = 1;
-                       setup_APIC_timer();
-               }
-               return;
-       }
-
-       apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
-                   "calibrating APIC timer ...\n");
-
        local_irq_disable();
 
        /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
                    calibration_result / (1000000 / HZ),
                    calibration_result % (1000000 / HZ));
 
-       local_apic_timer_verify_ok = 1;
-
        /*
         * Do a sanity check on the APIC calibration result
         */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
                local_irq_enable();
                printk(KERN_WARNING
                       "APIC frequency too slow, disabling apic timer\n");
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1)
-                       setup_APIC_timer();
-               return;
+               return -1;
        }
 
+       local_apic_timer_verify_ok = 1;
+
        /* We trust the pm timer based calibration */
        if (!pm_referenced) {
                apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
        if (!local_apic_timer_verify_ok) {
                printk(KERN_WARNING
                       "APIC timer disabled due to verification failure.\n");
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+       /*
+        * The local apic timer can be disabled via the kernel
+        * commandline or from the CPU detection code. Register the lapic
+        * timer as a dummy clock event source on SMP systems, so the
+        * broadcast mechanism is used. On UP systems simply ignore it.
+        */
+       if (local_apic_timer_disabled) {
                /* No broadcast on UP ! */
-               if (num_possible_cpus() == 1)
-                       return;
-       } else {
-               /*
-                * If nmi_watchdog is set to IO_APIC, we need the
-                * PIT/HPET going.  Otherwise register lapic as a dummy
-                * device.
-                */
-               if (nmi_watchdog != NMI_IO_APIC)
-                       lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-               else
-                       printk(KERN_WARNING "APIC timer registered as dummy,"
-                               " due to nmi_watchdog=%d!\n", nmi_watchdog);
+               if (num_possible_cpus() > 1) {
+                       lapic_clockevent.mult = 1;
+                       setup_APIC_timer();
+               }
+               return;
        }
 
+       apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+                   "calibrating APIC timer ...\n");
+
+       if (calibrate_APIC_clock()) {
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() > 1)
+                       setup_APIC_timer();
+               return;
+       }
+
+       /*
+        * If nmi_watchdog is set to IO_APIC, we need the
+        * PIT/HPET going.  Otherwise register lapic as a dummy
+        * device.
+        */
+       if (nmi_watchdog != NMI_IO_APIC)
+               lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+       else
+               printk(KERN_WARNING "APIC timer registered as dummy,"
+                       " due to nmi_watchdog=%d!\n", nmi_watchdog);
+
        /* Setup the lapic or request the broadcast */
        setup_APIC_timer();
 }
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
         */
        if (maxlvt >= 3) {
                v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-               apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
        }
        /*
         * Careful: we have to set masks only first to deassert
         * any level-triggered sources.
         */
        v = apic_read(APIC_LVTT);
-       apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
        v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
        v = apic_read(APIC_LVT1);
-       apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
        if (maxlvt >= 4) {
                v = apic_read(APIC_LVTPC);
-               apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
        }
 
        /* lets not touch this if we didn't frob it */
 #ifdef CONFIG_X86_MCE_P4THERMAL
        if (maxlvt >= 5) {
                v = apic_read(APIC_LVTTHMR);
-               apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
        }
 #endif
        /*
         * Clean APIC state for other OSs:
         */
-       apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+       apic_write(APIC_LVTT, APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, APIC_LVT_MASKED);
        if (maxlvt >= 3)
-               apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
        if (maxlvt >= 4)
-               apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
        if (maxlvt >= 5)
-               apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
+               apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
 #endif
        /* Integrated APIC (!82489DX) ? */
        if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
         */
        value = apic_read(APIC_SPIV);
        value &= ~APIC_SPIV_APIC_ENABLED;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
 
        /*
         * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
        apic_wait_icr_idle();
 
        apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-       apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-                               | APIC_DM_INIT);
+       apic_write(APIC_ICR,
+                  APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
 }
 
 /*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
        else
                value |= APIC_SPIV_FOCUS_DISABLED;
        value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
 
        /*
         * Set up the virtual wire mode.
         */
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
        value = APIC_DM_NMI;
        if (!lapic_is_integrated())             /* 82489DX */
                value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
+       apic_write(APIC_LVT1, value);
 }
 
 static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
 
                /* enables sending errors */
                value = ERROR_APIC_VECTOR;
-               apic_write_around(APIC_LVTERR, value);
+               apic_write(APIC_LVTERR, value);
                /*
                 * spec says clear errors after enabling vector.
                 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
         */
        value = apic_read(APIC_TASKPRI);
        value &= ~APIC_TPRI_MASK;
-       apic_write_around(APIC_TASKPRI, value);
+       apic_write(APIC_TASKPRI, value);
 
        /*
         * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
         * Set spurious IRQ vector
         */
        value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
 
        /*
         * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
                apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
                                smp_processor_id());
        }
-       apic_write_around(APIC_LVT0, value);
+       apic_write(APIC_LVT0, value);
 
        /*
         * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
                value = APIC_DM_NMI | APIC_LVT_MASKED;
        if (!integrated)                /* 82489DX */
                value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
+       apic_write(APIC_LVT1, value);
 }
 
 void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
        /* Disable the local apic timer */
        value = apic_read(APIC_LVTT);
        value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-       apic_write_around(APIC_LVTT, value);
+       apic_write(APIC_LVTT, value);
 
        setup_apic_nmi_watchdog(NULL);
        apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
 
 int __init APIC_init_uniprocessor(void)
 {
-       if (disable_apic)
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-
        if (!smp_found_config && !cpu_has_apic)
                return -1;
 
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                value &= ~APIC_VECTOR_MASK;
                value |= APIC_SPIV_APIC_ENABLED;
                value |= 0xf;
-               apic_write_around(APIC_SPIV, value);
+               apic_write(APIC_SPIV, value);
 
                if (!virt_wire_setup) {
                        /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                                APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
                        value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
                        value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-                       apic_write_around(APIC_LVT0, value);
+                       apic_write(APIC_LVT0, value);
                } else {
                        /* Disable LVT0 */
-                       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+                       apic_write(APIC_LVT0, APIC_LVT_MASKED);
                }
 
                /*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                        APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
                value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
                value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-               apic_write_around(APIC_LVT1, value);
+               apic_write(APIC_LVT1, value);
        }
 }
 
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
 static int __init parse_nolapic(char *arg)
 {
        disable_apic = 1;
-       clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       setup_clear_cpu_cap(X86_FEATURE_APIC);
        return 0;
 }
 early_param("nolapic", parse_nolapic);
index 1e3d32e..7f1f030 100644 (file)
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 /*
  * Debug level, exported for io_apic.c
  */
-int apic_verbosity;
+unsigned int apic_verbosity;
 
 /* Have we found an MP table */
 int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
 
 #define TICK_COUNT 100000000
 
-static void __init calibrate_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
 {
        unsigned apic, apic_start;
        unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
                clockevent_delta2ns(0xF, &lapic_clockevent);
 
        calibration_result = result / HZ;
+
+       /*
+        * Do a sanity check on the APIC calibration result
+        */
+       if (calibration_result < (1000000 / HZ)) {
+               printk(KERN_WARNING
+                       "APIC frequency too slow, disabling apic timer\n");
+               return -1;
+       }
+
+       return 0;
 }
 
 /*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
        }
 
        printk(KERN_INFO "Using local APIC timer interrupts.\n");
-       calibrate_APIC_clock();
-
-       /*
-        * Do a sanity check on the APIC calibration result
-        */
-       if (calibration_result < (1000000 / HZ)) {
-               printk(KERN_WARNING
-                      "APIC frequency too slow, disabling apic timer\n");
+       if (calibrate_APIC_clock()) {
                /* No broadcast on UP ! */
                if (num_possible_cpus() > 1)
                        setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
 static __init int setup_disableapic(char *str)
 {
        disable_apic = 1;
-       clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       setup_clear_cpu_cap(X86_FEATURE_APIC);
        return 0;
 }
 early_param("disableapic", setup_disableapic);
index bacf5de..aa89387 100644 (file)
@@ -18,6 +18,8 @@
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
 
+#include <xen/interface/xen.h>
+
 #define __NO_STUBS 1
 #undef __SYSCALL
 #undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
        OFFSET(BP_loadflags, boot_params, hdr.loadflags);
        OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
        OFFSET(BP_version, boot_params, hdr.version);
+
+       BLANK();
+       DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+       BLANK();
+       OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+       OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
        return 0;
 }
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644 (file)
index 0000000..c639bd5
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * BIOS run time interface routines.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/uv/bios.h>
+
+const char *
+x86_bios_strerror(long status)
+{
+       const char *str;
+       switch (status) {
+       case  0: str = "Call completed without error"; break;
+       case -1: str = "Not implemented"; break;
+       case -2: str = "Invalid argument"; break;
+       case -3: str = "Call completed with error"; break;
+       default: str = "Unknown BIOS status code"; break;
+       }
+       return str;
+}
+
+long
+x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
+                  unsigned long *drift_info)
+{
+       struct uv_bios_retval isrv;
+
+       BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
+       *ticks_per_second = isrv.v0;
+       *drift_info = isrv.v1;
+       return isrv.status;
+}
+EXPORT_SYMBOL_GPL(x86_bios_freq_base);
index 81a07ca..cae9cab 100644 (file)
@@ -24,8 +24,6 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
-int force_mwait __cpuinitdata;
-
 static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 {
        if (cpuid_eax(0x80000000) >= 0x80000007) {
index 7c36fb8..d1692b2 100644 (file)
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
        /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
        if (c->x86_power & (1<<8))
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+       set_cpu_cap(c, X86_FEATURE_SYSCALL32);
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
index 1b1c56b..c9b58a8 100644 (file)
@@ -131,13 +131,7 @@ static void __init check_popad(void)
  *   (for due to lack of "invlpg" and working WP on a i386)
  * - In order to run on anything without a TSC, we need to be
  *   compiled for a i486.
- * - In order to support the local APIC on a buggy Pentium machine,
- *   we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
- *   which happens implicitly if compiled for a Pentium or lower
- *   (unless an advanced selection of CPU features is used) as an
- *   otherwise config implies a properly working local APIC without
- *   the need to do extra reads from the APIC.
-*/
+ */
 
 static void __init check_config(void)
 {
@@ -151,21 +145,6 @@ static void __init check_config(void)
        if (boot_cpu_data.x86 == 3)
                panic("Kernel requires i486+ for 'invlpg' and other features");
 #endif
-
-/*
- * If we were told we had a good local APIC, check for buggy Pentia,
- * i.e. all B steppings and the C2 stepping of P54C when using their
- * integrated APIC (see 11AP erratum in "Pentium Processor
- * Specification Update").
- */
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
-           && cpu_has_apic
-           && boot_cpu_data.x86 == 5
-           && boot_cpu_data.x86_model == 2
-           && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
-               panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
-#endif
 }
 
 
index 7b8cc72..dd6e3f1 100644 (file)
@@ -7,15 +7,13 @@
 #include <linux/module.h>
 #include <linux/kgdb.h>
 #include <linux/topology.h>
-#include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
-#include <linux/module.h>
 #include <linux/percpu.h>
-#include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
 #include <asm/io.h>
+#include <asm/linkage.h>
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
                        c->x86_capability[2] = cpuid_edx(0x80860001);
        }
 
-       c->extended_cpuid_level = cpuid_eax(0x80000000);
        if (c->extended_cpuid_level >= 0x80000007)
                c->x86_power = cpuid_edx(0x80000007);
 
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
                c->x86_phys_bits = eax & 0xff;
        }
 
-       /* Assume all 64-bit CPUs support 32-bit syscall */
-       set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-
        if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
            cpu_devs[c->x86_vendor]->c_early_init)
                cpu_devs[c->x86_vendor]->c_early_init(c);
 
        validate_pat_support(c);
-
-       /* early_param could clear that, but recall get it set again */
-       if (disable_apic)
-               clear_cpu_cap(c, X86_FEATURE_APIC);
 }
 
 /*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
 }
 
 char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-                          DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
+                          DEBUG_STKSZ] __page_aligned_bss;
 
 extern asmlinkage void ignore_sysret(void);
 
index 70609ef..b75f256 100644 (file)
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
        if (cpu_has_bts)
                ds_init_intel(c);
 
+       /*
+        * See if we have a good local APIC by checking for buggy Pentia,
+        * i.e. all B steppings and the C2 stepping of P54C when using their
+        * integrated APIC (see 11AP erratum in "Pentium Processor
+        * Specification Update").
+        */
+       if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+           (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+               set_cpu_cap(c, X86_FEATURE_11AP);
+
 #ifdef CONFIG_X86_NUMAQ
        numaq_tsc_disable();
 #endif
index 2c8afaf..ff517f0 100644 (file)
@@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
                        }
                        kobject_put(per_cpu(cache_kobject, cpu));
                        cpuid4_cache_sysfs_exit(cpu);
-                       break;
+                       return retval;
                }
                kobject_uevent(&(this_object->kobj), KOBJ_ADD);
        }
-       if (!retval)
-               cpu_set(cpu, cache_dev_map);
+       cpu_set(cpu, cache_dev_map);
 
        kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
-       return retval;
+       return 0;
 }
 
 static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
index eef001a..9b60fce 100644 (file)
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
        /* The temperature transition interrupt handler setup */
        h = THERMAL_APIC_VECTOR;                /* our delivery vector */
        h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
-       apic_write_around(APIC_LVTTHMR, h);
+       apic_write(APIC_LVTTHMR, h);
 
        rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
        wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
        wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
 
        l = apic_read(APIC_LVTTHMR);
-       apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+       apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
        printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 
        /* enable thermal throttle processing */
index 28c2918..9af8907 100644 (file)
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
        for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
                count++;
 
-       printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+       printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+                        count, start, end);
        for (i = 0; i < count; i++) {
                struct early_res *r = &early_res[i];
                printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
        }
 }
 
-/*
- * Non-standard memory setup can be specified via this quirk:
- */
-char * (*arch_memory_setup_quirk)(void);
-
 char *__init default_machine_specific_memory_setup(void)
 {
        char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
 
 char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 {
-       if (arch_memory_setup_quirk) {
-               char *who = arch_memory_setup_quirk();
+       if (x86_quirks->arch_memory_setup) {
+               char *who = x86_quirks->arch_memory_setup();
 
                if (who)
                        return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
        printk(KERN_INFO "BIOS-provided physical RAM map:\n");
        e820_print_map(who);
 }
-
-#ifdef CONFIG_X86_64
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
-       int i;
-
-       if (slot < 0 || slot >= e820.nr_map)
-               return -1;
-       for (i = slot; i < e820.nr_map; i++) {
-               if (e820.map[i].type != E820_RAM)
-                       continue;
-               break;
-       }
-       if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
-               return -1;
-       *addr = e820.map[i].addr;
-       *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
-               max_pfn << PAGE_SHIFT) - *addr;
-       return i + 1;
-}
-#endif
index a0e11c0..4353cf5 100644 (file)
 #include <asm/dma.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
-
-#ifdef CONFIG_GART_IOMMU
-#include <asm/gart.h>
-#endif
+#include <asm/iommu.h>
 
 static void __init fix_hypertransport_config(int num, int slot, int func)
 {
index 6bc07f0..cdfd94c 100644 (file)
@@ -332,7 +332,7 @@ sysenter_past_esp:
        GET_THREAD_INFO(%ebp)
 
        /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
        jnz syscall_trace_entry
        cmpl $(nr_syscalls), %eax
        jae syscall_badsys
@@ -370,7 +370,7 @@ ENTRY(system_call)
        GET_THREAD_INFO(%ebp)
                                        # system call tracing in operation / emulation
        /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
        jnz syscall_trace_entry
        cmpl $(nr_syscalls), %eax
        jae syscall_badsys
@@ -383,10 +383,6 @@ syscall_exit:
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
        TRACE_IRQS_OFF
-       testl $X86_EFLAGS_TF,PT_EFLAGS(%esp)    # If tracing set singlestep flag on exit
-       jz no_singlestep
-       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
        movl TI_flags(%ebp), %ecx
        testw $_TIF_ALLWORK_MASK, %cx   # current->work
        jne syscall_exit_work
@@ -514,12 +510,8 @@ END(work_pending)
 syscall_trace_entry:
        movl $-ENOSYS,PT_EAX(%esp)
        movl %esp, %eax
-       xorl %edx,%edx
-       call do_syscall_trace
-       cmpl $0, %eax
-       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
-                                       # so must skip actual syscall
-       movl PT_ORIG_EAX(%esp), %eax
+       call syscall_trace_enter
+       /* What it returned is what we'll actually use.  */
        cmpl $(nr_syscalls), %eax
        jnae syscall_call
        jmp syscall_exit
@@ -528,14 +520,13 @@ END(syscall_trace_entry)
        # perform syscall exit tracing
        ALIGN
 syscall_exit_work:
-       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+       testb $_TIF_WORK_SYSCALL_EXIT, %cl
        jz work_pending
        TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_ANY)     # could let do_syscall_trace() call
+       ENABLE_INTERRUPTS(CLBR_ANY)     # could let syscall_trace_leave() call
                                        # schedule() instead
        movl %esp, %eax
-       movl $1, %edx
-       call do_syscall_trace
+       call syscall_trace_leave
        jmp resume_userspace
 END(syscall_exit_work)
        CFI_ENDPROC
@@ -1024,6 +1015,7 @@ ENDPROC(kernel_thread_helper)
 ENTRY(xen_sysenter_target)
        RING0_INT_FRAME
        addl $5*4, %esp         /* remove xen-provided frame */
+       CFI_ADJUST_CFA_OFFSET -5*4
        jmp sysenter_past_esp
        CFI_ENDPROC
 
index ae63e58..8410e26 100644 (file)
@@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs)
        movq  %rcx,RIP-ARGOFFSET(%rsp)
        CFI_REL_OFFSET rip,RIP-ARGOFFSET
        GET_THREAD_INFO(%rcx)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-               TI_flags(%rcx)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
        jnz tracesys
        cmpq $__NR_syscall_max,%rax
        ja badsys
@@ -430,7 +429,12 @@ tracesys:
        FIXUP_TOP_OF_STACK %rdi
        movq %rsp,%rdi
        call syscall_trace_enter
-       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+       /*
+        * Reload arg registers from stack in case ptrace changed them.
+        * We don't reload %rax because syscall_trace_enter() returned
+        * the value it wants us to use in the table lookup.
+        */
+       LOAD_ARGS ARGOFFSET, 1
        RESTORE_REST
        cmpq $__NR_syscall_max,%rax
        ja   int_ret_from_sys_call      /* RAX(%rsp) set to -ENOSYS above */
@@ -483,7 +487,7 @@ int_very_careful:
        ENABLE_INTERRUPTS(CLBR_NONE)
        SAVE_REST
        /* Check for syscall exit trace */      
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+       testl $_TIF_WORK_SYSCALL_EXIT,%edx
        jz int_signal
        pushq %rdi
        CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +495,7 @@ int_very_careful:
        call syscall_trace_leave
        popq %rdi
        CFI_ADJUST_CFA_OFFSET -8
-       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+       andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
        jmp int_restore_rest
        
 int_signal:
@@ -1189,6 +1193,7 @@ END(device_not_available)
        /* runs on exception stack */
 KPROBE_ENTRY(debug)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $0
        CFI_ADJUST_CFA_OFFSET 8         
        paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1203,7 @@ KPROBE_END(debug)
        /* runs on exception stack */   
 KPROBE_ENTRY(nmi)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $-1
        CFI_ADJUST_CFA_OFFSET 8
        paranoidentry do_nmi, 0, 0
@@ -1211,6 +1217,7 @@ KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $0
        CFI_ADJUST_CFA_OFFSET 8
        paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun)
        /* runs on exception stack */
 ENTRY(double_fault)
        XCPT_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        paranoidentry do_double_fault
        jmp paranoid_exit1
        CFI_ENDPROC
@@ -1253,6 +1261,7 @@ END(segment_not_present)
        /* runs on exception stack */
 ENTRY(stack_segment)
        XCPT_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        paranoidentry do_stack_segment
        jmp paranoid_exit1
        CFI_ENDPROC
@@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug)
        /* runs on exception stack */
 ENTRY(machine_check)
        INTR_FRAME
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
        pushq $0
        CFI_ADJUST_CFA_OFFSET 8 
        paranoidentry do_machine_check
@@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret)
        sysret
        CFI_ENDPROC
 ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+       zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+       CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+   see the correct pointer to the pt_regs */
+       movq %rdi, %rsp            # we don't return, adjust the stack frame
+       CFI_ENDPROC
+       CFI_DEFAULT_STACK
+11:    incl %gs:pda_irqcount
+       movq %rsp,%rbp
+       CFI_DEF_CFA_REGISTER rbp
+       cmovzq %gs:pda_irqstackptr,%rsp
+       pushq %rbp                      # backlink for old unwinder
+       call xen_evtchn_do_upcall
+       popq %rsp
+       CFI_DEF_CFA_REGISTER rsp
+       decl %gs:pda_irqcount
+       jmp  error_exit
+       CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+       framesz = (RIP-0x30)    /* workaround buggy gas */
+       _frame framesz
+       CFI_REL_OFFSET rcx, 0
+       CFI_REL_OFFSET r11, 8
+       movw %ds,%cx
+       cmpw %cx,0x10(%rsp)
+       CFI_REMEMBER_STATE
+       jne 1f
+       movw %es,%cx
+       cmpw %cx,0x18(%rsp)
+       jne 1f
+       movw %fs,%cx
+       cmpw %cx,0x20(%rsp)
+       jne 1f
+       movw %gs,%cx
+       cmpw %cx,0x28(%rsp)
+       jne 1f
+       /* All segments match their saved values => Category 2 (Bad IRET). */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %r11
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %rcx
+       CFI_ADJUST_CFA_OFFSET 8
+       jmp general_protection
+       CFI_RESTORE_STATE
+1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8
+       SAVE_ALL
+       jmp error_exit
+       CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
index 711f11c..3c39293 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/pgtable.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
 
 DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
 short uv_possible_blades;
 EXPORT_SYMBOL_GPL(uv_possible_blades);
 
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
 static cpumask_t uv_target_cpus(void)
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
                map_high("MMIOH", mmioh.s.base, shift, map_uc);
 }
 
+static __init void uv_rtc_init(void)
+{
+       long status, ticks_per_sec, drift;
+
+       status =
+           x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
+                                       &drift);
+       if (status != 0 || ticks_per_sec < 100000) {
+               printk(KERN_WARNING
+                       "unable to determine platform RTC clock frequency, "
+                       "guessing.\n");
+               /* BIOS gives wrong value for clock freq. so guess */
+               sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+       } else
+               sn_rtc_cycles_per_second = ticks_per_sec;
+}
+
 static __init void uv_system_init(void)
 {
        union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
        gnode_upper = (((unsigned long)node_id.s.node_id) &
                       ~((1 << n_val) - 1)) << m_val;
 
+       uv_rtc_init();
+
        for_each_present_cpu(cpu) {
                nid = cpu_to_node(cpu);
                pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
index c978198..1b318e9 100644 (file)
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
 static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
 #endif
 
+void __init x86_64_init_pda(void)
+{
+       _cpu_pda = __cpu_pda;
+       cpu_pda(0) = &_boot_cpu_pda;
+       pda_init(0);
+}
+
 static void __init zap_identity_mappings(void)
 {
        pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
        early_printk("Kernel alive\n");
 
-       _cpu_pda = __cpu_pda;
-       cpu_pda(0) = &_boot_cpu_pda;
-       pda_init(0);
+       x86_64_init_pda();
 
        early_printk("Kernel really alive\n");
 
index b07ac7b..db3280a 100644 (file)
@@ -407,6 +407,7 @@ ENTRY(phys_base)
        /* This must match the first entry in level2_kernel_pgt */
        .quad   0x0000000000000000
 
+#include "../../x86/xen/xen-head.S"
        
        .section .bss, "aw", @nobits
        .align L1_CACHE_BYTES
index 558abf4..de9aa0e 100644 (file)
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
        /*
         * Send the IPI. The write to APIC_ICR fires this off.
         */
-       apic_write_around(APIC_ICR, cfg);
+       apic_write(APIC_ICR, cfg);
 }
 #endif /* !CONFIG_SMP */
 
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
        unsigned long v;
 
        v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
 static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
        unsigned long v;
 
        v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
 static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
         * The AEOI mode will finish them in the 8259A
         * automatically.
         */
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        init_8259A(1);
        timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
        pin2  = ioapic_i8259.pin;
        apic2 = ioapic_i8259.apic;
 
-       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-               vector, apic1, pin1, apic2, pin2);
+       apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+                   "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+                   vector, apic1, pin1, apic2, pin2);
 
        /*
         * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
                }
                clear_IO_APIC_pin(apic1, pin1);
                if (!no_pin1)
-                       printk(KERN_ERR "..MP-BIOS bug: "
-                              "8254 timer not connected to IO-APIC\n");
+                       apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+                                   "8254 timer not connected to IO-APIC\n");
 
-               printk(KERN_INFO "...trying to set up timer (IRQ0) "
-                      "through the 8259A ... ");
-               printk("\n..... (found pin %d) ...", pin2);
+               apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+                           "(IRQ0) through the 8259A ...\n");
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "..... (found apic %d pin %d) ...\n", apic2, pin2);
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
                unmask_IO_APIC_irq(0);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
-                       printk("works.\n");
+                       apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                        timer_through_8259 = 1;
                        if (nmi_watchdog == NMI_IO_APIC) {
                                disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
                 */
                disable_8259A_irq(0);
                clear_IO_APIC_pin(apic2, pin2);
-               printk(" failed.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
        }
 
        if (nmi_watchdog == NMI_IO_APIC) {
-               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+                           "through the IO-APIC - disabling NMI Watchdog!\n");
                nmi_watchdog = NMI_NONE;
        }
        timer_ack = 0;
 
-       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as Virtual Wire IRQ...\n");
 
        lapic_register_intr(0, vector);
-       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
+       apic_write(APIC_LVT0, APIC_DM_FIXED | vector);  /* Fixed mode */
        enable_8259A_irq(0);
 
        if (timer_irq_works()) {
-               printk(" works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                goto out;
        }
        disable_8259A_irq(0);
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
-       printk(" failed.\n");
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
-       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as ExtINT IRQ...\n");
 
        init_8259A(0);
        make_8259A_irq(0);
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
 
        unlock_ExtINT_logic();
 
        if (timer_irq_works()) {
-               printk(" works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                goto out;
        }
-       printk(" failed :(.\n");
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
        panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
-               "report.  Then try booting with the 'noapic' option");
+               "report.  Then try booting with the 'noapic' option.\n");
 out:
        local_irq_restore(flags);
 }
index 6510cde..64a46af 100644 (file)
@@ -45,6 +45,7 @@
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/dma.h>
+#include <asm/i8259.h>
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
@@ -1696,8 +1697,9 @@ static inline void __init check_timer(void)
        pin2  = ioapic_i8259.pin;
        apic2 = ioapic_i8259.apic;
 
-       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-               cfg->vector, apic1, pin1, apic2, pin2);
+       apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+                   "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+                   cfg->vector, apic1, pin1, apic2, pin2);
 
        /*
         * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1737,13 @@ static inline void __init check_timer(void)
                }
                clear_IO_APIC_pin(apic1, pin1);
                if (!no_pin1)
-                       apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+                       apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
                                    "8254 timer not connected to IO-APIC\n");
 
-               apic_printk(APIC_VERBOSE,KERN_INFO
-                       "...trying to set up timer (IRQ0) "
-                       "through the 8259A ... ");
-               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-                       apic2, pin2);
+               apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+                           "(IRQ0) through the 8259A ...\n");
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "..... (found apic %d pin %d) ...\n", apic2, pin2);
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
@@ -1751,7 +1752,7 @@ static inline void __init check_timer(void)
                unmask_IO_APIC_irq(0);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
-                       apic_printk(APIC_VERBOSE," works.\n");
+                       apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                        timer_through_8259 = 1;
                        if (nmi_watchdog == NMI_IO_APIC) {
                                disable_8259A_irq(0);
@@ -1765,29 +1766,32 @@ static inline void __init check_timer(void)
                 */
                disable_8259A_irq(0);
                clear_IO_APIC_pin(apic2, pin2);
-               apic_printk(APIC_VERBOSE," failed.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
        }
 
        if (nmi_watchdog == NMI_IO_APIC) {
-               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+                           "through the IO-APIC - disabling NMI Watchdog!\n");
                nmi_watchdog = NMI_NONE;
        }
 
-       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as Virtual Wire IRQ...\n");
 
        lapic_register_intr(0);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
 
        if (timer_irq_works()) {
-               apic_printk(APIC_VERBOSE," works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                goto out;
        }
        disable_8259A_irq(0);
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
-       apic_printk(APIC_VERBOSE," failed.\n");
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
-       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as ExtINT IRQ...\n");
 
        init_8259A(0);
        make_8259A_irq(0);
@@ -1796,11 +1800,12 @@ static inline void __init check_timer(void)
        unlock_ExtINT_logic();
 
        if (timer_irq_works()) {
-               apic_printk(APIC_VERBOSE," works.\n");
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                goto out;
        }
-       apic_printk(APIC_VERBOSE," failed :(.\n");
-       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+               "report.  Then try booting with the 'noapic' option.\n");
 out:
        local_irq_restore(flags);
 }
index 5921e5f..1c3a66a 100644 (file)
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
 
 static int __init io_delay_param(char *s)
 {
+       if (!s)
+               return -EINVAL;
+
        if (!strcmp(s, "0x80"))
                io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
        else if (!strcmp(s, "0xed"))
index 9d98cda..3f7537b 100644 (file)
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
        /*
         * Send the IPI. The write to APIC_ICR fires this off.
         */
-       apic_write_around(APIC_ICR, cfg);
+       apic_write(APIC_ICR, cfg);
 }
 
 void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
         * prepare target chip field
         */
        cfg = __prepare_ICR2(mask);
-       apic_write_around(APIC_ICR2, cfg);
+       apic_write(APIC_ICR2, cfg);
 
        /*
         * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
        /*
         * Send the IPI. The write to APIC_ICR fires this off.
         */
-       apic_write_around(APIC_ICR, cfg);
+       apic_write(APIC_ICR, cfg);
 }
 
 /*
index 47a6f6f..1cf8c1f 100644 (file)
@@ -83,11 +83,8 @@ union irq_ctx {
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-               __attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
 static void call_on_stack(void *func, void *stack)
 {
index c032059..f2d43bc 100644 (file)
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/setup.h>
 
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
 struct setup_data_node {
        u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
 {
        int error = 0;
 
+       arch_debugfs_dir = debugfs_create_dir("x86", NULL);
+       if (!arch_debugfs_dir)
+               return -ENOMEM;
+
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
        error = boot_params_kdebugfs_init();
 #endif
index b8c6743..43c019f 100644 (file)
@@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
 
        resume_execution(cur, regs, kcb);
        regs->flags |= kcb->kprobe_saved_flags;
-       trace_hardirqs_fixup_flags(regs->flags);
 
        if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
                kcb->kprobe_status = KPROBE_HIT_SSDONE;
index a888e67..0e86767 100644 (file)
@@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr,
                     const Elf_Shdr *sechdrs,
                     struct module *me)
 {
-       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+               *para = NULL;
        char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
        for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr,
                        alt = s;
                if (!strcmp(".smp_locks", secstrings + s->sh_name))
                        locks= s;
+               if (!strcmp(".parainstructions", secstrings + s->sh_name))
+                       para = s;
        }
 
        if (alt) {
@@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr,
                                            tseg, tseg + text->sh_size);
        }
 
+       if (para) {
+               void *pseg = (void *)para->sh_addr;
+               apply_paravirt(pseg, pseg + para->sh_size);
+       }
+
        return module_bug_finalize(hdr, sechdrs, me);
 }
 
index 3b25e49..6ae005c 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
 #include <asm/trampoline.h>
+#include <asm/setup.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
        return sum & 0xFF;
 }
 
-#ifdef CONFIG_X86_NUMAQ
-int found_numaq;
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_config_translation {
-       unsigned char mpc_type;
-       unsigned char trans_len;
-       unsigned char trans_type;
-       unsigned char trans_quad;
-       unsigned char trans_global;
-       unsigned char trans_local;
-       unsigned short trans_reserved;
-};
-
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
-    __cpuinitdata;
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
-       return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-
-static inline int mpc_apic_id(struct mpc_config_processor *m,
-                       struct mpc_config_translation *translation_record)
-{
-       int quad = translation_record->trans_quad;
-       int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
-
-       printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
-              m->mpc_apicid,
-              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-              m->mpc_apicver, quad, logical_apicid);
-       return logical_apicid;
-}
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-
-static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-       struct mpc_config_translation *translation)
-{
-       int quad = translation->trans_quad;
-       int local = translation->trans_local;
-
-       mp_bus_id_to_node[m->mpc_busid] = quad;
-       mp_bus_id_to_local[m->mpc_busid] = local;
-       printk(KERN_INFO "Bus #%d is %s (node %d)\n",
-              m->mpc_busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-static void mpc_oem_pci_bus(struct mpc_config_bus *m,
-       struct mpc_config_translation *translation)
-{
-       int quad = translation->trans_quad;
-       int local = translation->trans_local;
-
-       quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
-}
-
-#endif
-
 static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 {
        int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
                disabled_cpus++;
                return;
        }
-#ifdef CONFIG_X86_NUMAQ
-       if (found_numaq)
-               apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+       if (x86_quirks->mpc_apic_id)
+               apicid = x86_quirks->mpc_apic_id(m);
        else
                apicid = m->mpc_apicid;
-#else
-       apicid = m->mpc_apicid;
-#endif
+
        if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
                bootup_cpu = " (Bootup-CPU)";
                boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
        memcpy(str, m->mpc_bustype, 6);
        str[6] = 0;
 
-#ifdef CONFIG_X86_NUMAQ
-       if (found_numaq)
-               mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-#else
-       printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
-#endif
+       if (x86_quirks->mpc_oem_bus_info)
+               x86_quirks->mpc_oem_bus_info(m, str);
+       else
+               printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
 
 #if MAX_MP_BUSSES < 256
        if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
                mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
 #endif
        } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-#ifdef CONFIG_X86_NUMAQ
-               if (found_numaq)
-                       mpc_oem_pci_bus(m, translation_table[mpc_record]);
-#endif
+               if (x86_quirks->mpc_oem_pci_bus)
+                       x86_quirks->mpc_oem_pci_bus(m);
+
                clear_bit(m->mpc_busid, mp_bus_not_pci);
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
                mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
                m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
 }
 
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info(struct mpc_config_translation *m)
-{
-       printk(KERN_INFO
-              "Translation: record %d, type %d, quad %d, global %d, local %d\n",
-              mpc_record, m->trans_type, m->trans_quad, m->trans_global,
-              m->trans_local);
-
-       if (mpc_record >= MAX_MPC_ENTRY)
-               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-       else
-               translation_table[mpc_record] = m;      /* stash this for later */
-       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-               node_set_online(m->trans_quad);
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
-                                   unsigned short oemsize)
-{
-       int count = sizeof(*oemtable);  /* the header size */
-       unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
-       mpc_record = 0;
-       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
-              oemtable);
-       if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
-               printk(KERN_WARNING
-                      "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-                      oemtable->oem_signature[0], oemtable->oem_signature[1],
-                      oemtable->oem_signature[2], oemtable->oem_signature[3]);
-               return;
-       }
-       if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
-               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-               return;
-       }
-       while (count < oemtable->oem_length) {
-               switch (*oemptr) {
-               case MP_TRANSLATION:
-                       {
-                               struct mpc_config_translation *m =
-                                   (struct mpc_config_translation *)oemptr;
-                               MP_translation_info(m);
-                               oemptr += sizeof(*m);
-                               count += sizeof(*m);
-                               ++mpc_record;
-                               break;
-                       }
-               default:
-                       {
-                               printk(KERN_WARNING
-                                      "Unrecognised OEM table entry type! - %d\n",
-                                      (int)*oemptr);
-                               return;
-                       }
-               }
-       }
-}
-
-void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
-                                char *productid)
-{
-       if (strncmp(oem, "IBM NUMA", 8))
-               printk("Warning!  Not a NUMA-Q system!\n");
-       else
-               found_numaq = 1;
-
-       if (mpc->mpc_oemptr)
-               smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
-                                mpc->mpc_oemsize);
-}
-#endif /* CONFIG_X86_NUMAQ */
-
 /*
  * Read/parse the MPC
  */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
        } else
                mps_oem_check(mpc, oem, str);
 #endif
-
        /* save the local APIC address, it might be non-default */
        if (!acpi_lapic)
                mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
        if (early)
                return 1;
 
+       if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
+               struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
+               x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+       }
+
        /*
         *      Now process the configuration blocks.
         */
-#ifdef CONFIG_X86_NUMAQ
-       mpc_record = 0;
-#endif
+       if (x86_quirks->mpc_record)
+               *x86_quirks->mpc_record = 0;
+
        while (count < mpc->mpc_length) {
                switch (*mpt) {
                case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
                        count = mpc->mpc_length;
                        break;
                }
-#ifdef CONFIG_X86_NUMAQ
-               ++mpc_record;
-#endif
+               if (x86_quirks->mpc_record)
+                       (*x86_quirks->mpc_record)++;
        }
 
 #ifdef CONFIG_X86_GENERICARCH
@@ -725,12 +577,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct intel_mp_floating *mpf_found;
 
-/*
- * Machine specific quirk for finding the SMP config before other setup
- * activities destroy the table:
- */
-int (*mach_get_smp_config_quirk)(unsigned int early);
-
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
@@ -738,8 +584,8 @@ static void __init __get_smp_config(unsigned int early)
 {
        struct intel_mp_floating *mpf = mpf_found;
 
-       if (mach_get_smp_config_quirk) {
-               if (mach_get_smp_config_quirk(early))
+       if (x86_quirks->mach_get_smp_config) {
+               if (x86_quirks->mach_get_smp_config(early))
                        return;
        }
        if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
        return 0;
 }
 
-int (*mach_find_smp_config_quirk)(unsigned int reserve);
-
 static void __init __find_smp_config(unsigned int reserve)
 {
        unsigned int address;
 
-       if (mach_find_smp_config_quirk) {
-               if (mach_find_smp_config_quirk(reserve))
+       if (x86_quirks->mach_find_smp_config) {
+               if (x86_quirks->mach_find_smp_config(reserve))
                        return;
        }
        /*
index ec024b3..ac6d512 100644 (file)
@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
 
 static void __acpi_nmi_enable(void *__unused)
 {
-       apic_write_around(APIC_LVT0, APIC_DM_NMI);
+       apic_write(APIC_LVT0, APIC_DM_NMI);
 }
 
 /*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
 
 static void __acpi_nmi_disable(void *__unused)
 {
-       apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
 }
 
 /*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 
 #ifdef CONFIG_SYSCTL
 
+static int __init setup_unknown_nmi_panic(char *str)
+{
+       unknown_nmi_panic = 1;
+       return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 {
        unsigned char reason = get_nmi_reason();
index a23e823..b8c4561 100644 (file)
@@ -33,6 +33,7 @@
 #include <asm/processor.h>
 #include <asm/mpspec.h>
 #include <asm/e820.h>
+#include <asm/setup.h>
 
 #define        MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
        }
 }
 
+
+void __init numaq_tsc_disable(void)
+{
+       if (!found_numaq)
+               return;
+
+       if (num_online_nodes() > 1) {
+               printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+               setup_clear_cpu_cap(X86_FEATURE_TSC);
+       }
+}
+
+static int __init numaq_pre_time_init(void)
+{
+       numaq_tsc_disable();
+       return 0;
+}
+
+int found_numaq;
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+struct mpc_config_translation {
+       unsigned char mpc_type;
+       unsigned char trans_len;
+       unsigned char trans_type;
+       unsigned char trans_quad;
+       unsigned char trans_global;
+       unsigned char trans_local;
+       unsigned short trans_reserved;
+};
+
+/* x86_quirks member */
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+    __cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+       return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+/* x86_quirks member */
+static int mpc_apic_id(struct mpc_config_processor *m)
+{
+       int quad = translation_table[mpc_record]->trans_quad;
+       int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+       printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver, quad, logical_apicid);
+       return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+/* x86_quirks member */
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
+{
+       int quad = translation_table[mpc_record]->trans_quad;
+       int local = translation_table[mpc_record]->trans_local;
+
+       mp_bus_id_to_node[m->mpc_busid] = quad;
+       mp_bus_id_to_local[m->mpc_busid] = local;
+       printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+              m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+
+/* x86_quirks member */
+static void mpc_oem_pci_bus(struct mpc_config_bus *m)
+{
+       int quad = translation_table[mpc_record]->trans_quad;
+       int local = translation_table[mpc_record]->trans_local;
+
+       quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
+static void __init MP_translation_info(struct mpc_config_translation *m)
+{
+       printk(KERN_INFO
+              "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+              mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+              m->trans_local);
+
+       if (mpc_record >= MAX_MPC_ENTRY)
+               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+       else
+               translation_table[mpc_record] = m;      /* stash this for later */
+       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+               node_set_online(m->trans_quad);
+}
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+       int sum = 0;
+
+       while (len--)
+               sum += *mp++;
+
+       return sum & 0xFF;
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+                                   unsigned short oemsize)
+{
+       int count = sizeof(*oemtable);  /* the header size */
+       unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+       mpc_record = 0;
+       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+              oemtable);
+       if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+               printk(KERN_WARNING
+                      "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+                      oemtable->oem_signature[0], oemtable->oem_signature[1],
+                      oemtable->oem_signature[2], oemtable->oem_signature[3]);
+               return;
+       }
+       if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+               return;
+       }
+       while (count < oemtable->oem_length) {
+               switch (*oemptr) {
+               case MP_TRANSLATION:
+                       {
+                               struct mpc_config_translation *m =
+                                   (struct mpc_config_translation *)oemptr;
+                               MP_translation_info(m);
+                               oemptr += sizeof(*m);
+                               count += sizeof(*m);
+                               ++mpc_record;
+                               break;
+                       }
+               default:
+                       {
+                               printk(KERN_WARNING
+                                      "Unrecognised OEM table entry type! - %d\n",
+                                      (int)*oemptr);
+                               return;
+                       }
+               }
+       }
+}
+
+static struct x86_quirks numaq_x86_quirks __initdata = {
+       .arch_pre_time_init     = numaq_pre_time_init,
+       .arch_time_init         = NULL,
+       .arch_pre_intr_init     = NULL,
+       .arch_memory_setup      = NULL,
+       .arch_intr_init         = NULL,
+       .arch_trap_init         = NULL,
+       .mach_get_smp_config    = NULL,
+       .mach_find_smp_config   = NULL,
+       .mpc_record             = &mpc_record,
+       .mpc_apic_id            = mpc_apic_id,
+       .mpc_oem_bus_info       = mpc_oem_bus_info,
+       .mpc_oem_pci_bus        = mpc_oem_pci_bus,
+       .smp_read_mpc_oem       = smp_read_mpc_oem,
+};
+
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+                                char *productid)
+{
+       if (strncmp(oem, "IBM NUMA", 8))
+               printk("Warning!  Not a NUMA-Q system!\n");
+       else
+               found_numaq = 1;
+}
+
 static __init void early_check_numaq(void)
 {
        /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
         */
        if (smp_found_config)
                early_get_smp_config();
+
+       if (found_numaq)
+               x86_quirks = &numaq_x86_quirks;
 }
 
 int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
        smp_dump_qct();
        return 1;
 }
-
-void __init numaq_tsc_disable(void)
-{
-       if (!found_numaq)
-               return;
-
-       if (num_online_nodes() > 1) {
-               printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
-               setup_clear_cpu_cap(X86_FEATURE_TSC);
-       }
-}
index e0f571d..097d8a6 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
 #include <asm/time.h>
 #include <asm/pgalloc.h>
 #include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
                .pv_irq_ops = pv_irq_ops,
                .pv_apic_ops = pv_apic_ops,
                .pv_mmu_ops = pv_mmu_ops,
+               .pv_lock_ops = pv_lock_ops,
        };
        return *((void **)&tmpl + type);
 }
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
        return __get_cpu_var(paravirt_lazy_mode);
 }
 
+void __init paravirt_use_bytelocks(void)
+{
+#ifdef CONFIG_SMP
+       pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+       pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+       pv_lock_ops.spin_lock = __byte_spin_lock;
+       pv_lock_ops.spin_trylock = __byte_spin_trylock;
+       pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
+}
+
 struct pv_info pv_info = {
        .name = "bare hardware",
        .paravirt_enabled = 0,
@@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
 struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
        .apic_write = native_apic_write,
-       .apic_write_atomic = native_apic_write_atomic,
        .apic_read = native_apic_read,
        .setup_boot_clock = setup_boot_APIC_clock,
        .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
        .pagetable_setup_start = native_pagetable_setup_start,
        .pagetable_setup_done = native_pagetable_setup_done,
+#else
+       .pagetable_setup_start = paravirt_nop,
+       .pagetable_setup_done = paravirt_nop,
 #endif
 
        .read_cr2 = native_read_cr2,
@@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
        .set_fixmap = native_set_fixmap,
 };
 
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+       .spin_is_locked = __ticket_spin_is_locked,
+       .spin_is_contended = __ticket_spin_is_contended,
+
+       .spin_lock = __ticket_spin_lock,
+       .spin_trylock = __ticket_spin_trylock,
+       .spin_unlock = __ticket_spin_unlock,
+#endif
+};
+EXPORT_SYMBOL_GPL(pv_lock_ops);
+
 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL    (pv_cpu_ops);
 EXPORT_SYMBOL    (pv_mmu_ops);
index 6959b5c..151f2d1 100644 (file)
@@ -36,7 +36,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/tce.h>
 #include <asm/pci-direct.h>
index 8467ec2..a4213c0 100644 (file)
@@ -5,12 +5,11 @@
 
 #include <asm/proto.h>
 #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
 
-int forbid_dac __read_mostly;
-EXPORT_SYMBOL(forbid_dac);
+static int forbid_dac __read_mostly;
 
 const struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
         * The order of these functions is important for
         * fall-back/fail-over reasons
         */
-#ifdef CONFIG_GART_IOMMU
        gart_iommu_hole_init();
-#endif
 
-#ifdef CONFIG_CALGARY_IOMMU
        detect_calgary();
-#endif
 
        detect_intel_iommu();
 
        amd_iommu_detect();
 
-#ifdef CONFIG_SWIOTLB
        pci_swiotlb_init();
-#endif
 }
 #endif
 
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
                        swiotlb = 1;
 #endif
 
-#ifdef CONFIG_GART_IOMMU
                gart_parse_options(p);
-#endif
 
 #ifdef CONFIG_CALGARY_IOMMU
                if (!strncmp(p, "calgary", 7))
@@ -500,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent);
 
 static int __init pci_iommu_init(void)
 {
-#ifdef CONFIG_CALGARY_IOMMU
        calgary_iommu_init();
-#endif
 
        intel_iommu_init();
 
        amd_iommu_init();
 
-#ifdef CONFIG_GART_IOMMU
        gart_iommu_init();
-#endif
 
        no_iommu_init();
        return 0;
index c3fe784..be60961 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/mtrr.h>
 #include <asm/pgtable.h>
 #include <asm/proto.h>
+#include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/cacheflush.h>
 #include <asm/swiotlb.h>
index aec43d5..792b917 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
 
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/processor.h>
 #include <asm/dma.h>
 
index 82299cd..20df839 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/module.h>
 #include <linux/dma-mapping.h>
 
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 
index 4d629c6..7fc4d5b 100644 (file)
@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
+static int force_mwait __cpuinitdata;
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
@@ -199,6 +200,7 @@ static void poll_idle(void)
  *
  * idle=mwait overrides this decision and forces the usage of mwait.
  */
+static int __cpuinitdata force_mwait;
 
 #define MWAIT_INFO                     0x05
 #define MWAIT_ECX_EXTENDED_INFO                0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 
 static int __init idle_setup(char *str)
 {
+       if (!str)
+               return -EINVAL;
+
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
index a8e5362..e8a8e1b 100644 (file)
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
-       struct thread_struct *prev = &prev_p->thread,
-                                *next = &next_p->thread;
+       struct thread_struct *prev = &prev_p->thread;
+       struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
        /* 
         * Switch FS and GS.
+        *
+        * Segment register != 0 always requires a reload.  Also
+        * reload when it has changed.  When prev process used 64bit
+        * base always reload to avoid an information leak.
         */
-       { 
-               /* segment register != 0 always requires a reload. 
-                  also reload when it has changed. 
-                  when prev process used 64bit base always reload
-                  to avoid an information leak. */
-               if (unlikely(fsindex | next->fsindex | prev->fs)) {
-                       loadsegment(fs, next->fsindex);
-                       /* check if the user used a selector != 0
-                        * if yes clear 64bit base, since overloaded base
-                         * is always mapped to the Null selector
-                         */
-                       if (fsindex)
+       if (unlikely(fsindex | next->fsindex | prev->fs)) {
+               loadsegment(fs, next->fsindex);
+               /* 
+                * Check if the user used a selector != 0; if yes
+                *  clear 64bit base, since overloaded base is always
+                *  mapped to the Null selector
+                */
+               if (fsindex)
                        prev->fs = 0;                           
-               }
-               /* when next process has a 64bit base use it */
-               if (next->fs) 
-                       wrmsrl(MSR_FS_BASE, next->fs); 
-               prev->fsindex = fsindex;
-
-               if (unlikely(gsindex | next->gsindex | prev->gs)) {
-                       load_gs_index(next->gsindex);
-                       if (gsindex)
+       }
+       /* when next process has a 64bit base use it */
+       if (next->fs)
+               wrmsrl(MSR_FS_BASE, next->fs);
+       prev->fsindex = fsindex;
+
+       if (unlikely(gsindex | next->gsindex | prev->gs)) {
+               load_gs_index(next->gsindex);
+               if (gsindex)
                        prev->gs = 0;                           
-               }
-               if (next->gs)
-                       wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
-               prev->gsindex = gsindex;
        }
+       if (next->gs)
+               wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+       prev->gsindex = gsindex;
 
        /* Must be after DS reload */
        unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        write_pda(pcurrent, next_p); 
 
        write_pda(kernelstack,
-       (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+                 (unsigned long)task_stack_page(next_p) +
+                 THREAD_SIZE - PDA_STACKOFFSET);
 #ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
index 77040b6..e37dccc 100644 (file)
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 #endif
 }
 
-#ifdef CONFIG_X86_32
-
 void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 {
        struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
        force_sig_info(SIGTRAP, &info, tsk);
 }
 
-/* notification of system call entry/exit
- * - triggered by current->work.syscall_trace
- */
-int do_syscall_trace(struct pt_regs *regs, int entryexit)
-{
-       int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-       /*
-        * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
-        * interception
-        */
-       int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
-       int ret = 0;
-
-       /* do the secure computing check first */
-       if (!entryexit)
-               secure_computing(regs->orig_ax);
-
-       if (unlikely(current->audit_context)) {
-               if (entryexit)
-                       audit_syscall_exit(AUDITSC_RESULT(regs->ax),
-                                               regs->ax);
-               /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-                * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-                * not used, entry.S will call us only on syscall exit, not
-                * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
-                * calling send_sigtrap() on syscall entry.
-                *
-                * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
-                * is_singlestep is false, despite his name, so we will still do
-                * the correct thing.
-                */
-               else if (is_singlestep)
-                       goto out;
-       }
-
-       if (!(current->ptrace & PT_PTRACED))
-               goto out;
-
-       /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
-        * and then is resumed with SYSEMU_SINGLESTEP, it will come in
-        * here. We have to check this and return */
-       if (is_sysemu && entryexit)
-               return 0;
-
-       /* Fake a debug trap */
-       if (is_singlestep)
-               send_sigtrap(current, regs, 0);
-
-       if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
-               goto out;
-
-       /* the 0x80 provides a way for the tracing parent to distinguish
-          between a syscall stop and SIGTRAP delivery */
-       /* Note that the debugger could change the result of test_thread_flag!*/
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
-
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-       ret = is_sysemu;
-out:
-       if (unlikely(current->audit_context) && !entryexit)
-               audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
-                                   regs->bx, regs->cx, regs->dx, regs->si);
-       if (ret == 0)
-               return 0;
-
-       regs->orig_ax = -1; /* force skip of syscall restarting */
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
-       return 1;
-}
-
-#else  /* CONFIG_X86_64 */
-
 static void syscall_trace(struct pt_regs *regs)
 {
+       if (!(current->ptrace & PT_PTRACED))
+               return;
 
 #if 0
        printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
        }
 }
 
-asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+#ifdef CONFIG_X86_32
+# define IS_IA32       1
+#elif defined CONFIG_IA32_EMULATION
+# define IS_IA32       test_thread_flag(TIF_IA32)
+#else
+# define IS_IA32       0
+#endif
+
+/*
+ * We must return the syscall number to actually look up in the table.
+ * This can be -1L to skip running any syscall at all.
+ */
+asmregparm long syscall_trace_enter(struct pt_regs *regs)
 {
+       long ret = 0;
+
+       /*
+        * If we stepped into a sysenter/syscall insn, it trapped in
+        * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+        * If user-mode had set TF itself, then it's still clear from
+        * do_debug() and we need to set it again to restore the user
+        * state.  If we entered on the slow path, TF was already set.
+        */
+       if (test_thread_flag(TIF_SINGLESTEP))
+               regs->flags |= X86_EFLAGS_TF;
+
        /* do the secure computing check first */
        secure_computing(regs->orig_ax);
 
-       if (test_thread_flag(TIF_SYSCALL_TRACE)
-           && (current->ptrace & PT_PTRACED))
+       if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+               ret = -1L;
+
+       if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
                syscall_trace(regs);
 
        if (unlikely(current->audit_context)) {
-               if (test_thread_flag(TIF_IA32)) {
+               if (IS_IA32)
                        audit_syscall_entry(AUDIT_ARCH_I386,
                                            regs->orig_ax,
                                            regs->bx, regs->cx,
                                            regs->dx, regs->si);
-               } else {
+#ifdef CONFIG_X86_64
+               else
                        audit_syscall_entry(AUDIT_ARCH_X86_64,
                                            regs->orig_ax,
                                            regs->di, regs->si,
                                            regs->dx, regs->r10);
-               }
+#endif
        }
+
+       return ret ?: regs->orig_ax;
 }
 
-asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+asmregparm void syscall_trace_leave(struct pt_regs *regs)
 {
        if (unlikely(current->audit_context))
                audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 
-       if ((test_thread_flag(TIF_SYSCALL_TRACE)
-            || test_thread_flag(TIF_SINGLESTEP))
-           && (current->ptrace & PT_PTRACED))
+       if (test_thread_flag(TIF_SYSCALL_TRACE))
                syscall_trace(regs);
-}
 
-#endif /* CONFIG_X86_32 */
+       /*
+        * If TIF_SYSCALL_EMU is set, we only get here because of
+        * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+        * We already reported this syscall instruction in
+        * syscall_trace_enter(), so don't do any more now.
+        */
+       if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+               return;
+
+       /*
+        * If we are single-stepping, synthesize a trap to follow the
+        * system call instruction.
+        */
+       if (test_thread_flag(TIF_SINGLESTEP) &&
+           (current->ptrace & PT_PTRACED))
+               send_sigtrap(current, regs, 0);
+}
index f8a6216..9dcf39c 100644 (file)
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                        DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
                },
        },
+       {       /* Handle problems with rebooting on Dell T5400's */
+               .callback = set_bios_reboot,
+               .ident = "Dell Precision T5400",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+               },
+       },
        {       /* Handle problems with rebooting on HP laptops */
                .callback = set_bios_reboot,
                .ident = "HP Compaq Laptop",
index 531b55b..ec952aa 100644 (file)
 #include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/delay.h>
-#include <linux/highmem.h>
 
 #include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/kexec.h>
 #include <linux/cpufreq.h>
 #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
@@ -96,7 +92,7 @@
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 
 #include <asm/paravirt.h>
 
 #include <asm/percpu.h>
-#include <asm/sections.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
 #ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
+static struct x86_quirks default_x86_quirks __initdata;
+
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -824,7 +823,10 @@ void __init setup_arch(char **cmdline_p)
        vmi_init();
 #endif
 
+       paravirt_pagetable_setup_start(swapper_pg_dir);
        paging_init();
+       paravirt_pagetable_setup_done(swapper_pg_dir);
+       paravirt_post_allocator_init();
 
 #ifdef CONFIG_X86_64
        map_vsyscall();
@@ -854,14 +856,6 @@ void __init setup_arch(char **cmdline_p)
        init_cpu_to_node();
 #endif
 
-#ifdef CONFIG_X86_NUMAQ
-       /*
-        * need to check online nodes num, call it
-        * here before time_init/tsc_init
-        */
-       numaq_tsc_disable();
-#endif
-
        init_apic_mappings();
        ioapic_init_mappings();
 
index d923736..07faaa5 100644 (file)
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
 
 badframe:
        if (show_unhandled_signals && printk_ratelimit()) {
-               printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:"
+               printk("%s%s[%d] bad frame in sigreturn frame:"
                        "%p ip:%lx sp:%lx oeax:%lx",
                    task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
                    current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-       /* Pending single-step? */
-       if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->flags |= X86_EFLAGS_TF;
-               clear_thread_flag(TIF_SINGLESTEP);
-       }
-
        /* deal with pending signal delivery */
        if (thread_info_flags & _TIF_SIGPENDING)
                do_signal(regs);
index e53b267..bf87684 100644 (file)
@@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs)
 void do_notify_resume(struct pt_regs *regs, void *unused,
                      __u32 thread_info_flags)
 {
-       /* Pending single-step? */
-       if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->flags |= X86_EFLAGS_TF;
-               clear_thread_flag(TIF_SINGLESTEP);
-       }
-
 #ifdef CONFIG_X86_MCE
        /* notify userspace of pending MCEs */
        if (thread_info_flags & _TIF_MCE_NOTIFY)
index 687376a..2764019 100644 (file)
@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
                        printk(KERN_CONT
                               "a previous APIC delivery may have failed\n");
 
-               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-               apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+               apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
 
                timeout = 0;
                do {
@@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
        int maxlvt;
 
        /* Target chip */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
 
        /* Boot on the stack */
        /* Kick the second */
-       apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+       apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
 
        Dprintk("Waiting for send to finish...\n");
        send_status = safe_apic_wait_icr_idle();
@@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
         * Give the other CPU some time to accept the IPI.
         */
        udelay(200);
-       /*
-        * Due to the Pentium erratum 3AP.
-        */
        maxlvt = lapic_get_maxlvt();
-       if (maxlvt > 3) {
-               apic_read_around(APIC_SPIV);
+       if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                apic_write(APIC_ESR, 0);
-       }
        accept_status = (apic_read(APIC_ESR) & 0xEF);
        Dprintk("NMI sent.\n");
 
@@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                return send_status;
        }
 
+       maxlvt = lapic_get_maxlvt();
+
        /*
         * Be paranoid about clearing APIC errors.
         */
        if (APIC_INTEGRATED(apic_version[phys_apicid])) {
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
+                       apic_write(APIC_ESR, 0);
                apic_read(APIC_ESR);
        }
 
@@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
        /*
         * Turn INIT on target chip
         */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 
        /*
         * Send IPI
         */
-       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-                               | APIC_DM_INIT);
+       apic_write(APIC_ICR,
+                  APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
 
        Dprintk("Waiting for send to finish...\n");
        send_status = safe_apic_wait_icr_idle();
@@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
        Dprintk("Deasserting INIT.\n");
 
        /* Target chip */
-       apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 
        /* Send IPI */
-       apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+       apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
 
        Dprintk("Waiting for send to finish...\n");
        send_status = safe_apic_wait_icr_idle();
@@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
         */
        Dprintk("#startup loops: %d.\n", num_starts);
 
-       maxlvt = lapic_get_maxlvt();
-
        for (j = 1; j <= num_starts; j++) {
                Dprintk("Sending STARTUP #%d.\n", j);
-               apic_read_around(APIC_SPIV);
-               apic_write(APIC_ESR, 0);
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
+                       apic_write(APIC_ESR, 0);
                apic_read(APIC_ESR);
                Dprintk("After apic_write.\n");
 
@@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                 */
 
                /* Target chip */
-               apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
 
                /* Boot on the stack */
                /* Kick the second */
-               apic_write_around(APIC_ICR, APIC_DM_STARTUP
-                                       | (start_eip >> 12));
+               apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
 
                /*
                 * Give the other CPU some time to accept the IPI.
@@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
                 * Give the other CPU some time to accept the IPI.
                 */
                udelay(200);
-               /*
-                * Due to the Pentium erratum 3AP.
-                */
-               if (maxlvt > 3) {
-                       apic_read_around(APIC_SPIV);
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
-               }
                accept_status = (apic_read(APIC_ESR) & 0xEF);
                if (send_status || accept_status)
                        break;
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
  *
  * Must be called after the _cpu_pda pointer table is initialized.
  */
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
 {
        struct x8664_pda *oldpda, *newpda;
        unsigned long size = sizeof(struct x8664_pda);
@@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu)
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
        /* was set by cpu_init() */
-       clear_bit(cpu, (unsigned long *)&cpu_initialized);
+       cpu_clear(cpu, cpu_initialized);
        numa_remove_cpu(cpu);
 }
 
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
 {
        extern unsigned int maxcpus;
 
-       maxcpus = simple_strtoul(arg, NULL, 0);
+       if (arg)
+               maxcpus = simple_strtoul(arg, NULL, 0);
        return 0;
 }
 early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644 (file)
index 8b13789..0000000
+++ /dev/null
@@ -1 +0,0 @@
-
index 92c20fe..e8b9863 100644 (file)
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
 static int enable_single_step(struct task_struct *child)
 {
        struct pt_regs *regs = task_pt_regs(child);
+       unsigned long oflags;
+
+       /*
+        * If we stepped into a sysenter/syscall insn, it trapped in
+        * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+        * If user-mode had set TF itself, then it's still clear from
+        * do_debug() and we need to set it again to restore the user
+        * state so we don't wrongly set TIF_FORCED_TF below.
+        * If enable_single_step() was used last and that is what
+        * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
+        * already set and our bookkeeping is fine.
+        */
+       if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
+               regs->flags |= X86_EFLAGS_TF;
 
        /*
         * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
         */
        set_tsk_thread_flag(child, TIF_SINGLESTEP);
 
-       /*
-        * If TF was already set, don't do anything else
-        */
-       if (regs->flags & X86_EFLAGS_TF)
-               return 0;
+       oflags = regs->flags;
 
        /* Set TF on the kernel stack.. */
        regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
         * ..but if TF is changed by the instruction we will trace,
         * don't mark it as being "us" that set it, so that we
         * won't clear it by hand later.
+        *
+        * Note that if we don't actually execute the popf because
+        * of a signal arriving right now or suchlike, we will lose
+        * track of the fact that it really was "us" that set it.
         */
-       if (is_setting_trap_flag(child, regs))
+       if (is_setting_trap_flag(child, regs)) {
+               clear_tsk_thread_flag(child, TIF_FORCED_TF);
                return 0;
+       }
+
+       /*
+        * If TF was already set, check whether it was us who set it.
+        * If not, we should never attempt a block step.
+        */
+       if (oflags & X86_EFLAGS_TF)
+               return test_tsk_thread_flag(child, TIF_FORCED_TF);
 
        set_tsk_thread_flag(child, TIF_FORCED_TF);
 
index 059ca6e..ffe3c66 100644 (file)
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
  */
 void __init time_init(void)
 {
+       pre_time_init_hook();
        tsc_init();
        late_time_init = choose_time_init();
 }
index 8a76897..03df8e4 100644 (file)
@@ -58,6 +58,7 @@
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/io.h>
+#include <asm/traps.h>
 
 #include "mach_traps.h"
 
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
 gate_desc idt_table[256]
        __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
 
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
 int panic_on_unrecovered_nmi;
 int kstack_depth_to_print = 24;
 static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
 
 static void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *stack, unsigned long bp, char *log_lvl)
+               unsigned long *stack, unsigned long bp, char *log_lvl)
 {
        dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
        printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
        return ud2 == 0x0b0f;
 }
 
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+       unsigned long flags;
+
+       oops_enter();
+
+       if (die_owner != raw_smp_processor_id()) {
+               console_verbose();
+               raw_local_irq_save(flags);
+               __raw_spin_lock(&die_lock);
+               die_owner = smp_processor_id();
+               die_nest_count = 0;
+               bust_spinlocks(1);
+       } else {
+               raw_local_irq_save(flags);
+       }
+       die_nest_count++;
+       return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+       bust_spinlocks(0);
+       die_owner = -1;
+       add_taint(TAINT_DIE);
+       __raw_spin_unlock(&die_lock);
+       raw_local_irq_restore(flags);
+
+       if (!regs)
+               return;
+
+       if (kexec_should_crash(current))
+               crash_kexec(regs);
+
+       if (in_interrupt())
+               panic("Fatal exception in interrupt");
+
+       if (panic_on_oops)
+               panic("Fatal exception");
+
+       oops_exit();
+       do_exit(signr);
+}
+
 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 {
        unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
  */
 void die(const char *str, struct pt_regs *regs, long err)
 {
-       static struct {
-               raw_spinlock_t lock;
-               u32 lock_owner;
-               int lock_owner_depth;
-       } die = {
-               .lock =                 __RAW_SPIN_LOCK_UNLOCKED,
-               .lock_owner =           -1,
-               .lock_owner_depth =     0
-       };
-       unsigned long flags;
-
-       oops_enter();
-
-       if (die.lock_owner != raw_smp_processor_id()) {
-               console_verbose();
-               raw_local_irq_save(flags);
-               __raw_spin_lock(&die.lock);
-               die.lock_owner = smp_processor_id();
-               die.lock_owner_depth = 0;
-               bust_spinlocks(1);
-       } else {
-               raw_local_irq_save(flags);
-       }
+       unsigned long flags = oops_begin();
 
-       if (++die.lock_owner_depth < 3) {
+       if (die_nest_count < 3) {
                report_bug(regs->ip, regs);
 
                if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
                printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
        }
 
-       bust_spinlocks(0);
-       die.lock_owner = -1;
-       add_taint(TAINT_DIE);
-       __raw_spin_unlock(&die.lock);
-       raw_local_irq_restore(flags);
-
-       if (!regs)
-               return;
-
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-
-       if (panic_on_oops)
-               panic("Fatal exception");
-
-       oops_exit();
-       do_exit(SIGSEGV);
+       oops_end(flags, regs, SIGSEGV);
 }
 
 static inline void
index 2696a68..3f18d73 100644 (file)
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
 #include <asm/pda.h>
+#include <asm/traps.h>
 
 #include <mach_traps.h>
 
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
 int panic_on_unrecovered_nmi;
 int kstack_depth_to_print = 12;
 static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
        .address = print_trace_address,
 };
 
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp, char *log_lvl)
 {
        printk("\nCall Trace:\n");
-       dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
+       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
        printk("\n");
 }
 
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp)
+{
+       show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
 static void
-_show_stack(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *sp, unsigned long bp)
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *sp, unsigned long bp, char *log_lvl)
 {
        unsigned long *stack;
        int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
                printk(" %016lx", *stack++);
                touch_nmi_watchdog();
        }
-       show_trace(task, regs, sp, bp);
+       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-       _show_stack(task, NULL, sp, 0);
+       show_stack_log_lvl(task, NULL, sp, 0, "");
 }
 
 /*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
                u8 *ip;
 
                printk("Stack: ");
-               _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+               show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+                               regs->bp, "");
                printk("\n");
 
                printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
 }
 
 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{ 
+{
        die_owner = -1;
        bust_spinlocks(0);
        die_nest_count--;
index e94bdb6..41e01b1 100644 (file)
@@ -73,7 +73,7 @@ int is_visws_box(void)
        return visws_board_type >= 0;
 }
 
-static int __init visws_time_init_quirk(void)
+static int __init visws_time_init(void)
 {
        printk(KERN_INFO "Starting Cobalt Timer system clock\n");
 
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
        return 0;
 }
 
-static int __init visws_pre_intr_init_quirk(void)
+static int __init visws_pre_intr_init(void)
 {
        init_VISWS_APIC_irqs();
 
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
 
 long long mem_size __initdata = 0;
 
-static char * __init visws_memory_setup_quirk(void)
+static char * __init visws_memory_setup(void)
 {
        long long gfx_mem_size = 8 * MB;
 
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
        outl(PIIX_SPECIAL_STOP, 0xCFC);
 }
 
-static int __init visws_get_smp_config_quirk(unsigned int early)
+static int __init visws_get_smp_config(unsigned int early)
 {
        /*
         * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
  * No problem for Linux.
  */
 
-static void __init MP_processor_info (struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
 {
        int ver, logical_apicid;
        physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
        apic_version[m->mpc_apicid] = ver;
 }
 
-int __init visws_find_smp_config_quirk(unsigned int reserve)
+static int __init visws_find_smp_config(unsigned int reserve)
 {
        struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
        unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
        return 1;
 }
 
-extern int visws_trap_init_quirk(void);
+static int visws_trap_init(void);
+
+static struct x86_quirks visws_x86_quirks __initdata = {
+       .arch_time_init         = visws_time_init,
+       .arch_pre_intr_init     = visws_pre_intr_init,
+       .arch_memory_setup      = visws_memory_setup,
+       .arch_intr_init         = NULL,
+       .arch_trap_init         = visws_trap_init,
+       .mach_get_smp_config    = visws_get_smp_config,
+       .mach_find_smp_config   = visws_find_smp_config,
+};
 
 void __init visws_early_detect(void)
 {
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
 
        /*
         * Install special quirks for timer, interrupt and memory setup:
-        */
-       arch_time_init_quirk            = visws_time_init_quirk;
-       arch_pre_intr_init_quirk        = visws_pre_intr_init_quirk;
-       arch_memory_setup_quirk         = visws_memory_setup_quirk;
-
-       /*
         * Fall back to generic behavior for traps:
+        * Override generic MP-table parsing:
         */
-       arch_intr_init_quirk            = NULL;
-       arch_trap_init_quirk            = visws_trap_init_quirk;
+       x86_quirks = &visws_x86_quirks;
 
        /*
         * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
         */
        no_broadcast = 0;
 
-       /*
-        * Override generic MP-table parsing:
-        */
-       mach_get_smp_config_quirk       = visws_get_smp_config_quirk;
-       mach_find_smp_config_quirk      = visws_find_smp_config_quirk;
-
 #ifdef CONFIG_X86_IO_APIC
        /*
         * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
                co_apic_read(CO_APIC_ID));
 }
 
-int __init visws_trap_init_quirk(void)
+static int __init visws_trap_init(void)
 {
        lithium_init();
        cobalt_init();
index b153460..0a1b1a9 100644 (file)
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
 #ifdef CONFIG_X86_LOCAL_APIC
        para_fill(pv_apic_ops.apic_read, APICRead);
        para_fill(pv_apic_ops.apic_write, APICWrite);
-       para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
 #endif
 
        /*
index 50dad44..0313a5e 100644 (file)
@@ -991,7 +991,6 @@ __init void lguest_init(void)
 #ifdef CONFIG_X86_LOCAL_APIC
        /* apic read/write intercepts */
        pv_apic_ops.apic_write = lguest_apic_write;
-       pv_apic_ops.apic_write_atomic = lguest_apic_write;
        pv_apic_ops.apic_read = lguest_apic_read;
 #endif
 
index 48278fa..3d31783 100644 (file)
 #include <asm/e820.h>
 #include <asm/setup.h>
 
-/*
- * Any quirks to be performed to initialize timers/irqs/etc?
- */
-int (*arch_time_init_quirk)(void);
-int (*arch_pre_intr_init_quirk)(void);
-int (*arch_intr_init_quirk)(void);
-int (*arch_trap_init_quirk)(void);
-
 #ifdef CONFIG_HOTPLUG_CPU
 #define DEFAULT_SEND_IPI       (1)
 #else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
  **/
 void __init pre_intr_init_hook(void)
 {
-       if (arch_pre_intr_init_quirk) {
-               if (arch_pre_intr_init_quirk())
+       if (x86_quirks->arch_pre_intr_init) {
+               if (x86_quirks->arch_pre_intr_init())
                        return;
        }
        init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
  **/
 void __init intr_init_hook(void)
 {
-       if (arch_intr_init_quirk) {
-               if (arch_intr_init_quirk())
+       if (x86_quirks->arch_intr_init) {
+               if (x86_quirks->arch_intr_init())
                        return;
        }
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
  **/
 void __init trap_init_hook(void)
 {
-       if (arch_trap_init_quirk) {
-               if (arch_trap_init_quirk())
+       if (x86_quirks->arch_trap_init) {
+               if (x86_quirks->arch_trap_init())
                        return;
        }
 }
@@ -110,6 +102,16 @@ static struct irqaction irq0  = {
        .name = "timer"
 };
 
+/**
+ * pre_time_init_hook - do any specific initialisations before.
+ *
+ **/
+void __init pre_time_init_hook(void)
+{
+       if (x86_quirks->arch_pre_time_init)
+               x86_quirks->arch_pre_time_init();
+}
+
 /**
  * time_init_hook - do any specific initialisations for the system timer.
  *
@@ -119,13 +121,13 @@ static struct irqaction irq0  = {
  **/
 void __init time_init_hook(void)
 {
-       if (arch_time_init_quirk) {
+       if (x86_quirks->arch_time_init) {
                /*
                 * A nonzero return code does not mean failure, it means
                 * that the architecture quirk does not want any
                 * generic (timer) setup to be performed after this:
                 */
-               if (arch_time_init_quirk())
+               if (x86_quirks->arch_time_init())
                        return;
        }
 
index 9873716..1fbb844 100644 (file)
@@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA)         += k8topology_64.o
 endif
 obj-$(CONFIG_ACPI_NUMA)                += srat_$(BITS).o
 
+obj-$(CONFIG_MEMTEST)          += memtest.o
index 9689a51..d37f293 100644 (file)
@@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
                reserve_early(table_start << PAGE_SHIFT,
                                 table_end << PAGE_SHIFT, "PGTABLE");
 
+       if (!after_init_bootmem)
+               early_memtest(start, end);
+
        return end >> PAGE_SHIFT;
 }
 
@@ -868,8 +871,6 @@ void __init paging_init(void)
         */
        sparse_init();
        zone_sizes_init();
-
-       paravirt_post_allocator_init();
 }
 
 /*
index 306049e..ec37121 100644 (file)
@@ -517,118 +517,6 @@ static void __init init_gbpages(void)
                direct_gbpages = 0;
 }
 
-#ifdef CONFIG_MEMTEST
-
-static void __init memtest(unsigned long start_phys, unsigned long size,
-                                unsigned pattern)
-{
-       unsigned long i;
-       unsigned long *start;
-       unsigned long start_bad;
-       unsigned long last_bad;
-       unsigned long val;
-       unsigned long start_phys_aligned;
-       unsigned long count;
-       unsigned long incr;
-
-       switch (pattern) {
-       case 0:
-               val = 0UL;
-               break;
-       case 1:
-               val = -1UL;
-               break;
-       case 2:
-               val = 0x5555555555555555UL;
-               break;
-       case 3:
-               val = 0xaaaaaaaaaaaaaaaaUL;
-               break;
-       default:
-               return;
-       }
-
-       incr = sizeof(unsigned long);
-       start_phys_aligned = ALIGN(start_phys, incr);
-       count = (size - (start_phys_aligned - start_phys))/incr;
-       start = __va(start_phys_aligned);
-       start_bad = 0;
-       last_bad = 0;
-
-       for (i = 0; i < count; i++)
-               start[i] = val;
-       for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-               if (*start != val) {
-                       if (start_phys_aligned == last_bad + incr) {
-                               last_bad += incr;
-                       } else {
-                               if (start_bad) {
-                                       printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
-                                               val, start_bad, last_bad + incr);
-                                       reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-                               }
-                               start_bad = last_bad = start_phys_aligned;
-                       }
-               }
-       }
-       if (start_bad) {
-               printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
-                       val, start_bad, last_bad + incr);
-               reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-       }
-
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
-       if (arg)
-               memtest_pattern = simple_strtoul(arg, NULL, 0);
-       return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
-       u64 t_start, t_size;
-       unsigned pattern;
-
-       if (!memtest_pattern)
-               return;
-
-       printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
-       for (pattern = 0; pattern < memtest_pattern; pattern++) {
-               t_start = start;
-               t_size = 0;
-               while (t_start < end) {
-                       t_start = find_e820_area_size(t_start, &t_size, 1);
-
-                       /* done ? */
-                       if (t_start >= end)
-                               break;
-                       if (t_start + t_size > end)
-                               t_size = end - t_start;
-
-                       printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
-                               (unsigned long long)t_start,
-                               (unsigned long long)t_start + t_size, pattern);
-
-                       memtest(t_start, t_size, pattern);
-
-                       t_start += t_size;
-               }
-       }
-       printk(KERN_CONT "\n");
-}
-#else
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
 static unsigned long __init kernel_physical_mapping_init(unsigned long start,
                                                unsigned long end,
                                                unsigned long page_size_mask)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644 (file)
index 0000000..672e17f
--- /dev/null
@@ -0,0 +1,123 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+
+#include <asm/e820.h>
+
+static void __init memtest(unsigned long start_phys, unsigned long size,
+                                unsigned pattern)
+{
+       unsigned long i;
+       unsigned long *start;
+       unsigned long start_bad;
+       unsigned long last_bad;
+       unsigned long val;
+       unsigned long start_phys_aligned;
+       unsigned long count;
+       unsigned long incr;
+
+       switch (pattern) {
+       case 0:
+               val = 0UL;
+               break;
+       case 1:
+               val = -1UL;
+               break;
+       case 2:
+#ifdef CONFIG_X86_64
+               val = 0x5555555555555555UL;
+#else
+               val = 0x55555555UL;
+#endif
+               break;
+       case 3:
+#ifdef CONFIG_X86_64
+               val = 0xaaaaaaaaaaaaaaaaUL;
+#else
+               val = 0xaaaaaaaaUL;
+#endif
+               break;
+       default:
+               return;
+       }
+
+       incr = sizeof(unsigned long);
+       start_phys_aligned = ALIGN(start_phys, incr);
+       count = (size - (start_phys_aligned - start_phys))/incr;
+       start = __va(start_phys_aligned);
+       start_bad = 0;
+       last_bad = 0;
+
+       for (i = 0; i < count; i++)
+               start[i] = val;
+       for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+               if (*start != val) {
+                       if (start_phys_aligned == last_bad + incr) {
+                               last_bad += incr;
+                       } else {
+                               if (start_bad) {
+                                       printk(KERN_CONT "\n  %010lx bad mem addr %010lx - %010lx reserved",
+                                               val, start_bad, last_bad + incr);
+                                       reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+                               }
+                               start_bad = last_bad = start_phys_aligned;
+                       }
+               }
+       }
+       if (start_bad) {
+               printk(KERN_CONT "\n  %016lx bad mem addr %010lx - %010lx reserved",
+                       val, start_bad, last_bad + incr);
+               reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+       }
+
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+       if (arg)
+               memtest_pattern = simple_strtoul(arg, NULL, 0);
+       return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(unsigned long start, unsigned long end)
+{
+       u64 t_start, t_size;
+       unsigned pattern;
+
+       if (!memtest_pattern)
+               return;
+
+       printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+       for (pattern = 0; pattern < memtest_pattern; pattern++) {
+               t_start = start;
+               t_size = 0;
+               while (t_start < end) {
+                       t_start = find_e820_area_size(t_start, &t_size, 1);
+
+                       /* done ? */
+                       if (t_start >= end)
+                               break;
+                       if (t_start + t_size > end)
+                               t_size = end - t_start;
+
+                       printk(KERN_CONT "\n  %010llx - %010llx pattern %d",
+                               (unsigned long long)t_start,
+                               (unsigned long long)t_start + t_size, pattern);
+
+                       memtest(t_start, t_size, pattern);
+
+                       t_start += t_size;
+               }
+       }
+       printk(KERN_CONT "\n");
+}
index d458507..2fe3091 100644 (file)
@@ -12,6 +12,8 @@
 #include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
@@ -373,8 +375,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
        return vma_prot;
 }
 
-#ifdef CONFIG_NONPROMISC_DEVMEM
-/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+#ifdef CONFIG_STRICT_DEVMEM
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 {
        return 1;
@@ -398,7 +400,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
        }
        return 1;
 }
-#endif /* CONFIG_NONPROMISC_DEVMEM */
+#endif /* CONFIG_STRICT_DEVMEM */
 
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t *vma_prot)
@@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
 
        free_memtype(addr, addr + size);
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+/* get Nth element of the linked list */
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+       struct memtype *list_node, *print_entry;
+       int i = 1;
+
+       print_entry  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+       if (!print_entry)
+               return NULL;
+
+       spin_lock(&memtype_lock);
+       list_for_each_entry(list_node, &memtype_list, nd) {
+               if (pos == i) {
+                       *print_entry = *list_node;
+                       spin_unlock(&memtype_lock);
+                       return print_entry;
+               }
+               ++i;
+       }
+       spin_unlock(&memtype_lock);
+       kfree(print_entry);
+       return NULL;
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       if (*pos == 0) {
+               ++*pos;
+               seq_printf(seq, "PAT memtype list:\n");
+       }
+
+       return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       ++*pos;
+       return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+       struct memtype *print_entry = (struct memtype *)v;
+
+       seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+                       print_entry->start, print_entry->end);
+       kfree(print_entry);
+       return 0;
+}
+
+static struct seq_operations memtype_seq_ops = {
+       .start = memtype_seq_start,
+       .next  = memtype_seq_next,
+       .stop  = memtype_seq_stop,
+       .show  = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+       .open    = memtype_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+       debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
+                               NULL, &memtype_fops);
+       return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS */
index e515e8d..d49202e 100644 (file)
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG)    += mmconfig_$(BITS).o direct.o mmconfig-shared.o
 obj-$(CONFIG_PCI_DIRECT)       += direct.o
 obj-$(CONFIG_PCI_OLPC)         += olpc.o
 
-pci-y                          := fixup.o
-pci-$(CONFIG_ACPI)             += acpi.o
-pci-y                          += legacy.o irq.o
+obj-y                          += fixup.o
+obj-$(CONFIG_ACPI)             += acpi.o
+obj-y                          += legacy.o irq.o
 
-pci-$(CONFIG_X86_VISWS)                += visws.o
+obj-$(CONFIG_X86_VISWS)                += visws.o
 
-pci-$(CONFIG_X86_NUMAQ)                += numa.o
+obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
 
-obj-y                          += $(pci-y) common.o early.o
+obj-y                          += common.o early.o
 obj-y                          += amd_bus.o
index 132876c..ec9ce35 100644 (file)
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
 
 int __init pci_subsys_init(void)
 {
+#ifdef CONFIG_X86_NUMAQ
+       pci_numaq_init();
+#endif
 #ifdef CONFIG_ACPI
        pci_acpi_init();
+#endif
+#ifdef CONFIG_X86_VISWS
+       pci_visws_init();
 #endif
        pci_legacy_init();
        pcibios_irq_init();
-#ifdef CONFIG_X86_NUMAQ
-       pci_numa_init();
-#endif
        pcibios_init();
 
        return 0;
similarity index 97%
rename from arch/x86/pci/numa.c
rename to arch/x86/pci/numaq_32.c
index 8b5ca19..f4b16dc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * numa.c - Low-level PCI access for NUMA-Q machines
+ * numaq_32.c - Low-level PCI access for NUMA-Q machines
  */
 
 #include <linux/pci.h>
@@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
 
-int __init pci_numa_init(void)
+int __init pci_numaq_init(void)
 {
        int quad;
 
index 3e25deb..15b9cf6 100644 (file)
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
 /* some common used subsys_initcalls */
 extern int __init pci_acpi_init(void);
 extern int __init pcibios_irq_init(void);
-extern int __init pci_numa_init(void);
+extern int __init pci_visws_init(void);
+extern int __init pci_numaq_init(void);
 extern int __init pcibios_init(void);
 
 /* pci-mmconfig.c */
index 1a7bed4..42f4cb1 100644 (file)
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
        pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
-static int __init pci_visws_init(void)
+int __init pci_visws_init(void)
 {
+       if (!is_visws_box())
+               return -1;
+
+       pcibios_enable_irq = &pci_visws_enable_irq;
+       pcibios_disable_irq = &pci_visws_disable_irq;
+
        /* The VISWS supports configuration access type 1 only */
        pci_probe = (pci_probe | PCI_PROBE_CONF1) &
                    ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
        pcibios_resource_survey();
        return 0;
 }
-
-static __init int pci_subsys_init(void)
-{
-       if (!is_visws_box())
-               return -1;
-
-       pcibios_enable_irq = &pci_visws_enable_irq;
-       pcibios_disable_irq = &pci_visws_disable_irq;
-
-       pci_visws_init();
-       pcibios_init();
-
-       return 0;
-}
-subsys_initcall(pci_subsys_init);
index b7ad9f8..4d6ef0a 100644 (file)
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
 obj-$(VDSO32-y)                        += vdso32-syms.lds
-vdso32.so-$(CONFIG_X86_32)     += int80
+vdso32.so-$(VDSO32-y)          += int80
 vdso32.so-$(CONFIG_COMPAT)     += syscall
 vdso32.so-$(VDSO32-y)          += sysenter
 
index 0bce542..513f330 100644 (file)
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
        }
 }
 
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
 static struct page *vdso32_pages[1];
 
 #ifdef CONFIG_X86_64
 
 #define        vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define        vdso32_syscall()        (boot_cpu_has(X86_FEATURE_SYSCALL32))
 
 /* May not be __init: called during resume */
 void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
 #else  /* CONFIG_X86_32 */
 
 #define vdso32_sysenter()      (boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall()       (0)
 
 void enable_sep_cpu(void)
 {
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
        gate_vma_init();
 #endif
 
-       if (!vdso32_sysenter()) {
-               vsyscall = &vdso32_default_start;
-               vsyscall_len = &vdso32_default_end - &vdso32_default_start;
-       } else {
+       if (vdso32_syscall()) {
+               vsyscall = &vdso32_syscall_start;
+               vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+       } else if (vdso32_sysenter()){
                vsyscall = &vdso32_sysenter_start;
                vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+       } else {
+               vsyscall = &vdso32_int80_start;
+               vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
        }
 
        memcpy(syscall_page, vsyscall, vsyscall_len);
index 1e36f72..2ce5f82 100644 (file)
@@ -2,14 +2,17 @@
 
 __INITDATA
 
-       .globl vdso32_default_start, vdso32_default_end
-vdso32_default_start:
-#ifdef CONFIG_X86_32
+       .globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
        .incbin "arch/x86/vdso/vdso32-int80.so"
-#else
+vdso32_int80_end:
+
+       .globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+#ifdef CONFIG_COMPAT
        .incbin "arch/x86/vdso/vdso32-syscall.so"
 #endif
-vdso32_default_end:
+vdso32_syscall_end:
 
        .globl vdso32_sysenter_start, vdso32_sysenter_end
 vdso32_sysenter_start:
index 19a6cfa..257ba4a 100644 (file)
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
 extern char vdso_start[], vdso_end[];
 extern unsigned short vdso_sync_cpuid;
 
-struct page **vdso_pages;
+static struct page **vdso_pages;
+static unsigned vdso_size;
 
 static inline void *var_ref(void *p, char *name)
 {
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
        int i;
        char *vbase;
 
+       vdso_size = npages << PAGE_SHIFT;
        vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
        if (!vdso_pages)
                goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
        struct mm_struct *mm = current->mm;
        unsigned long addr;
        int ret;
-       unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
 
        if (!vdso_enabled)
                return 0;
 
        down_write(&mm->mmap_sem);
-       addr = vdso_addr(mm->start_stack, len);
-       addr = get_unmapped_area(NULL, addr, len, 0, 0);
+       addr = vdso_addr(mm->start_stack, vdso_size);
+       addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
        if (IS_ERR_VALUE(addr)) {
                ret = addr;
                goto up_fail;
        }
 
-       ret = install_special_mapping(mm, addr, len,
+       ret = install_special_mapping(mm, addr, vdso_size,
                                      VM_READ|VM_EXEC|
                                      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
                                      VM_ALWAYSDUMP,
index c2cc995..3815e42 100644 (file)
@@ -6,8 +6,8 @@ config XEN
        bool "Xen guest support"
        select PARAVIRT
        select PARAVIRT_CLOCK
-       depends on X86_32
-       depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+       depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+       depends on X86_CMPXCHG && X86_TSC
        help
          This is the Linux Xen port.  Enabling this will allow the
          kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
 
 config XEN_MAX_DOMAIN_MEMORY
        int "Maximum allowed size of a domain in gigabytes"
-       default 8
+       default 8 if X86_32
+       default 32 if X86_64
        depends on XEN
        help
          The pseudo-physical to machine address array is sized
          according to the maximum possible memory size of a Xen
          domain.  This array uses 1 page per gigabyte, so there's no
-         need to be too stingy here.
\ No newline at end of file
+         need to be too stingy here.
+
+config XEN_SAVE_RESTORE
+       bool
+       depends on PM
+       default y
\ No newline at end of file
index 2ba2d16..59c1e53 100644 (file)
@@ -1,4 +1,4 @@
 obj-y          := enlighten.o setup.o multicalls.o mmu.o \
-                       time.o xen-asm.o grant-table.o suspend.o
+                       time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)      += smp.o
index bb50845..194bbd6 100644 (file)
@@ -33,6 +33,7 @@
 #include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
 #include <asm/page.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
-#include <asm/pgalloc.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -56,6 +57,18 @@ EXPORT_SYMBOL_GPL(hypercall_page);
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
 /*
  * Note about cr3 (pagetable base) values:
  *
@@ -167,10 +180,14 @@ void xen_vcpu_restore(void)
 
 static void __init xen_banner(void)
 {
+       unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       struct xen_extraversion extra;
+       HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
        printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
               pv_info.name);
-       printk(KERN_INFO "Hypervisor signature: %s%s\n",
-              xen_start_info->magic,
+       printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+              version >> 16, version & 0xffff, extra.extraversion,
               xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
@@ -363,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
 
 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-       xen_mc_batch();
-
-       load_TLS_descriptor(t, cpu, 0);
-       load_TLS_descriptor(t, cpu, 1);
-       load_TLS_descriptor(t, cpu, 2);
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-
        /*
         * XXX sleazy hack: If we're being called in a lazy-cpu zone,
         * it means we're in a context switch, and %gs has just been
@@ -379,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
         * Either way, it has been saved, and the new value will get
         * loaded properly.  This will go away as soon as Xen has been
         * modified to not save/restore %gs for normal hypercalls.
+        *
+        * On x86_64, this hack is not used for %gs, because gs points
+        * to KERNEL_GS_BASE (and uses it for PDA references), so we
+        * must not zero %gs on x86_64
+        *
+        * For x86_64, we need to zero %fs, otherwise we may get an
+        * exception between the new %fs descriptor being loaded and
+        * %fs being effectively cleared at __switch_to().
         */
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
                loadsegment(gs, 0);
+#else
+               loadsegment(fs, 0);
+#endif
+       }
+
+       xen_mc_batch();
+
+       load_TLS_descriptor(t, cpu, 0);
+       load_TLS_descriptor(t, cpu, 1);
+       load_TLS_descriptor(t, cpu, 2);
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+       if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+               BUG();
 }
+#endif
 
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
                                const void *ptr)
@@ -400,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
        preempt_enable();
 }
 
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
                            struct trap_info *info)
 {
-       u8 type, dpl;
-
-       type = (high >> 8) & 0x1f;
-       dpl = (high >> 13) & 3;
-
-       if (type != 0xf && type != 0xe)
+       if (val->type != 0xf && val->type != 0xe)
                return 0;
 
        info->vector = vector;
-       info->address = (high & 0xffff0000) | (low & 0x0000ffff);
-       info->cs = low >> 16;
-       info->flags = dpl;
+       info->address = gate_offset(*val);
+       info->cs = gate_segment(*val);
+       info->flags = val->dpl;
        /* interrupt gates clear IF */
-       if (type == 0xe)
+       if (val->type == 0xe)
                info->flags |= 4;
 
        return 1;
@@ -443,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 
        if (p >= start && (p + 8) <= end) {
                struct trap_info info[2];
-               u32 *desc = (u32 *)g;
 
                info[1].address = 0;
 
-               if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+               if (cvt_gate_to_trap(entrynum, g, &info[0]))
                        if (HYPERVISOR_set_trap_table(info))
                                BUG();
        }
@@ -460,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
 {
        unsigned in, out, count;
 
-       count = (desc->size+1) / 8;
+       count = (desc->size+1) / sizeof(gate_desc);
        BUG_ON(count > 256);
 
        for (in = out = 0; in < count; in++) {
-               const u32 *entry = (u32 *)(desc->address + in * 8);
+               gate_desc *entry = (gate_desc*)(desc->address) + in;
 
-               if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+               if (cvt_gate_to_trap(in, entry, &traps[out]))
                        out++;
        }
        traps[out].address = 0;
@@ -695,33 +727,89 @@ static void set_current_cr3(void *v)
        x86_write_percpu(xen_current_cr3, (unsigned long)v);
 }
 
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
        struct mmuext_op *op;
        struct multicall_space mcs;
-       unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       unsigned long mfn;
 
-       BUG_ON(preemptible());
+       if (cr3)
+               mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       else
+               mfn = 0;
 
-       mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+       WARN_ON(mfn == 0 && kernel);
 
-       /* Update while interrupts are disabled, so its atomic with
-          respect to ipis */
-       x86_write_percpu(xen_cr3, cr3);
+       mcs = __xen_mc_entry(sizeof(*op));
 
        op = mcs.args;
-       op->cmd = MMUEXT_NEW_BASEPTR;
+       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
        op->arg1.mfn = mfn;
 
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
-       /* Update xen_update_cr3 once the batch has actually
-          been submitted. */
-       xen_mc_callback(set_current_cr3, (void *)cr3);
+       if (kernel) {
+               x86_write_percpu(xen_cr3, cr3);
+
+               /* Update xen_current_cr3 once the batch has actually
+                  been submitted. */
+               xen_mc_callback(set_current_cr3, (void *)cr3);
+       }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       xen_mc_batch();  /* disables interrupts */
+
+       /* Update while interrupts are disabled, so its atomic with
+          respect to ipis */
+       x86_write_percpu(xen_cr3, cr3);
+
+       __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+               if (user_pgd)
+                       __xen_write_cr3(false, __pa(user_pgd));
+               else
+                       __xen_write_cr3(false, 0);
+       }
+#endif
 
        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+       int ret;
+
+       ret = 0;
+
+       switch(msr) {
+#ifdef CONFIG_X86_64
+               unsigned which;
+               u64 base;
+
+       case MSR_FS_BASE:               which = SEGBASE_FS; goto set;
+       case MSR_KERNEL_GS_BASE:        which = SEGBASE_GS_USER; goto set;
+       case MSR_GS_BASE:               which = SEGBASE_GS_KERNEL; goto set;
+
+       set:
+               base = ((u64)high << 32) | low;
+               if (HYPERVISOR_set_segment_base(which, base) != 0)
+                       ret = -EFAULT;
+               break;
+#endif
+       default:
+               ret = native_write_msr_safe(msr, low, high);
+       }
+
+       return ret;