Merge commit 'remotes/tip/x86/paravirt' into x86/untangle2
author Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Wed, 11 Feb 2009 19:52:22 +0000 (11:52 -0800)
committer Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Wed, 11 Feb 2009 19:52:22 +0000 (11:52 -0800)
* commit 'remotes/tip/x86/paravirt': (175 commits)
  xen: use direct ops on 64-bit
  xen: make direct versions of irq_enable/disable/save/restore to common code
  xen: setup percpu data pointers
  xen: fix 32-bit build resulting from mmu move
  x86/paravirt: return full 64-bit result
  x86, percpu: fix kexec with vmlinux
  x86/vmi: fix interrupt enable/disable/save/restore calling convention.
  x86/paravirt: don't restore second return reg
  xen: setup percpu data pointers
  x86: split loading percpu segments from loading gdt
  x86: pass in cpu number to switch_to_new_gdt()
  x86: UV fix uv_flush_send_and_wait()
  x86/paravirt: fix missing callee-save call on pud_val
  x86/paravirt: use callee-saved convention for pte_val/make_pte/etc
  x86/paravirt: implement PVOP_CALL macros for callee-save functions
  x86/paravirt: add register-saving thunks to reduce caller register pressure
  x86/paravirt: selectively save/restore regs around pvops calls
  x86: fix paravirt clobber in entry_64.S
  x86/pvops: add a paravirt_ident functions to allow special patching
  xen: move remaining mmu-related stuff into mmu.c
  ...

Conflicts:
arch/x86/mach-voyager/voyager_smp.c
arch/x86/mm/fault.c

181 files changed:
Documentation/cputopology.txt
arch/alpha/kernel/irq.c
arch/arm/kernel/irq.c
arch/arm/kernel/vmlinux.lds.S
arch/arm/oprofile/op_model_mpcore.c
arch/blackfin/kernel/irqchip.c
arch/ia64/include/asm/topology.h
arch/ia64/kernel/iosapic.c
arch/ia64/kernel/irq.c
arch/ia64/kernel/irq_ia64.c
arch/ia64/kernel/msi_ia64.c
arch/ia64/kernel/vmlinux.lds.S
arch/ia64/sn/kernel/msi_sn.c
arch/mips/include/asm/irq.h
arch/mips/kernel/irq-gic.c
arch/mips/kernel/smtc.c
arch/mips/mti-malta/malta-smtc.c
arch/mips/sgi-ip22/ip22-int.c
arch/mips/sgi-ip22/ip22-time.c
arch/mips/sibyte/bcm1480/smp.c
arch/mips/sibyte/sb1250/smp.c
arch/mn10300/kernel/mn10300-watchdog.c
arch/parisc/kernel/irq.c
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/vmlinux.lds.S
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/mpic.c
arch/sparc/kernel/irq_64.c
arch/sparc/kernel/time_64.c
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/apicnum.h [new file with mode: 0644]
arch/x86/include/asm/cpu.h
arch/x86/include/asm/cpumask.h [new file with mode: 0644]
arch/x86/include/asm/current.h
arch/x86/include/asm/genapic_32.h
arch/x86/include/asm/genapic_64.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/hardirq_32.h [deleted file]
arch/x86/include/asm/hardirq_64.h [deleted file]
arch/x86/include/asm/io_apic.h
arch/x86/include/asm/irq_regs.h
arch/x86/include/asm/irq_regs_32.h [deleted file]
arch/x86/include/asm/irq_regs_64.h [deleted file]
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/mach-default/entry_arch.h
arch/x86/include/asm/mmu_context.h
arch/x86/include/asm/mmu_context_32.h [deleted file]
arch/x86/include/asm/mmu_context_64.h [deleted file]
arch/x86/include/asm/mpspec_def.h
arch/x86/include/asm/page.h
arch/x86/include/asm/page_64.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/pda.h [deleted file]
arch/x86/include/asm/percpu.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/setup.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/stackprotector.h [new file with mode: 0644]
arch/x86/include/asm/system.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/trampoline.h
arch/x86/include/asm/uv/uv.h [new file with mode: 0644]
arch/x86/include/asm/uv/uv_bau.h
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/acpi/sleep.c
arch/x86/kernel/apic.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/cpu/mcheck/mce_intel_64.c
arch/x86/kernel/crash.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/efi.c
arch/x86/kernel/efi_64.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/genapic_64.c
arch/x86/kernel/genx2apic_uv_x.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/io_apic.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/microcode_intel.c
arch/x86/kernel/module_32.c
arch/x86/kernel/module_64.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/msr.c
arch/x86/kernel/nmi.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/paravirt_patch_32.c
arch/x86/kernel/paravirt_patch_64.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/smpcommon.c [deleted file]
arch/x86/kernel/tlb_32.c [deleted file]
arch/x86/kernel/tlb_uv.c
arch/x86/kernel/traps.c
arch/x86/kernel/vmi_32.c
arch/x86/kernel/vmlinux_32.lds.S
arch/x86/kernel/vmlinux_64.lds.S
arch/x86/kernel/vsmp_64.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/lguest/boot.c
arch/x86/mach-voyager/setup.c
arch/x86/mach-voyager/voyager_smp.c
arch/x86/mm/Makefile
arch/x86/mm/fault.c
arch/x86/mm/init_32.c
arch/x86/mm/numa_64.c
arch/x86/mm/srat_64.c
arch/x86/mm/tlb.c [moved from arch/x86/kernel/tlb_64.c with 67% similarity]
arch/x86/xen/Makefile
arch/x86/xen/enlighten.c
arch/x86/xen/irq.c
arch/x86/xen/mmu.c
arch/x86/xen/mmu.h
arch/x86/xen/multicalls.h
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/xen-asm.S [new file with mode: 0644]
arch/x86/xen/xen-asm.h [new file with mode: 0644]
arch/x86/xen/xen-asm_32.S
arch/x86/xen/xen-asm_64.S
arch/x86/xen/xen-ops.h
drivers/base/cpu.c
drivers/base/topology.c
drivers/firmware/dcdbas.c
drivers/misc/Kconfig
drivers/misc/sgi-gru/gru.h
drivers/misc/sgi-xp/xp.h
drivers/misc/sgi-xp/xpc_main.c
drivers/net/sfc/efx.c
drivers/oprofile/buffer_sync.c
drivers/oprofile/buffer_sync.h
drivers/oprofile/oprof.c
drivers/pci/intr_remapping.c
drivers/xen/events.c
drivers/xen/manage.c
include/asm-generic/percpu.h
include/asm-generic/sections.h
include/asm-generic/vmlinux.lds.h
include/linux/interrupt.h
include/linux/irq.h
include/linux/irqnr.h
include/linux/magic.h
include/linux/percpu.h
include/linux/sched.h
include/linux/stackprotector.h [new file with mode: 0644]
include/linux/topology.h
init/main.c
kernel/exit.c
kernel/fork.c
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/internals.h
kernel/irq/manage.c
kernel/irq/migration.c
kernel/irq/numa_migrate.c
kernel/irq/proc.c
kernel/panic.c
kernel/sched.c
kernel/sched_rt.c
kernel/softirq.c

diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 45932ec..b41f3e5 100644 (file)
@@ -18,11 +18,11 @@ For an architecture to support this feature, it must define some of
 these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
-#define topology_thread_siblings(cpu)
-#define topology_core_siblings(cpu)
+#define topology_thread_cpumask(cpu)
+#define topology_core_cpumask(cpu)
 
 The type of **_id is int.
-The type of siblings is cpumask_t.
+The type of siblings is (const) struct cpumask *.
 
 To be consistent on all architectures, include/linux/topology.h
 provides default definitions for any of the above macros that are
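
For illustration only (not part of this commit): a caller of the new
pointer-returning macros might look like the sketch below, where
show_core_siblings() is a made-up helper that relies only on the documented
topology macros and the generic cpumask API.

        #include <linux/cpumask.h>
        #include <linux/kernel.h>
        #include <linux/topology.h>

        /* Walk the core siblings of a CPU through the new interface. */
        static void show_core_siblings(unsigned int cpu)
        {
                const struct cpumask *mask = topology_core_cpumask(cpu);
                unsigned int sibling;

                pr_info("CPU%u shares a core with %u CPUs\n",
                        cpu, cpumask_weight(mask));
                for_each_cpu(sibling, mask)
                        pr_info("  sibling: CPU%u\n", sibling);
        }
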
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 703731a..7bc7489 100644 (file)
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
                cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
        last_cpu = cpu;
 
-       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+       cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
        irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
        return 0;
 }
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 363db18..45eacb5 100644 (file)
@@ -104,6 +104,11 @@ static struct irq_desc bad_irq_desc = {
        .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock),
 };
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* We are not allocating bad_irq_desc.affinity or .pending_mask */
+#error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK."
+#endif
+
 /*
  * do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
  * come via this function.  Instead, they should provide their
@@ -161,7 +166,7 @@ void __init init_IRQ(void)
                irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
 
 #ifdef CONFIG_SMP
-       bad_irq_desc.affinity = CPU_MASK_ALL;
+       cpumask_setall(bad_irq_desc.affinity);
        bad_irq_desc.cpu = smp_processor_id();
 #endif
        init_arch_irq();
@@ -191,15 +196,16 @@ void migrate_irqs(void)
                struct irq_desc *desc = irq_desc + i;
 
                if (desc->cpu == cpu) {
-                       unsigned int newcpu = any_online_cpu(desc->affinity);
-
-                       if (newcpu == NR_CPUS) {
+                       unsigned int newcpu = cpumask_any_and(desc->affinity,
+                                                             cpu_online_mask);
+                       if (newcpu >= nr_cpu_ids) {
                                if (printk_ratelimit())
                                        printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n",
                                               i, cpu);
 
-                               cpus_setall(desc->affinity);
-                               newcpu = any_online_cpu(desc->affinity);
+                               cpumask_setall(desc->affinity);
+                               newcpu = cpumask_any_and(desc->affinity,
+                                                        cpu_online_mask);
                        }
 
                        route_irq(desc, i, newcpu);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 0021607..85598f7 100644 (file)
@@ -65,6 +65,7 @@ SECTIONS
 #endif
                . = ALIGN(4096);
                __per_cpu_start = .;
+                       *(.data.percpu.page_aligned)
                        *(.data.percpu)
                        *(.data.percpu.shared_aligned)
                __per_cpu_end = .;
diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c
index 6d6bd58..853d42b 100644 (file)
@@ -263,7 +263,7 @@ static void em_route_irq(int irq, unsigned int cpu)
        const struct cpumask *mask = cpumask_of(cpu);
 
        spin_lock_irq(&desc->lock);
-       desc->affinity = *mask;
+       cpumask_copy(desc->affinity, mask);
        desc->chip->set_affinity(irq, mask);
        spin_unlock_irq(&desc->lock);
 }
diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c
index 75724ee..23e9aa0 100644 (file)
@@ -70,6 +70,11 @@ static struct irq_desc bad_irq_desc = {
 #endif
 };
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* We are not allocating a variable-sized bad_irq_desc.affinity */
+#error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK."
+#endif
+
 int show_interrupts(struct seq_file *p, void *v)
 {
        int i = *(loff_t *) v, j;
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 32f3af1..3193f44 100644 (file)
@@ -84,7 +84,7 @@ void build_cpu_to_node_map(void);
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
        .min_interval           = 8,                    \
-       .max_interval           = 8*(min(num_online_cpus(), 32)), \
+       .max_interval           = 8*(min(num_online_cpus(), 32U)), \
        .busy_factor            = 64,                   \
        .imbalance_pct          = 125,                  \
        .cache_nice_tries       = 2,                    \
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 5cfd3d9..006ad36 100644 (file)
@@ -880,7 +880,7 @@ iosapic_unregister_intr (unsigned int gsi)
        if (iosapic_intr_info[irq].count == 0) {
 #ifdef CONFIG_SMP
                /* Clear affinity */
-               cpus_setall(idesc->affinity);
+               cpumask_setall(idesc->affinity);
 #endif
                /* Clear the interrupt information */
                iosapic_intr_info[irq].dest = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index a58f64c..226233a 100644 (file)
@@ -103,7 +103,7 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
        if (irq < NR_IRQS) {
-               cpumask_copy(&irq_desc[irq].affinity,
+               cpumask_copy(irq_desc[irq].affinity,
                             cpumask_of(cpu_logical_id(hwid)));
                irq_redir[irq] = (char) (redir & 0xff);
        }
@@ -148,7 +148,7 @@ static void migrate_irqs(void)
                if (desc->status == IRQ_PER_CPU)
                        continue;
 
-               if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask)
+               if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask)
                    >= nr_cpu_ids) {
                        /*
                         * Save it for phase 2 processing
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d48..927ad02 100644 (file)
@@ -493,11 +493,13 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
        saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
        ia64_srlz_d();
        while (vector != IA64_SPURIOUS_INT_VECTOR) {
+               struct irq_desc *desc = irq_to_desc(vector);
+
                if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
                        smp_local_flush_tlb();
-                       kstat_this_cpu.irqs[vector]++;
+                       kstat_incr_irqs_this_cpu(vector, desc);
                } else if (unlikely(IS_RESCHEDULE(vector)))
-                       kstat_this_cpu.irqs[vector]++;
+                       kstat_incr_irqs_this_cpu(vector, desc);
                else {
                        int irq = local_vector_to_irq(vector);
 
@@ -551,11 +553,13 @@ void ia64_process_pending_intr(void)
          * Perform normal interrupt style processing
          */
        while (vector != IA64_SPURIOUS_INT_VECTOR) {
+               struct irq_desc *desc = irq_to_desc(vector);
+
                if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
                        smp_local_flush_tlb();
-                       kstat_this_cpu.irqs[vector]++;
+                       kstat_incr_irqs_this_cpu(vector, desc);
                } else if (unlikely(IS_RESCHEDULE(vector)))
-                       kstat_this_cpu.irqs[vector]++;
+                       kstat_incr_irqs_this_cpu(vector, desc);
                else {
                        struct pt_regs *old_regs = set_irq_regs(NULL);
                        int irq = local_vector_to_irq(vector);
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 8903393..dcb6b7c 100644 (file)
@@ -75,7 +75,7 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
        msg.data = data;
 
        write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+       cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
 }
 #endif /* CONFIG_SMP */
 
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
        msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
 
        dmar_msi_write(irq, &msg);
-       irq_desc[irq].affinity = *mask;
+       cpumask_copy(irq_desc[irq].affinity, mask);
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 10a7d47..f45e4e5 100644 (file)
@@ -219,6 +219,7 @@ SECTIONS
   .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
        {
                __per_cpu_start = .;
+               *(.data.percpu.page_aligned)
                *(.data.percpu)
                *(.data.percpu.shared_aligned)
                __per_cpu_end = .;
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ca553b0..81e4289 100644 (file)
@@ -205,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
        msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
 
        write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = *cpu_mask;
+       cpumask_copy(irq_desc[irq].affinity, cpu_mask);
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index abc62aa..3214ade 100644 (file)
@@ -66,7 +66,7 @@ extern void smtc_forward_irq(unsigned int irq);
  */
 #define IRQ_AFFINITY_HOOK(irq)                                         \
 do {                                                                   \
-    if (!cpu_isset(smp_processor_id(), irq_desc[irq].affinity)) {      \
+    if (!cpumask_test_cpu(smp_processor_id(), irq_desc[irq].affinity)) {\
        smtc_forward_irq(irq);                                          \
        irq_exit();                                                     \
        return;                                                         \
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 494a49a..87deb8f 100644 (file)
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
                set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
 
        }
-       irq_desc[irq].affinity = *cpumask;
+       cpumask_copy(irq_desc[irq].affinity, cpumask);
        spin_unlock_irqrestore(&gic_lock, flags);
 
 }
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index b6cca01..5f5af7d 100644 (file)
@@ -686,7 +686,7 @@ void smtc_forward_irq(unsigned int irq)
         * and efficiency, we just pick the easiest one to find.
         */
 
-       target = first_cpu(irq_desc[irq].affinity);
+       target = cpumask_first(irq_desc[irq].affinity);
 
        /*
         * We depend on the platform code to have correctly processed
@@ -921,11 +921,13 @@ void ipi_decode(struct smtc_ipi *pipi)
        struct clock_event_device *cd;
        void *arg_copy = pipi->arg;
        int type_copy = pipi->type;
+       int irq = MIPS_CPU_IRQ_BASE + 1;
+
        smtc_ipi_nq(&freeIPIq, pipi);
        switch (type_copy) {
        case SMTC_CLOCK_TICK:
                irq_enter();
-               kstat_this_cpu.irqs[MIPS_CPU_IRQ_BASE + 1]++;
+               kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
                cd = &per_cpu(mips_clockevent_device, cpu);
                cd->event_handler(cd);
                irq_exit();
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index aabd727..5ba3188 100644 (file)
@@ -116,7 +116,7 @@ struct plat_smp_ops msmtc_smp_ops = {
 
 void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
-       cpumask_t tmask = *affinity;
+       cpumask_t tmask;
        int cpu = 0;
        void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
 
@@ -139,11 +139,12 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
         * be made to forward to an offline "CPU".
         */
 
+       cpumask_copy(&tmask, affinity);
        for_each_cpu(cpu, affinity) {
                if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
                        cpu_clear(cpu, tmask);
        }
-       irq_desc[irq].affinity = tmask;
+       cpumask_copy(irq_desc[irq].affinity, &tmask);
 
        if (cpus_empty(tmask))
                /*
diff --git a/arch/mips/sgi-ip22/ip22-int.c b/arch/mips/sgi-ip22/ip22-int.c
index f8b18af..0ecd5fe 100644 (file)
@@ -155,7 +155,7 @@ static void indy_buserror_irq(void)
        int irq = SGI_BUSERR_IRQ;
 
        irq_enter();
-       kstat_this_cpu.irqs[irq]++;
+       kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
        ip22_be_interrupt(irq);
        irq_exit();
 }
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 3dcb27e..c8f7d23 100644 (file)
@@ -122,7 +122,7 @@ void indy_8254timer_irq(void)
        char c;
 
        irq_enter();
-       kstat_this_cpu.irqs[irq]++;
+       kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
        printk(KERN_ALERT "Oops, got 8254 interrupt.\n");
        ArcRead(0, &c, 1, &cnt);
        ArcEnterInteractiveMode();
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index dddfda8..3146916 100644 (file)
@@ -178,9 +178,10 @@ struct plat_smp_ops bcm1480_smp_ops = {
 void bcm1480_mailbox_interrupt(void)
 {
        int cpu = smp_processor_id();
+       int irq = K_BCM1480_INT_MBOX_0_0;
        unsigned int action;
 
-       kstat_this_cpu.irqs[K_BCM1480_INT_MBOX_0_0]++;
+       kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
        /* Load the mailbox register to figure out what we're supposed to do */
        action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff;
 
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index 5950a28..cad1400 100644 (file)
@@ -166,9 +166,10 @@ struct plat_smp_ops sb_smp_ops = {
 void sb1250_mailbox_interrupt(void)
 {
        int cpu = smp_processor_id();
+       int irq = K_INT_MBOX_0;
        unsigned int action;
 
-       kstat_this_cpu.irqs[K_INT_MBOX_0]++;
+       kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
        /* Load the mailbox register to figure out what we're supposed to do */
        action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff;
 
diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c
index 10811e9..2e370d8 100644 (file)
@@ -130,6 +130,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
         * the stack NMI-atomically, it's safe to use smp_processor_id().
         */
        int sum, cpu = smp_processor_id();
+       int irq = NMIIRQ;
        u8 wdt, tmp;
 
        wdt = WDCTR & ~WDCTR_WDCNE;
@@ -138,7 +139,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
        NMICR = NMICR_WDIF;
 
        nmi_count(cpu)++;
-       kstat_this_cpu.irqs[NMIIRQ]++;
+       kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
        sum = irq_stat[cpu].__irq_count;
 
        if (last_irq_sums[cpu] == sum) {
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index ac2c822..4948280 100644 (file)
@@ -120,7 +120,7 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
        if (CHECK_IRQ_PER_CPU(irq)) {
                /* Bad linux design decision.  The mask has already
                 * been set; we must reset it */
-               irq_desc[irq].affinity = CPU_MASK_ALL;
+               cpumask_setall(irq_desc[irq].affinity);
                return -EINVAL;
        }
 
@@ -136,7 +136,7 @@ static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
        if (cpu_check_affinity(irq, dest))
                return;
 
-       irq_desc[irq].affinity = *dest;
+       cpumask_copy(irq_desc[irq].affinity, dest);
 }
 #endif
 
@@ -295,7 +295,7 @@ int txn_alloc_irq(unsigned int bits_wide)
 unsigned long txn_affinity_addr(unsigned int irq, int cpu)
 {
 #ifdef CONFIG_SMP
-       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+       cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
 #endif
 
        return per_cpu(cpu_data, cpu).txn_addr;
@@ -352,7 +352,7 @@ void do_cpu_irq_mask(struct pt_regs *regs)
        irq = eirr_to_irq(eirr_val);
 
 #ifdef CONFIG_SMP
-       dest = irq_desc[irq].affinity;
+       cpumask_copy(&dest, irq_desc[irq].affinity);
        if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) &&
            !cpu_isset(smp_processor_id(), dest)) {
                int cpu = first_cpu(dest);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 23b8b5e..ad1e5ac 100644 (file)
@@ -231,7 +231,7 @@ void fixup_irqs(cpumask_t map)
                if (irq_desc[irq].status & IRQ_PER_CPU)
                        continue;
 
-               cpus_and(mask, irq_desc[irq].affinity, map);
+               cpumask_and(&mask, irq_desc[irq].affinity, &map);
                if (any_online_cpu(mask) == NR_CPUS) {
                        printk("Breaking affinity for irq %i\n", irq);
                        mask = map;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 161b9b9..295ccc5 100644 (file)
@@ -184,6 +184,7 @@ SECTIONS
        . = ALIGN(PAGE_SIZE);
        .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
                __per_cpu_start = .;
+               *(.data.percpu.page_aligned)
                *(.data.percpu)
                *(.data.percpu.shared_aligned)
                __per_cpu_end = .;
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 84e058f..80b5134 100644 (file)
@@ -153,9 +153,10 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check)
 {
        int server;
        /* For the moment only implement delivery to all cpus or one cpu */
-       cpumask_t cpumask = irq_desc[virq].affinity;
+       cpumask_t cpumask;
        cpumask_t tmp = CPU_MASK_NONE;
 
+       cpumask_copy(&cpumask, irq_desc[virq].affinity);
        if (!distribute_irqs)
                return default_server;
 
@@ -869,7 +870,7 @@ void xics_migrate_irqs_away(void)
                       virq, cpu);
 
                /* Reset affinity to all cpus */
-               irq_desc[virq].affinity = CPU_MASK_ALL;
+               cpumask_setall(irq_desc[virq].affinity);
                desc->chip->set_affinity(virq, cpu_all_mask);
 unlock:
                spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index a35297d..532e205 100644 (file)
@@ -566,9 +566,10 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic)
 #ifdef CONFIG_SMP
 static int irq_choose_cpu(unsigned int virt_irq)
 {
-       cpumask_t mask = irq_desc[virt_irq].affinity;
+       cpumask_t mask;
        int cpuid;
 
+       cpumask_copy(&mask, irq_desc[virt_irq].affinity);
        if (cpus_equal(mask, CPU_MASK_ALL)) {
                static int irq_rover;
                static DEFINE_SPINLOCK(irq_rover_lock);
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index e289376..3d2c6ba 100644 (file)
@@ -252,9 +252,10 @@ struct irq_handler_data {
 #ifdef CONFIG_SMP
 static int irq_choose_cpu(unsigned int virt_irq)
 {
-       cpumask_t mask = irq_desc[virt_irq].affinity;
+       cpumask_t mask;
        int cpuid;
 
+       cpumask_copy(&mask, irq_desc[virt_irq].affinity);
        if (cpus_equal(mask, CPU_MASK_ALL)) {
                static int irq_rover;
                static DEFINE_SPINLOCK(irq_rover_lock);
@@ -796,7 +797,7 @@ void fixup_irqs(void)
                    !(irq_desc[irq].status & IRQ_PER_CPU)) {
                        if (irq_desc[irq].chip->set_affinity)
                                irq_desc[irq].chip->set_affinity(irq,
-                                       &irq_desc[irq].affinity);
+                                       irq_desc[irq].affinity);
                }
                spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
        }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 2db3c22..db310aa 100644 (file)
@@ -729,7 +729,7 @@ void timer_interrupt(int irq, struct pt_regs *regs)
 
        irq_enter();
 
-       kstat_this_cpu.irqs[0]++;
+       kstat_incr_irqs_this_cpu(0, irq_to_desc(0));
 
        if (unlikely(!evt->event_handler)) {
                printk(KERN_WARNING
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8..d6218e6 100644 (file)
@@ -133,7 +133,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
        def_bool y
 
 config HAVE_SETUP_PER_CPU_AREA
-       def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
+       def_bool y
 
 config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
@@ -391,6 +391,13 @@ config X86_RDC321X
          as R-8610-(G).
          If you don't have one of these chips, you should say N here.
 
+config X86_UV
+       bool "SGI Ultraviolet"
+       depends on X86_64
+       help
+         This option is needed in order to support SGI Ultraviolet systems.
+         If you don't have one of these, you should say N here.
+
 config SCHED_OMIT_FRAME_POINTER
        def_bool y
        prompt "Single-depth WCHAN output"
@@ -1340,13 +1347,17 @@ config SECCOMP
 
          If unsure, say Y. Only embedded should say N here.
 
+config CC_STACKPROTECTOR_ALL
+       bool
+
 config CC_STACKPROTECTOR
        bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
-       depends on X86_64 && EXPERIMENTAL && BROKEN
+       depends on X86_64
+       select CC_STACKPROTECTOR_ALL
        help
-         This option turns on the -fstack-protector GCC feature. This
-         feature puts, at the beginning of critical functions, a canary
-         value on the stack just before the return address, and validates
+          This option turns on the -fstack-protector GCC feature. This
+         feature puts, at the beginning of functions, a canary value on
+         the stack just before the return address, and validates
          the value just before actually returning.  Stack based buffer
          overflows (that need to overwrite this return address) now also
          overwrite the canary, which gets detected and the attack is then
@@ -1354,15 +1365,8 @@ config CC_STACKPROTECTOR
 
          This feature requires gcc version 4.2 or above, or a distribution
          gcc with the feature backported. Older versions are automatically
-         detected and for those versions, this configuration option is ignored.
-
-config CC_STACKPROTECTOR_ALL
-       bool "Use stack-protector for all functions"
-       depends on CC_STACKPROTECTOR
-       help
-         Normally, GCC only inserts the canary value protection for
-         functions that use large-ish on-stack buffers. By enabling
-         this option, GCC will be asked to do this for ALL functions.
+         detected and for those versions, this configuration option is
+         ignored. (and a warning is printed during bootup)
 
 source kernel/Kconfig.hz
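
For illustration only (not part of this commit): with CC_STACKPROTECTOR
enabled, gcc instruments a function with an on-stack buffer roughly as
sketched below; copy_name() is a made-up example, and on x86-64 the canary
value is read through the %gs-based per-cpu area that this series sets up.

        #include <linux/string.h>

        /* Conceptual view of a function gcc would instrument. */
        void copy_name(char *dst, const char *src)
        {
                char buf[64];

                /* prologue (added by gcc): place the canary value on the
                 * stack just before the return address */
                strcpy(buf, src);       /* the unbounded copy the canary guards */
                strcpy(dst, buf);
                /* epilogue (added by gcc): if the saved canary was modified,
                 * __stack_chk_fail() is called instead of returning */
        }
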
 
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8078955..8eb50ba 100644 (file)
@@ -292,25 +292,23 @@ config X86_CPU
 # Define implied options from the CPU selection here
 config X86_L1_CACHE_BYTES
        int
-       default "128" if GENERIC_CPU || MPSC
-       default "64" if MK8 || MCORE2
-       depends on X86_64
+       default "128" if MPSC
+       default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32
 
 config X86_INTERNODE_CACHE_BYTES
        int
        default "4096" if X86_VSMP
        default X86_L1_CACHE_BYTES if !X86_VSMP
-       depends on X86_64
 
 config X86_CMPXCHG
        def_bool X86_64 || (X86_32 && !M386)
 
 config X86_L1_CACHE_SHIFT
        int
-       default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
+       default "7" if MPENTIUM4 || MPSC
        default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
        default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
-       default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
+       default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU
 
 config X86_XADD
        def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 10d6cc3..28f1114 100644 (file)
@@ -117,6 +117,7 @@ config DEBUG_RODATA
 config DEBUG_RODATA_TEST
        bool "Testcase for the DEBUG_RODATA feature"
        depends on DEBUG_RODATA
+       default y
        help
          This option enables a testcase for the DEBUG_RODATA
          feature as well as for the change_page_attr() infrastructure.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47ad..cacee98 100644 (file)
@@ -73,7 +73,7 @@ else
 
         stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
         stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
-                "$(CC)" -fstack-protector )
+                "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
         stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
                 "$(CC)" -fstack-protector-all )
 
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5a0d76d..097a6b6 100644 (file)
@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target)
        CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rsp,rbp
        SWAPGS_UNSAFE_STACK
-       movq    %gs:pda_kernelstack, %rsp
-       addq    $(PDA_STACKOFFSET),%rsp 
+       movq    PER_CPU_VAR(kernel_stack), %rsp
+       addq    $(KERNEL_STACK_OFFSET),%rsp
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs, here we enable it straight after entry:
@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target)
 ENTRY(ia32_cstar_target)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
+       CFI_DEF_CFA     rsp,KERNEL_STACK_OFFSET
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
-       movq    %gs:pda_kernelstack,%rsp
+       movq    PER_CPU_VAR(kernel_stack),%rsp
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs and here we enable it straight after entry:
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644 (file)
index 0000000..82f613c
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_APICNUM_H
+#define _ASM_X86_APICNUM_H
+
+/* define MAX_IO_APICS */
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bae482d..f03b23e 100644 (file)
@@ -7,6 +7,20 @@
 #include <linux/nodemask.h>
 #include <linux/percpu.h>
 
+#ifdef CONFIG_SMP
+
+extern void prefill_possible_map(void);
+
+#else /* CONFIG_SMP */
+
+static inline void prefill_possible_map(void) {}
+
+#define cpu_physical_id(cpu)                   boot_cpu_physical_apicid
+#define safe_smp_processor_id()                        0
+#define stack_smp_processor_id()               0
+
+#endif /* CONFIG_SMP */
+
 struct x86_cpu {
        struct cpu cpu;
 };
@@ -17,4 +31,11 @@ extern void arch_unregister_cpu(int);
 #endif
 
 DECLARE_PER_CPU(int, cpu_state);
+
+#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
+extern unsigned char boot_cpu_id;
+#else
+#define boot_cpu_id                            0
+#endif
+
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
new file mode 100644 (file)
index 0000000..a7f3c75
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef _ASM_X86_CPUMASK_H
+#define _ASM_X86_CPUMASK_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_X86_64
+
+extern cpumask_var_t cpu_callin_mask;
+extern cpumask_var_t cpu_callout_mask;
+extern cpumask_var_t cpu_initialized_mask;
+extern cpumask_var_t cpu_sibling_setup_mask;
+
+extern void setup_cpu_local_masks(void);
+
+#else /* CONFIG_X86_32 */
+
+extern cpumask_t cpu_callin_map;
+extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_initialized;
+extern cpumask_t cpu_sibling_setup_map;
+
+#define cpu_callin_mask                ((struct cpumask *)&cpu_callin_map)
+#define cpu_callout_mask       ((struct cpumask *)&cpu_callout_map)
+#define cpu_initialized_mask   ((struct cpumask *)&cpu_initialized)
+#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
+
+static inline void setup_cpu_local_masks(void) { }
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f..c68c361 100644 (file)
@@ -1,39 +1,21 @@
 #ifndef _ASM_X86_CURRENT_H
 #define _ASM_X86_CURRENT_H
 
-#ifdef CONFIG_X86_32
 #include <linux/compiler.h>
 #include <asm/percpu.h>
 
+#ifndef __ASSEMBLY__
 struct task_struct;
 
 DECLARE_PER_CPU(struct task_struct *, current_task);
-static __always_inline struct task_struct *get_current(void)
-{
-       return x86_read_percpu(current_task);
-}
-
-#else /* X86_32 */
-
-#ifndef __ASSEMBLY__
-#include <asm/pda.h>
-
-struct task_struct;
 
 static __always_inline struct task_struct *get_current(void)
 {
-       return read_pda(pcurrent);
+       return percpu_read(current_task);
 }
 
-#else /* __ASSEMBLY__ */
-
-#include <asm/asm-offsets.h>
-#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
+#define current get_current()
 
 #endif /* __ASSEMBLY__ */
 
-#endif /* X86_32 */
-
-#define current get_current()
-
 #endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 2c05b73..4334502 100644 (file)
@@ -138,11 +138,4 @@ struct genapic {
 extern struct genapic *genapic;
 extern void es7000_update_genapic_to_cluster(void);
 
-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
-#define get_uv_system_type()           UV_NONE
-#define is_uv_system()                 0
-#define uv_wakeup_secondary(a, b)      1
-#define uv_system_init()               do {} while (0)
-
-
 #endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index adf32fb..7bb092c 100644 (file)
@@ -51,15 +51,9 @@ extern struct genapic apic_x2apic_phys;
 extern int acpi_madt_oem_check(char *, char *);
 
 extern void apic_send_IPI_self(int vector);
-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
-extern enum uv_system_type get_uv_system_type(void);
-extern int is_uv_system(void);
 
 extern struct genapic apic_x2apic_uv_x;
 DECLARE_PER_CPU(int, x2apic_extra_bits);
-extern void uv_cpu_init(void);
-extern void uv_system_init(void);
-extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
 
 extern void setup_apic_routing(void);
 
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 000787d..176f058 100644 (file)
@@ -1,11 +1,52 @@
-#ifdef CONFIG_X86_32
-# include "hardirq_32.h"
-#else
-# include "hardirq_64.h"
+#ifndef _ASM_X86_HARDIRQ_H
+#define _ASM_X86_HARDIRQ_H
+
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+typedef struct {
+       unsigned int __softirq_pending;
+       unsigned int __nmi_count;       /* arch dependent */
+       unsigned int irq0_irqs;
+#ifdef CONFIG_X86_LOCAL_APIC
+       unsigned int apic_timer_irqs;   /* arch dependent */
+       unsigned int irq_spurious_count;
+#endif
+#ifdef CONFIG_SMP
+       unsigned int irq_resched_count;
+       unsigned int irq_call_count;
+       unsigned int irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+       unsigned int irq_thermal_count;
+# ifdef CONFIG_X86_64
+       unsigned int irq_threshold_count;
+# endif
 #endif
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
+/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
+#define MAX_HARDIRQS_PER_CPU NR_VECTORS
+
+#define __ARCH_IRQ_STAT
+
+#define inc_irq_stat(member)   percpu_add(irq_stat.member, 1)
+
+#define local_softirq_pending()        percpu_read(irq_stat.__softirq_pending)
+
+#define __ARCH_SET_SOFTIRQ_PENDING
+
+#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x)  percpu_or(irq_stat.__softirq_pending, (x))
+
+extern void ack_bad_irq(unsigned int irq);
 
 extern u64 arch_irq_stat_cpu(unsigned int cpu);
 #define arch_irq_stat_cpu      arch_irq_stat_cpu
 
 extern u64 arch_irq_stat(void);
 #define arch_irq_stat          arch_irq_stat
+
+#endif /* _ASM_X86_HARDIRQ_H */
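
With the 32-bit and 64-bit variants folded into one header, handlers update
their statistics the same way on both; roughly how the reschedule IPI handler
uses the macro (illustrative, not part of this hunk):

        #include <linux/ptrace.h>
        #include <asm/apic.h>
        #include <asm/hardirq.h>

        void smp_reschedule_interrupt(struct pt_regs *regs)
        {
                ack_APIC_irq();
                /* expands to a single per-cpu add: no PDA access on 64-bit,
                 * no open-coded __get_cpu_var() on 32-bit */
                inc_irq_stat(irq_resched_count);
                /* nothing else to do; returning from the IPI triggers the
                 * reschedule check */
        }
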
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
deleted file mode 100644 (file)
index cf7954d..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _ASM_X86_HARDIRQ_32_H
-#define _ASM_X86_HARDIRQ_32_H
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-typedef struct {
-       unsigned int __softirq_pending;
-       unsigned long idle_timestamp;
-       unsigned int __nmi_count;       /* arch dependent */
-       unsigned int apic_timer_irqs;   /* arch dependent */
-       unsigned int irq0_irqs;
-       unsigned int irq_resched_count;
-       unsigned int irq_call_count;
-       unsigned int irq_tlb_count;
-       unsigned int irq_thermal_count;
-       unsigned int irq_spurious_count;
-} ____cacheline_aligned irq_cpustat_t;
-
-DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
-
-#define __ARCH_IRQ_STAT
-#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
-
-#define inc_irq_stat(member)   (__get_cpu_var(irq_stat).member++)
-
-void ack_bad_irq(unsigned int irq);
-#include <linux/irq_cpustat.h>
-
-#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
deleted file mode 100644 (file)
index b5a6b5d..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _ASM_X86_HARDIRQ_64_H
-#define _ASM_X86_HARDIRQ_64_H
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-#include <asm/pda.h>
-#include <asm/apic.h>
-
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
-#define __ARCH_IRQ_STAT 1
-
-#define inc_irq_stat(member)   add_pda(member, 1)
-
-#define local_softirq_pending() read_pda(__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING 1
-
-#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
-#define or_softirq_pending(x)  or_pda(__softirq_pending, (x))
-
-extern void ack_bad_irq(unsigned int irq);
-
-#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7a1f44a..08ec793 100644 (file)
@@ -114,38 +114,16 @@ struct IR_IO_APIC_route_entry {
 extern int nr_ioapics;
 extern int nr_ioapic_registers[MAX_IO_APICS];
 
-/*
- * MP-BIOS irq configuration table structures:
- */
-
 #define MP_MAX_IOAPIC_PIN 127
 
-struct mp_config_ioapic {
-       unsigned long mp_apicaddr;
-       unsigned int mp_apicid;
-       unsigned char mp_type;
-       unsigned char mp_apicver;
-       unsigned char mp_flags;
-};
-
-struct mp_config_intsrc {
-       unsigned int mp_dstapic;
-       unsigned char mp_type;
-       unsigned char mp_irqtype;
-       unsigned short mp_irqflag;
-       unsigned char mp_srcbus;
-       unsigned char mp_srcbusirq;
-       unsigned char mp_dstirq;
-};
-
 /* I/O APIC entries */
-extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
 
 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
 
 /* MP IRQ source entries */
-extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* non-0 if default (table-less) MP configuration */
 extern int mpc_default_type;
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
index 89c898a..7784322 100644 (file)
@@ -1,5 +1,31 @@
-#ifdef CONFIG_X86_32
-# include "irq_regs_32.h"
-#else
-# include "irq_regs_64.h"
-#endif
+/*
+ * Per-cpu current frame pointer - the location of the last exception frame on
+ * the stack, stored in the per-cpu area.
+ *
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+#ifndef _ASM_X86_IRQ_REGS_H
+#define _ASM_X86_IRQ_REGS_H
+
+#include <asm/percpu.h>
+
+#define ARCH_HAS_OWN_IRQ_REGS
+
+DECLARE_PER_CPU(struct pt_regs *, irq_regs);
+
+static inline struct pt_regs *get_irq_regs(void)
+{
+       return percpu_read(irq_regs);
+}
+
+static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
+{
+       struct pt_regs *old_regs;
+
+       old_regs = get_irq_regs();
+       percpu_write(irq_regs, new_regs);
+
+       return old_regs;
+}
+
+#endif /* _ASM_X86_IRQ_REGS_32_H */
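
The usual calling pattern (unchanged by merging the 32-bit and 64-bit
headers) is to bracket interrupt handling with set_irq_regs() so that code
called from the handler can fetch the interrupted context; a sketch, with
handle_one_irq() as a made-up wrapper name:

        #include <linux/hardirq.h>
        #include <linux/irq.h>
        #include <asm/irq_regs.h>

        static void handle_one_irq(struct pt_regs *regs, unsigned int irq)
        {
                struct pt_regs *old_regs = set_irq_regs(regs);

                irq_enter();
                generic_handle_irq(irq);        /* handlers may call get_irq_regs() */
                irq_exit();

                set_irq_regs(old_regs);
        }
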
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
deleted file mode 100644 (file)
index 86afd74..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Per-cpu current frame pointer - the location of the last exception frame on
- * the stack, stored in the per-cpu area.
- *
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-#ifndef _ASM_X86_IRQ_REGS_32_H
-#define _ASM_X86_IRQ_REGS_32_H
-
-#include <asm/percpu.h>
-
-#define ARCH_HAS_OWN_IRQ_REGS
-
-DECLARE_PER_CPU(struct pt_regs *, irq_regs);
-
-static inline struct pt_regs *get_irq_regs(void)
-{
-       return x86_read_percpu(irq_regs);
-}
-
-static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
-{
-       struct pt_regs *old_regs;
-
-       old_regs = get_irq_regs();
-       x86_write_percpu(irq_regs, new_regs);
-
-       return old_regs;
-}
-
-#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
deleted file mode 100644 (file)
index 3dd9c0b..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff650..9a83a10 100644 (file)
  *  some of the following vectors are 'rare', they are merged
  *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
  *  TLB, reschedule and local APIC vectors are performance-critical.
- *
- *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
  */
 #ifdef CONFIG_X86_32
 
 # define SPURIOUS_APIC_VECTOR          0xff
 # define ERROR_APIC_VECTOR             0xfe
-# define INVALIDATE_TLB_VECTOR         0xfd
-# define RESCHEDULE_VECTOR             0xfc
-# define CALL_FUNCTION_VECTOR          0xfb
-# define CALL_FUNCTION_SINGLE_VECTOR   0xfa
-# define THERMAL_APIC_VECTOR           0xf0
+# define RESCHEDULE_VECTOR             0xfd
+# define CALL_FUNCTION_VECTOR          0xfc
+# define CALL_FUNCTION_SINGLE_VECTOR   0xfb
+# define THERMAL_APIC_VECTOR           0xfa
+/* 0xf8 - 0xf9 : free */
+# define INVALIDATE_TLB_VECTOR_END     0xf7
+# define INVALIDATE_TLB_VECTOR_START   0xf0    /* f0-f7 used for TLB flush */
+
+# define NUM_INVALIDATE_TLB_VECTORS    8
 
 #else
 
-#define SPURIOUS_APIC_VECTOR           0xff
-#define ERROR_APIC_VECTOR              0xfe
-#define RESCHEDULE_VECTOR              0xfd
-#define CALL_FUNCTION_VECTOR           0xfc
-#define CALL_FUNCTION_SINGLE_VECTOR    0xfb
-#define THERMAL_APIC_VECTOR            0xfa
-#define THRESHOLD_APIC_VECTOR          0xf9
-#define UV_BAU_MESSAGE                 0xf8
-#define INVALIDATE_TLB_VECTOR_END      0xf7
-#define INVALIDATE_TLB_VECTOR_START    0xf0    /* f0-f7 used for TLB flush */
+# define SPURIOUS_APIC_VECTOR          0xff
+# define ERROR_APIC_VECTOR             0xfe
+# define RESCHEDULE_VECTOR             0xfd
+# define CALL_FUNCTION_VECTOR          0xfc
+# define CALL_FUNCTION_SINGLE_VECTOR   0xfb
+# define THERMAL_APIC_VECTOR           0xfa
+# define THRESHOLD_APIC_VECTOR         0xf9
+# define UV_BAU_MESSAGE                        0xf8
+# define INVALIDATE_TLB_VECTOR_END     0xf7
+# define INVALIDATE_TLB_VECTOR_START   0xf0    /* f0-f7 used for TLB flush */
 
 #define NUM_INVALIDATE_TLB_VECTORS     8
 
 
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 
+#include <asm/apicnum.h>       /* need MAX_IO_APICS */
+
 #ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
 #else
-# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
-#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
-# else
-#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
-# endif
+
+# define NR_IRQS                                       \
+       ((8 * NR_CPUS) > (32 * MAX_IO_APICS) ?          \
+               (NR_VECTORS + (8 * NR_CPUS)) :          \
+               (NR_VECTORS + (32 * MAX_IO_APICS)))     \
+
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8..6fa399a 100644 (file)
  */
 #ifdef CONFIG_X86_SMP
 BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
-BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+
+BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
+                smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
+                smp_invalidate_interrupt)
 #endif
 
 /*
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8aeeb3f..52948df 100644 (file)
@@ -21,11 +21,54 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 void destroy_context(struct mm_struct *mm);
 
-#ifdef CONFIG_X86_32
-# include "mmu_context_32.h"
-#else
-# include "mmu_context_64.h"
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#ifdef CONFIG_SMP
+       if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+               percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+                            struct task_struct *tsk)
+{
+       unsigned cpu = smp_processor_id();
+
+       if (likely(prev != next)) {
+               /* stop flush ipis for the previous mm */
+               cpu_clear(cpu, prev->cpu_vm_mask);
+#ifdef CONFIG_SMP
+               percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               percpu_write(cpu_tlbstate.active_mm, next);
 #endif
+               cpu_set(cpu, next->cpu_vm_mask);
+
+               /* Re-load page tables */
+               load_cr3(next->pgd);
+
+               /*
+                * load the LDT, if the LDT is different:
+                */
+               if (unlikely(prev->context.ldt != next->context.ldt))
+                       load_LDT_nolock(&next->context);
+       }
+#ifdef CONFIG_SMP
+       else {
+               percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+                       /* We were in lazy tlb mode and leave_mm disabled
+                        * tlb flush IPI delivery. We must reload CR3
+                        * to make sure to use no freed page tables.
+                        */
+                       load_cr3(next->pgd);
+                       load_LDT_nolock(&next->context);
+               }
+       }
+#endif
+}
 
 #define activate_mm(prev, next)                        \
 do {                                           \
@@ -33,5 +76,17 @@ do {                                         \
        switch_mm((prev), (next), NULL);        \
 } while (0);
 
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm)                 \
+do {                                           \
+       loadsegment(gs, 0);                     \
+} while (0)
+#else
+#define deactivate_mm(tsk, mm)                 \
+do {                                           \
+       load_gs_index(0);                       \
+       loadsegment(fs, 0);                     \
+} while (0)
+#endif
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
deleted file mode 100644 (file)
index 7e98ce1..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_32_H
-#define _ASM_X86_MMU_CONTEXT_32_H
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#ifdef CONFIG_SMP
-       if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
-               x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
-}
-
-static inline void switch_mm(struct mm_struct *prev,
-                            struct mm_struct *next,
-                            struct task_struct *tsk)
-{
-       int cpu = smp_processor_id();
-
-       if (likely(prev != next)) {
-               /* stop flush ipis for the previous mm */
-               cpu_clear(cpu, prev->cpu_vm_mask);
-#ifdef CONFIG_SMP
-               x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-               x86_write_percpu(cpu_tlbstate.active_mm, next);
-#endif
-               cpu_set(cpu, next->cpu_vm_mask);
-
-               /* Re-load page tables */
-               load_cr3(next->pgd);
-
-               /*
-                * load the LDT, if the LDT is different:
-                */
-               if (unlikely(prev->context.ldt != next->context.ldt))
-                       load_LDT_nolock(&next->context);
-       }
-#ifdef CONFIG_SMP
-       else {
-               x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-               BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
-
-               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
-                       /* We were in lazy tlb mode and leave_mm disabled
-                        * tlb flush IPI delivery. We must reload %cr3.
-                        */
-                       load_cr3(next->pgd);
-                       load_LDT_nolock(&next->context);
-               }
-       }
-#endif
-}
-
-#define deactivate_mm(tsk, mm)                 \
-       asm("movl %0,%%gs": :"r" (0));
-
-#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
deleted file mode 100644 (file)
index 677d36e..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_64_H
-#define _ASM_X86_MMU_CONTEXT_64_H
-
-#include <asm/pda.h>
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#ifdef CONFIG_SMP
-       if (read_pda(mmu_state) == TLBSTATE_OK)
-               write_pda(mmu_state, TLBSTATE_LAZY);
-#endif
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-                            struct task_struct *tsk)
-{
-       unsigned cpu = smp_processor_id();
-       if (likely(prev != next)) {
-               /* stop flush ipis for the previous mm */
-               cpu_clear(cpu, prev->cpu_vm_mask);
-#ifdef CONFIG_SMP
-               write_pda(mmu_state, TLBSTATE_OK);
-               write_pda(active_mm, next);
-#endif
-               cpu_set(cpu, next->cpu_vm_mask);
-               load_cr3(next->pgd);
-
-               if (unlikely(next->context.ldt != prev->context.ldt))
-                       load_LDT_nolock(&next->context);
-       }
-#ifdef CONFIG_SMP
-       else {
-               write_pda(mmu_state, TLBSTATE_OK);
-               if (read_pda(active_mm) != next)
-                       BUG();
-               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
-                       /* We were in lazy tlb mode and leave_mm disabled
-                        * tlb flush IPI delivery. We must reload CR3
-                        * to make sure to use no freed page tables.
-                        */
-                       load_cr3(next->pgd);
-                       load_LDT_nolock(&next->context);
-               }
-       }
-#endif
-}
-
-#define deactivate_mm(tsk, mm)                 \
-do {                                           \
-       load_gs_index(0);                       \
-       asm volatile("movl %0,%%fs"::"r"(0));   \
-} while (0)
-
-#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 59568bc..4a7f96d 100644 (file)
 # endif
 #endif
 
-struct intel_mp_floating {
-       char mpf_signature[4];          /* "_MP_"                       */
-       unsigned int mpf_physptr;       /* Configuration table address  */
-       unsigned char mpf_length;       /* Our length (paragraphs)      */
-       unsigned char mpf_specification;/* Specification version        */
-       unsigned char mpf_checksum;     /* Checksum (makes sum 0)       */
-       unsigned char mpf_feature1;     /* Standard or configuration ?  */
-       unsigned char mpf_feature2;     /* Bit7 set for IMCR|PIC        */
-       unsigned char mpf_feature3;     /* Unused (0)                   */
-       unsigned char mpf_feature4;     /* Unused (0)                   */
-       unsigned char mpf_feature5;     /* Unused (0)                   */
+/* Intel MP Floating Pointer Structure */
+struct mpf_intel {
+       char signature[4];              /* "_MP_"                       */
+       unsigned int physptr;           /* Configuration table address  */
+       unsigned char length;           /* Our length (paragraphs)      */
+       unsigned char specification;    /* Specification version        */
+       unsigned char checksum;         /* Checksum (makes sum 0)       */
+       unsigned char feature1;         /* Standard or configuration ?  */
+       unsigned char feature2;         /* Bit7 set for IMCR|PIC        */
+       unsigned char feature3;         /* Unused (0)                   */
+       unsigned char feature4;         /* Unused (0)                   */
+       unsigned char feature5;         /* Unused (0)                   */
 };
 
 #define MPC_SIGNATURE "PCMP"
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 823cc93..4022699 100644 (file)
@@ -163,7 +163,7 @@ static inline pteval_t native_pte_val(pte_t pte)
        return pte.pte;
 }
 
-static inline pteval_t native_pte_flags(pte_t pte)
+static inline pteval_t pte_flags(pte_t pte)
 {
        return native_pte_val(pte) & PTE_FLAGS_MASK;
 }
@@ -189,7 +189,6 @@ static inline pteval_t native_pte_flags(pte_t pte)
 #endif
 
 #define pte_val(x)     native_pte_val(x)
-#define pte_flags(x)   native_pte_flags(x)
 #define __pte(x)       native_make_pte(x)
 
 #endif /* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 5ebca29..e27fdbe 100644 (file)
@@ -13,8 +13,8 @@
 #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
 #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
 
-#define IRQSTACK_ORDER 2
-#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
+#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
 
 #define STACKFAULT_STACK 1
 #define DOUBLEFAULT_STACK 2
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff..c85e747 100644 (file)
 #define CLBR_EAX  (1 << 0)
 #define CLBR_ECX  (1 << 1)
 #define CLBR_EDX  (1 << 2)
+#define CLBR_EDI  (1 << 3)
 
-#ifdef CONFIG_X86_64
-#define CLBR_RSI  (1 << 3)
-#define CLBR_RDI  (1 << 4)
+#ifdef CONFIG_X86_32
+/* CLBR_ANY should match all regs platform has. For i386, that's just it */
+#define CLBR_ANY  ((1 << 4) - 1)
+
+#define CLBR_ARG_REGS  (CLBR_EAX | CLBR_EDX | CLBR_ECX)
+#define CLBR_RET_REG   (CLBR_EAX | CLBR_EDX)
+#define CLBR_SCRATCH   (0)
+#else
+#define CLBR_RAX  CLBR_EAX
+#define CLBR_RCX  CLBR_ECX
+#define CLBR_RDX  CLBR_EDX
+#define CLBR_RDI  CLBR_EDI
+#define CLBR_RSI  (1 << 4)
 #define CLBR_R8   (1 << 5)
 #define CLBR_R9   (1 << 6)
 #define CLBR_R10  (1 << 7)
 #define CLBR_R11  (1 << 8)
+
 #define CLBR_ANY  ((1 << 9) - 1)
+
+#define CLBR_ARG_REGS  (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
+                        CLBR_RCX | CLBR_R8 | CLBR_R9)
+#define CLBR_RET_REG   (CLBR_RAX)
+#define CLBR_SCRATCH   (CLBR_R10 | CLBR_R11)
+
 #include <asm/desc_defs.h>
-#else
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY  ((1 << 3) - 1)
 #endif /* X86_64 */
 
+#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <linux/cpumask.h>
@@ -40,6 +57,14 @@ struct tss_struct;
 struct mm_struct;
 struct desc_struct;
 
+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention.  See PV_CALLEE_SAVE_REGS_THUNK below.
+ */
+struct paravirt_callee_save {
+       void *func;
+};
+
 /* general info */
 struct pv_info {
        unsigned int kernel_rpl;
@@ -189,11 +214,15 @@ struct pv_irq_ops {
         * expected to use X86_EFLAGS_IF; all other bits
         * returned from save_fl are undefined, and may be ignored by
         * restore_fl.
+        *
+        * NOTE: The callers of these functions expect the callee to
+        * preserve more registers than the standard C calling convention
+        * requires.
         */
-       unsigned long (*save_fl)(void);
-       void (*restore_fl)(unsigned long);
-       void (*irq_disable)(void);
-       void (*irq_enable)(void);
+       struct paravirt_callee_save save_fl;
+       struct paravirt_callee_save restore_fl;
+       struct paravirt_callee_save irq_disable;
+       struct paravirt_callee_save irq_enable;
+
        void (*safe_halt)(void);
        void (*halt)(void);
 
@@ -244,7 +273,8 @@ struct pv_mmu_ops {
        void (*flush_tlb_user)(void);
        void (*flush_tlb_kernel)(void);
        void (*flush_tlb_single)(unsigned long addr);
-       void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
+       void (*flush_tlb_others)(const struct cpumask *cpus,
+                                struct mm_struct *mm,
                                 unsigned long va);
 
        /* Hooks for allocating and freeing a pagetable top-level */
@@ -278,12 +308,11 @@ struct pv_mmu_ops {
        void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep, pte_t pte);
 
-       pteval_t (*pte_val)(pte_t);
-       pteval_t (*pte_flags)(pte_t);
-       pte_t (*make_pte)(pteval_t pte);
+       struct paravirt_callee_save pte_val;
+       struct paravirt_callee_save make_pte;
 
-       pgdval_t (*pgd_val)(pgd_t);
-       pgd_t (*make_pgd)(pgdval_t pgd);
+       struct paravirt_callee_save pgd_val;
+       struct paravirt_callee_save make_pgd;
 
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
@@ -298,12 +327,12 @@ struct pv_mmu_ops {
 
        void (*set_pud)(pud_t *pudp, pud_t pudval);
 
-       pmdval_t (*pmd_val)(pmd_t);
-       pmd_t (*make_pmd)(pmdval_t pmd);
+       struct paravirt_callee_save pmd_val;
+       struct paravirt_callee_save make_pmd;
 
 #if PAGETABLE_LEVELS == 4
-       pudval_t (*pud_val)(pud_t);
-       pud_t (*make_pud)(pudval_t pud);
+       struct paravirt_callee_save pud_val;
+       struct paravirt_callee_save make_pud;
 
        void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
 #endif /* PAGETABLE_LEVELS == 4 */
@@ -388,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
        asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
 
 unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
 unsigned paravirt_patch_ignore(unsigned len);
 unsigned paravirt_patch_call(void *insnbuf,
                             const void *target, u16 tgt_clobbers,
@@ -479,25 +510,45 @@ int paravirt_disable_iospace(void);
  * makes sure the incoming and outgoing types are always correct.
  */
 #ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS                        unsigned long __eax, __edx, __ecx
+#define PVOP_VCALL_ARGS                                \
+       unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
 #define PVOP_CALL_ARGS                 PVOP_VCALL_ARGS
+
+#define PVOP_CALL_ARG1(x)              "a" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x)              "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x)              "c" ((unsigned long)(x))
+
 #define PVOP_VCALL_CLOBBERS            "=a" (__eax), "=d" (__edx),     \
                                        "=c" (__ecx)
 #define PVOP_CALL_CLOBBERS             PVOP_VCALL_CLOBBERS
+
+#define PVOP_VCALLEE_CLOBBERS          "=a" (__eax), "=d" (__edx)
+#define PVOP_CALLEE_CLOBBERS           PVOP_VCALLEE_CLOBBERS
+
 #define EXTRA_CLOBBERS
 #define VEXTRA_CLOBBERS
-#else
-#define PVOP_VCALL_ARGS                unsigned long __edi, __esi, __edx, __ecx
+#else  /* CONFIG_X86_64 */
+#define PVOP_VCALL_ARGS                                        \
+       unsigned long __edi = __edi, __esi = __esi,     \
+               __edx = __edx, __ecx = __ecx
 #define PVOP_CALL_ARGS         PVOP_VCALL_ARGS, __eax
+
+#define PVOP_CALL_ARG1(x)              "D" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x)              "S" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x)              "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG4(x)              "c" ((unsigned long)(x))
+
 #define PVOP_VCALL_CLOBBERS    "=D" (__edi),                           \
                                "=S" (__esi), "=d" (__edx),             \
                                "=c" (__ecx)
-
 #define PVOP_CALL_CLOBBERS     PVOP_VCALL_CLOBBERS, "=a" (__eax)
 
+#define PVOP_VCALLEE_CLOBBERS  "=a" (__eax)
+#define PVOP_CALLEE_CLOBBERS   PVOP_VCALLEE_CLOBBERS
+
 #define EXTRA_CLOBBERS  , "r8", "r9", "r10", "r11"
 #define VEXTRA_CLOBBERS         , "rax", "r8", "r9", "r10", "r11"
-#endif
+#endif /* CONFIG_X86_32 */
 
 #ifdef CONFIG_PARAVIRT_DEBUG
 #define PVOP_TEST_NULL(op)     BUG_ON(op == NULL)
@@ -505,10 +556,11 @@ int paravirt_disable_iospace(void);
 #define PVOP_TEST_NULL(op)     ((void)op)
 #endif
 
-#define __PVOP_CALL(rettype, op, pre, post, ...)                       \
+#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr,                \
+                     pre, post, ...)                                   \
        ({                                                              \
                rettype __ret;                                          \
-               PVOP_CALL_ARGS;                                 \
+               PVOP_CALL_ARGS;                                         \
                PVOP_TEST_NULL(op);                                     \
                /* This is 32-bit specific, but is okay in 64-bit */    \
                /* since this condition will never hold */              \
@@ -516,70 +568,113 @@ int paravirt_disable_iospace(void);
                        asm volatile(pre                                \
                                     paravirt_alt(PARAVIRT_CALL)        \
                                     post                               \
-                                    : PVOP_CALL_CLOBBERS               \
+                                    : call_clbr                        \
                                     : paravirt_type(op),               \
-                                      paravirt_clobber(CLBR_ANY),      \
+                                      paravirt_clobber(clbr),          \
                                       ##__VA_ARGS__                    \
-                                    : "memory", "cc" EXTRA_CLOBBERS);  \
+                                    : "memory", "cc" extra_clbr);      \
                        __ret = (rettype)((((u64)__edx) << 32) | __eax); \
                } else {                                                \
                        asm volatile(pre                                \
                                     paravirt_alt(PARAVIRT_CALL)        \
                                     post                               \
-                                    : PVOP_CALL_CLOBBERS               \
+                                    : call_clbr                        \
                                     : paravirt_type(op),               \
-                                      paravirt_clobber(CLBR_ANY),      \
+                                      paravirt_clobber(clbr),          \
                                       ##__VA_ARGS__                    \
-                                    : "memory", "cc" EXTRA_CLOBBERS);  \
+                                    : "memory", "cc" extra_clbr);      \
                        __ret = (rettype)__eax;                         \
                }                                                       \
                __ret;                                                  \
        })
-#define __PVOP_VCALL(op, pre, post, ...)                               \
+
+#define __PVOP_CALL(rettype, op, pre, post, ...)                       \
+       ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS,        \
+                     EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...)                 \
+       ____PVOP_CALL(rettype, op.func, CLBR_RET_REG,                   \
+                     PVOP_CALLEE_CLOBBERS, ,                           \
+                     pre, post, ##__VA_ARGS__)
+
+
+#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...)        \
        ({                                                              \
                PVOP_VCALL_ARGS;                                        \
                PVOP_TEST_NULL(op);                                     \
                asm volatile(pre                                        \
                             paravirt_alt(PARAVIRT_CALL)                \
                             post                                       \
-                            : PVOP_VCALL_CLOBBERS                      \
+                            : call_clbr                                \
                             : paravirt_type(op),                       \
-                              paravirt_clobber(CLBR_ANY),              \
+                              paravirt_clobber(clbr),                  \
                               ##__VA_ARGS__                            \
-                            : "memory", "cc" VEXTRA_CLOBBERS);         \
+                            : "memory", "cc" extra_clbr);              \
        })
 
+#define __PVOP_VCALL(op, pre, post, ...)                               \
+       ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS,               \
+                      VEXTRA_CLOBBERS,                                 \
+                      pre, post, ##__VA_ARGS__)
+
+#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...)                        \
+       ____PVOP_CALL(rettype, op.func, CLBR_RET_REG,                   \
+                     PVOP_VCALLEE_CLOBBERS, ,                          \
+                     pre, post, ##__VA_ARGS__)
+
+
+
 #define PVOP_CALL0(rettype, op)                                                \
        __PVOP_CALL(rettype, op, "", "")
 #define PVOP_VCALL0(op)                                                        \
        __PVOP_VCALL(op, "", "")
 
+#define PVOP_CALLEE0(rettype, op)                                      \
+       __PVOP_CALLEESAVE(rettype, op, "", "")
+#define PVOP_VCALLEE0(op)                                              \
+       __PVOP_VCALLEESAVE(op, "", "")
+
+
 #define PVOP_CALL1(rettype, op, arg1)                                  \
-       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
+       __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
 #define PVOP_VCALL1(op, arg1)                                          \
-       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
+       __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+
+#define PVOP_CALLEE1(rettype, op, arg1)                                        \
+       __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALLEE1(op, arg1)                                                \
+       __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+
 
 #define PVOP_CALL2(rettype, op, arg1, arg2)                            \
-       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),   \
-       "1" ((unsigned long)(arg2)))
+       __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1),          \
+                   PVOP_CALL_ARG2(arg2))
 #define PVOP_VCALL2(op, arg1, arg2)                                    \
-       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),           \
-       "1" ((unsigned long)(arg2)))
+       __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1),                  \
+                    PVOP_CALL_ARG2(arg2))
+
+#define PVOP_CALLEE2(rettype, op, arg1, arg2)                          \
+       __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1),    \
+                         PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALLEE2(op, arg1, arg2)                                  \
+       __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1),            \
+                          PVOP_CALL_ARG2(arg2))
+
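
Roughly, a CALLEE-save call expands as below on 64-bit (a heavily simplified sketch of ____PVOP_CALL; the real macro also handles return values wider than a long and the NULL check):

	/* PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val, pte.pte) ~= */
	asm volatile(paravirt_alt(PARAVIRT_CALL)
		     : "=a" (__eax)			/* only the return reg is clobbered */
		     : paravirt_type(pv_mmu_ops.pte_val.func),
		       paravirt_clobber(CLBR_RET_REG),
		       "D" ((unsigned long)(pte.pte))	/* arg1 in %rdi */
		     : "memory", "cc");
	ret = (pteval_t)__eax;

Because only %rax appears in the clobber list, gcc may keep live values in the argument and scratch registers across the call; the thunk installed behind the op guarantees they really are preserved.
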
 
 #define PVOP_CALL3(rettype, op, arg1, arg2, arg3)                      \
-       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),   \
-       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+       __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1),          \
+                   PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
 #define PVOP_VCALL3(op, arg1, arg2, arg3)                              \
-       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),           \
-       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+       __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1),                  \
+                    PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
 
 /* This is the only difference in x86_64. We can make it much simpler */
 #ifdef CONFIG_X86_32
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)                        \
        __PVOP_CALL(rettype, op,                                        \
                    "push %[_arg4];", "lea 4(%%esp),%%esp;",            \
-                   "0" ((u32)(arg1)), "1" ((u32)(arg2)),               \
-                   "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+                   PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),         \
+                   PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)                                \
        __PVOP_VCALL(op,                                                \
                    "push %[_arg4];", "lea 4(%%esp),%%esp;",            \
@@ -587,13 +682,13 @@ int paravirt_disable_iospace(void);
                    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
 #else
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)                        \
-       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),   \
-       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),         \
-       "3"((unsigned long)(arg4)))
+       __PVOP_CALL(rettype, op, "", "",                                \
+                   PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),         \
+                   PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)                                \
-       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),           \
-       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),         \
-       "3"((unsigned long)(arg4)))
+       __PVOP_VCALL(op, "", "",                                        \
+                    PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),        \
+                    PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 #endif
 
 static inline int paravirt_enabled(void)
@@ -984,10 +1079,11 @@ static inline void __flush_tlb_single(unsigned long addr)
        PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
 }
 
-static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+static inline void flush_tlb_others(const struct cpumask *cpumask,
+                                   struct mm_struct *mm,
                                    unsigned long va)
 {
-       PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
+       PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
 }
 
 static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -1059,13 +1155,13 @@ static inline pte_t __pte(pteval_t val)
        pteval_t ret;
 
        if (sizeof(pteval_t) > sizeof(long))
-               ret = PVOP_CALL2(pteval_t,
-                                pv_mmu_ops.make_pte,
-                                val, (u64)val >> 32);
+               ret = PVOP_CALLEE2(pteval_t,
+                                  pv_mmu_ops.make_pte,
+                                  val, (u64)val >> 32);
        else
-               ret = PVOP_CALL1(pteval_t,
-                                pv_mmu_ops.make_pte,
-                                val);
+               ret = PVOP_CALLEE1(pteval_t,
+                                  pv_mmu_ops.make_pte,
+                                  val);
 
        return (pte_t) { .pte = ret };
 }
@@ -1075,29 +1171,12 @@ static inline pteval_t pte_val(pte_t pte)
        pteval_t ret;
 
        if (sizeof(pteval_t) > sizeof(long))
-               ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
-                                pte.pte, (u64)pte.pte >> 32);
-       else
-               ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
-                                pte.pte);
-
-       return ret;
-}
-
-static inline pteval_t pte_flags(pte_t pte)
-{
-       pteval_t ret;
-
-       if (sizeof(pteval_t) > sizeof(long))
-               ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
-                                pte.pte, (u64)pte.pte >> 32);
+               ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
+                                  pte.pte, (u64)pte.pte >> 32);
        else
-               ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
-                                pte.pte);
+               ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
+                                  pte.pte);
 
-#ifdef CONFIG_PARAVIRT_DEBUG
-       BUG_ON(ret & PTE_PFN_MASK);
-#endif
        return ret;
 }
 
@@ -1106,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val)
        pgdval_t ret;
 
        if (sizeof(pgdval_t) > sizeof(long))
-               ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
-                                val, (u64)val >> 32);
+               ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
+                                  val, (u64)val >> 32);
        else
-               ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
-                                val);
+               ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
+                                  val);
 
        return (pgd_t) { ret };
 }
@@ -1120,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd)
        pgdval_t ret;
 
        if (sizeof(pgdval_t) > sizeof(long))
-               ret =  PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
-                                 pgd.pgd, (u64)pgd.pgd >> 32);
+               ret =  PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
+                                   pgd.pgd, (u64)pgd.pgd >> 32);
        else
-               ret =  PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
-                                 pgd.pgd);
+               ret =  PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
+                                   pgd.pgd);
 
        return ret;
 }
@@ -1188,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val)
        pmdval_t ret;
 
        if (sizeof(pmdval_t) > sizeof(long))
-               ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
-                                val, (u64)val >> 32);
+               ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
+                                  val, (u64)val >> 32);
        else
-               ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
-                                val);
+               ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
+                                  val);
 
        return (pmd_t) { ret };
 }
@@ -1202,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd)
        pmdval_t ret;
 
        if (sizeof(pmdval_t) > sizeof(long))
-               ret =  PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
-                                 pmd.pmd, (u64)pmd.pmd >> 32);
+               ret =  PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
+                                   pmd.pmd, (u64)pmd.pmd >> 32);
        else
-               ret =  PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
-                                 pmd.pmd);
+               ret =  PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
+                                   pmd.pmd);
 
        return ret;
 }
@@ -1228,11 +1307,11 @@ static inline pud_t __pud(pudval_t val)
        pudval_t ret;
 
        if (sizeof(pudval_t) > sizeof(long))
-               ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
-                                val, (u64)val >> 32);
+               ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
+                                  val, (u64)val >> 32);
        else
-               ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
-                                val);
+               ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
+                                  val);
 
        return (pud_t) { ret };
 }
@@ -1242,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud)
        pudval_t ret;
 
        if (sizeof(pudval_t) > sizeof(long))
-               ret =  PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
-                                 pud.pud, (u64)pud.pud >> 32);
+               ret =  PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
+                                   pud.pud, (u64)pud.pud >> 32);
        else
-               ret =  PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
-                                 pud.pud);
+               ret =  PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
+                                   pud.pud);
 
        return ret;
 }
@@ -1387,6 +1466,9 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 }
 
 void _paravirt_nop(void);
+u32 _paravirt_ident_32(u32);
+u64 _paravirt_ident_64(u64);
+
 #define paravirt_nop   ((void *)_paravirt_nop)
 
 void paravirt_use_bytelocks(void);
@@ -1438,12 +1520,37 @@ extern struct paravirt_patch_site __parainstructions[],
        __parainstructions_end[];
 
 #ifdef CONFIG_X86_32
-#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
-#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
+#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS                "pushl %ecx;"
+#define PV_RESTORE_ALL_CALLER_REGS     "popl  %ecx;"
+
 #define PV_FLAGS_ARG "0"
 #define PV_EXTRA_CLOBBERS
 #define PV_VEXTRA_CLOBBERS
 #else
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS                                                \
+       "push %rcx;"                                                    \
+       "push %rdx;"                                                    \
+       "push %rsi;"                                                    \
+       "push %rdi;"                                                    \
+       "push %r8;"                                                     \
+       "push %r9;"                                                     \
+       "push %r10;"                                                    \
+       "push %r11;"
+#define PV_RESTORE_ALL_CALLER_REGS                                     \
+       "pop %r11;"                                                     \
+       "pop %r10;"                                                     \
+       "pop %r9;"                                                      \
+       "pop %r8;"                                                      \
+       "pop %rdi;"                                                     \
+       "pop %rsi;"                                                     \
+       "pop %rdx;"                                                     \
+       "pop %rcx;"
+
 /* We save some registers, but not all of them; that would be too much.  We
  * clobber all caller-saved registers except the argument register. */
 #define PV_SAVE_REGS "pushq %%rdi;"
@@ -1453,52 +1560,76 @@ extern struct paravirt_patch_site __parainstructions[],
 #define PV_FLAGS_ARG "D"
 #endif
 
+/*
+ * Generate a thunk around a function which saves all caller-save
+ * registers except for the return value.  This allows C functions to
+ * be called from assembler code where fewer registers than normal are
+ * available.  It may also help code generation around calls from C
+ * code if the common case doesn't use many registers.
+ *
+ * When a callee is wrapped in a thunk, the caller can assume that all
+ * arg regs and all scratch registers are preserved across the
+ * call. The return value in rax/eax will not be saved, even for void
+ * functions.
+ */
+#define PV_CALLEE_SAVE_REGS_THUNK(func)                                        \
+       extern typeof(func) __raw_callee_save_##func;                   \
+       static void *__##func##__ __used = func;                        \
+                                                                       \
+       asm(".pushsection .text;"                                       \
+           "__raw_callee_save_" #func ": "                             \
+           PV_SAVE_ALL_CALLER_REGS                                     \
+           "call " #func ";"                                           \
+           PV_RESTORE_ALL_CALLER_REGS                                  \
+           "ret;"                                                      \
+           ".popsection")
+
+/* Get a reference to a callee-save function */
+#define PV_CALLEE_SAVE(func)                                           \
+       ((struct paravirt_callee_save) { __raw_callee_save_##func })
+
+/* Promise that "func" already uses the right calling convention */
+#define __PV_IS_CALLEE_SAVE(func)                      \
+       ((struct paravirt_callee_save) { func })
+
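
A usage sketch (the backend function is hypothetical; only the macros are from this patch): an implementation defines its op as ordinary C, generates the register-saving thunk, and installs the wrapped pointer. Code that already follows the convention is marked with __PV_IS_CALLEE_SAVE instead:

	static unsigned long my_save_fl(void)
	{
		return X86_EFLAGS_IF;	/* hypothetical backend: report IRQs enabled */
	}
	PV_CALLEE_SAVE_REGS_THUNK(my_save_fl);

	/* at init time */
	pv_irq_ops.save_fl = PV_CALLEE_SAVE(my_save_fl);
	/* a raw "cli"-style helper already preserves everything, so no thunk: */
	pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable);
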
 static inline unsigned long __raw_local_save_flags(void)
 {
        unsigned long f;
 
-       asm volatile(paravirt_alt(PV_SAVE_REGS
-                                 PARAVIRT_CALL
-                                 PV_RESTORE_REGS)
+       asm volatile(paravirt_alt(PARAVIRT_CALL)
                     : "=a"(f)
                     : paravirt_type(pv_irq_ops.save_fl),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "cc" PV_VEXTRA_CLOBBERS);
+                    : "memory", "cc");
        return f;
 }
 
 static inline void raw_local_irq_restore(unsigned long f)
 {
-       asm volatile(paravirt_alt(PV_SAVE_REGS
-                                 PARAVIRT_CALL
-                                 PV_RESTORE_REGS)
+       asm volatile(paravirt_alt(PARAVIRT_CALL)
                     : "=a"(f)
                     : PV_FLAGS_ARG(f),
                       paravirt_type(pv_irq_ops.restore_fl),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "cc" PV_EXTRA_CLOBBERS);
+                    : "memory", "cc");
 }
 
 static inline void raw_local_irq_disable(void)
 {
-       asm volatile(paravirt_alt(PV_SAVE_REGS
-                                 PARAVIRT_CALL
-                                 PV_RESTORE_REGS)
+       asm volatile(paravirt_alt(PARAVIRT_CALL)
                     :
                     : paravirt_type(pv_irq_ops.irq_disable),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+                    : "memory", "eax", "cc");
 }
 
 static inline void raw_local_irq_enable(void)
 {
-       asm volatile(paravirt_alt(PV_SAVE_REGS
-                                 PARAVIRT_CALL
-                                 PV_RESTORE_REGS)
+       asm volatile(paravirt_alt(PARAVIRT_CALL)
                     :
                     : paravirt_type(pv_irq_ops.irq_enable),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+                    : "memory", "eax", "cc");
 }
 
 static inline unsigned long __raw_local_irq_save(void)
@@ -1541,33 +1672,49 @@ static inline unsigned long __raw_local_irq_save(void)
        .popsection
 
 
+#define COND_PUSH(set, mask, reg)                      \
+       .if ((~(set)) & mask); push %reg; .endif
+#define COND_POP(set, mask, reg)                       \
+       .if ((~(set)) & mask); pop %reg; .endif
+
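
COND_PUSH/COND_POP expand to gas .if directives, so the choice of which registers to save is made at assembly time from the constant clobber set: a register is pushed only when it is *not* in `set`. For example (illustrative, not part of the patch):

	/* COND_PUSH(CLBR_RAX | CLBR_RDI, CLBR_RCX, rcx)  assembles to:  push %rcx  */
	/* COND_PUSH(CLBR_RAX | CLBR_RDI, CLBR_RAX, rax)  assembles to:  (nothing)  */
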
 #ifdef CONFIG_X86_64
-#define PV_SAVE_REGS                           \
-       push %rax;                              \
-       push %rcx;                              \
-       push %rdx;                              \
-       push %rsi;                              \
-       push %rdi;                              \
-       push %r8;                               \
-       push %r9;                               \
-       push %r10;                              \
-       push %r11
-#define PV_RESTORE_REGS                                \
-       pop %r11;                               \
-       pop %r10;                               \
-       pop %r9;                                \
-       pop %r8;                                \
-       pop %rdi;                               \
-       pop %rsi;                               \
-       pop %rdx;                               \
-       pop %rcx;                               \
-       pop %rax
+
+#define PV_SAVE_REGS(set)                      \
+       COND_PUSH(set, CLBR_RAX, rax);          \
+       COND_PUSH(set, CLBR_RCX, rcx);          \
+       COND_PUSH(set, CLBR_RDX, rdx);          \
+       COND_PUSH(set, CLBR_RSI, rsi);          \
+       COND_PUSH(set, CLBR_RDI, rdi);          \
+       COND_PUSH(set, CLBR_R8, r8);            \
+       COND_PUSH(set, CLBR_R9, r9);            \
+       COND_PUSH(set, CLBR_R10, r10);          \
+       COND_PUSH(set, CLBR_R11, r11)
+#define PV_RESTORE_REGS(set)                   \
+       COND_POP(set, CLBR_R11, r11);           \
+       COND_POP(set, CLBR_R10, r10);           \
+       COND_POP(set, CLBR_R9, r9);             \
+       COND_POP(set, CLBR_R8, r8);             \
+       COND_POP(set, CLBR_RDI, rdi);           \
+       COND_POP(set, CLBR_RSI, rsi);           \
+       COND_POP(set, CLBR_RDX, rdx);           \
+       COND_POP(set, CLBR_RCX, rcx);           \
+       COND_POP(set, CLBR_RAX, rax)
+
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
 #define PARA_INDIRECT(addr)    *addr(%rip)
 #else
-#define PV_SAVE_REGS   pushl %eax; pushl %edi; pushl %ecx; pushl %edx
-#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PV_SAVE_REGS(set)                      \
+       COND_PUSH(set, CLBR_EAX, eax);          \
+       COND_PUSH(set, CLBR_EDI, edi);          \
+       COND_PUSH(set, CLBR_ECX, ecx);          \
+       COND_PUSH(set, CLBR_EDX, edx)
+#define PV_RESTORE_REGS(set)                   \
+       COND_POP(set, CLBR_EDX, edx);           \
+       COND_POP(set, CLBR_ECX, ecx);           \
+       COND_POP(set, CLBR_EDI, edi);           \
+       COND_POP(set, CLBR_EAX, eax)
+
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 4)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
 #define PARA_INDIRECT(addr)    *%cs:addr
@@ -1579,15 +1726,15 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #define DISABLE_INTERRUPTS(clobbers)                                   \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
-                 PV_SAVE_REGS;                                         \
+                 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);            \
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);    \
-                 PV_RESTORE_REGS;)                     \
+                 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 #define ENABLE_INTERRUPTS(clobbers)                                    \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,  \
-                 PV_SAVE_REGS;                                         \
+                 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);            \
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);     \
-                 PV_RESTORE_REGS;)
+                 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
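
A worked example of the conditional form on 64-bit (an assumed expansion, not part of the patch): CLBR_CALLEE_SAVE covers everything except %rax, so the pushes emitted at a patch site depend on what the caller says it can spare:

	/*
	 * DISABLE_INTERRUPTS(CLBR_NONE) patch site:
	 *	push %rax
	 *	call *pv_irq_ops+PV_IRQ_irq_disable(%rip)
	 *	pop  %rax
	 *
	 * DISABLE_INTERRUPTS(CLBR_ANY) patch site:
	 *	call *pv_irq_ops+PV_IRQ_irq_disable(%rip)
	 */
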
 
 #define USERGS_SYSRET32                                                        \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),       \
@@ -1617,11 +1764,15 @@ static inline unsigned long __raw_local_irq_save(void)
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,     \
                  swapgs)
 
+/*
+ * Note: swapgs is very special, and in practice it is either going to
+ * be implemented with a single "swapgs" instruction or by something
+ * equally special.  Either way, we don't need to save any registers
+ * for it.
+ */
 #define SWAPGS                                                         \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,     \
-                 PV_SAVE_REGS;                                         \
-                 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs);         \
-                 PV_RESTORE_REGS                                       \
+                 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs)          \
                 )
 
 #define GET_CR2_INTO_RCX                               \
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
deleted file mode 100644 (file)
index 2fbfff8..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef _ASM_X86_PDA_H
-#define _ASM_X86_PDA_H
-
-#ifndef __ASSEMBLY__
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <asm/page.h>
-
-/* Per processor datastructure. %gs points to it while the kernel runs */
-struct x8664_pda {
-       struct task_struct *pcurrent;   /* 0  Current process */
-       unsigned long data_offset;      /* 8 Per cpu data offset from linker
-                                          address */
-       unsigned long kernelstack;      /* 16 top of kernel stack for current */
-       unsigned long oldrsp;           /* 24 user rsp for system call */
-       int irqcount;                   /* 32 Irq nesting counter. Starts -1 */
-       unsigned int cpunumber;         /* 36 Logical CPU number */
-#ifdef CONFIG_CC_STACKPROTECTOR
-       unsigned long stack_canary;     /* 40 stack canary value */
-                                       /* gcc-ABI: this canary MUST be at
-                                          offset 40!!! */
-#endif
-       char *irqstackptr;
-       short nodenumber;               /* number of current node (32k max) */
-       short in_bootmem;               /* pda lives in bootmem */
-       unsigned int __softirq_pending;
-       unsigned int __nmi_count;       /* number of NMI on this CPUs */
-       short mmu_state;
-       short isidle;
-       struct mm_struct *active_mm;
-       unsigned apic_timer_irqs;
-       unsigned irq0_irqs;
-       unsigned irq_resched_count;
-       unsigned irq_call_count;
-       unsigned irq_tlb_count;
-       unsigned irq_thermal_count;
-       unsigned irq_threshold_count;
-       unsigned irq_spurious_count;
-} ____cacheline_aligned_in_smp;
-
-extern struct x8664_pda **_cpu_pda;
-extern void pda_init(int);
-
-#define cpu_pda(i) (_cpu_pda[i])
-
-/*
- * There is no fast way to get the base address of the PDA, all the accesses
- * have to mention %fs/%gs.  So it needs to be done this Torvaldian way.
- */
-extern void __bad_pda_field(void) __attribute__((noreturn));
-
-/*
- * proxy_pda doesn't actually exist, but tell gcc it is accessed for
- * all PDA accesses so it gets read/write dependencies right.
- */
-extern struct x8664_pda _proxy_pda;
-
-#define pda_offset(field) offsetof(struct x8664_pda, field)
-
-#define pda_to_op(op, field, val)                                      \
-do {                                                                   \
-       typedef typeof(_proxy_pda.field) T__;                           \
-       if (0) { T__ tmp__; tmp__ = (val); }    /* type checking */     \
-       switch (sizeof(_proxy_pda.field)) {                             \
-       case 2:                                                         \
-               asm(op "w %1,%%gs:%c2" :                                \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i"(pda_offset(field)));                            \
-               break;                                                  \
-       case 4:                                                         \
-               asm(op "l %1,%%gs:%c2" :                                \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i" (pda_offset(field)));                           \
-               break;                                                  \
-       case 8:                                                         \
-               asm(op "q %1,%%gs:%c2":                                 \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i"(pda_offset(field)));                            \
-               break;                                                  \
-       default:                                                        \
-               __bad_pda_field();                                      \
-       }                                                               \
-} while (0)
-
-#define pda_from_op(op, field)                 \
-({                                             \
-       typeof(_proxy_pda.field) ret__;         \
-       switch (sizeof(_proxy_pda.field)) {     \
-       case 2:                                 \
-               asm(op "w %%gs:%c1,%0" :        \
-                   "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       case 4:                                 \
-               asm(op "l %%gs:%c1,%0":         \
-                   "=r" (ret__):               \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       case 8:                                 \
-               asm(op "q %%gs:%c1,%0":         \
-                   "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       default:                                \
-               __bad_pda_field();              \
-       }                                       \
-       ret__;                                  \
-})
-
-#define read_pda(field)                pda_from_op("mov", field)
-#define write_pda(field, val)  pda_to_op("mov", field, val)
-#define add_pda(field, val)    pda_to_op("add", field, val)
-#define sub_pda(field, val)    pda_to_op("sub", field, val)
-#define or_pda(field, val)     pda_to_op("or", field, val)
-
-/* This is not atomic against other CPUs -- CPU preemption needs to be off */
-#define test_and_clear_bit_pda(bit, field)                             \
-({                                                                     \
-       int old__;                                                      \
-       asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0"                    \
-                    : "=r" (old__), "+m" (_proxy_pda.field)            \
-                    : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
-       old__;                                                          \
-})
-
-#endif
-
-#define PDA_STACKOFFSET (5*8)
-
-#endif /* _ASM_X86_PDA_H */
index ece7205..0b64af4 100644 (file)
@@ -2,53 +2,12 @@
 #define _ASM_X86_PERCPU_H
 
 #ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
-   in the PDA. Longer term the PDA and every per cpu variable
-   should be just put into a single section and referenced directly
-   from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
-#define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
+#define __percpu_seg           gs
+#define __percpu_mov_op                movq
+#else
+#define __percpu_seg           fs
+#define __percpu_mov_op                movl
 #endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment.  x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime.  The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect.  However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var)                                           \
-       ({                                                              \
-               typeof(per_cpu_var(var)) __tmp;                         \
-               preempt_disable();                                      \
-               __tmp = __get_cpu_var(var);                             \
-               preempt_enable();                                       \
-               __tmp;                                                  \
-       })
-
-#define x86_write_percpu(var, val)                                     \
-       do {                                                            \
-               preempt_disable();                                      \
-               __get_cpu_var(var) = (val);                             \
-               preempt_enable();                                       \
-       } while(0)
-
-#else /* CONFIG_X86_64 */
 
 #ifdef __ASSEMBLY__
 
@@ -65,47 +24,26 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
  *    PER_CPU(cpu_gdt_descr, %ebx)
  */
 #ifdef CONFIG_SMP
-#define PER_CPU(var, reg)                              \
-       movl %fs:per_cpu__##this_cpu_off, reg;          \
+#define PER_CPU(var, reg)                                              \
+       __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg;       \
        lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var)       %fs:per_cpu__##var
+#define PER_CPU_VAR(var)       %__percpu_seg:per_cpu__##var
 #else /* ! SMP */
-#define PER_CPU(var, reg)                      \
-       movl $per_cpu__##var, reg
+#define PER_CPU(var, reg)                                              \
+       __percpu_mov_op $per_cpu__##var, reg
 #define PER_CPU_VAR(var)       per_cpu__##var
 #endif /* SMP */
 
 #else /* ...!ASSEMBLY */
 
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- *    var - variable name
- *    cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- *    PER_CPU(cpu_gdt_descr, %ebx)
- */
-#ifdef CONFIG_SMP
-
-#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+#include <linux/stringify.h>
 
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
-#define __percpu_seg "%%fs:"
-
-#else  /* !SMP */
-
-#define __percpu_seg ""
-
-#endif /* SMP */
-
-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
+#ifdef CONFIG_SMP
+#define __percpu_arg(x)                "%%"__stringify(__percpu_seg)":%P" #x
+#define __my_cpu_offset                percpu_read(this_cpu_off)
+#else
+#define __percpu_arg(x)                "%" #x
+#endif
 
 /* For arch-specific code, we can use direct single-insn ops (they
  * don't give an lvalue though). */
@@ -120,20 +58,25 @@ do {                                                       \
        }                                               \
        switch (sizeof(var)) {                          \
        case 1:                                         \
-               asm(op "b %1,"__percpu_seg"%0"          \
+               asm(op "b %1,"__percpu_arg(0)           \
                    : "+m" (var)                        \
                    : "ri" ((T__)val));                 \
                break;                                  \
        case 2:                                         \
-               asm(op "w %1,"__percpu_seg"%0"          \
+               asm(op "w %1,"__percpu_arg(0)           \
                    : "+m" (var)                        \
                    : "ri" ((T__)val));                 \
                break;                                  \
        case 4:                                         \
-               asm(op "l %1,"__percpu_seg"%0"          \
+               asm(op "l %1,"__percpu_arg(0)           \
                    : "+m" (var)                        \
                    : "ri" ((T__)val));                 \
                break;                                  \
+       case 8:                                         \
+               asm(op "q %1,"__percpu_arg(0)           \
+                   : "+m" (var)                        \
+                   : "re" ((T__)val));                 \
+               break;                                  \
        default: __bad_percpu_size();                   \
        }                                               \
 } while (0)
@@ -143,17 +86,22 @@ do {                                                       \
        typeof(var) ret__;                              \
        switch (sizeof(var)) {                          \
        case 1:                                         \
-               asm(op "b "__percpu_seg"%1,%0"          \
+               asm(op "b "__percpu_arg(1)",%0"         \
                    : "=r" (ret__)                      \
                    : "m" (var));                       \
                break;                                  \
        case 2:                                         \
-               asm(op "w "__percpu_seg"%1,%0"          \
+               asm(op "w "__percpu_arg(1)",%0"         \
                    : "=r" (ret__)                      \
                    : "m" (var));                       \
                break;                                  \
        case 4:                                         \
-               asm(op "l "__percpu_seg"%1,%0"          \
+               asm(op "l "__percpu_arg(1)",%0"         \
+                   : "=r" (ret__)                      \
+                   : "m" (var));                       \
+               break;                                  \
+       case 8:                                         \
+               asm(op "q "__percpu_arg(1)",%0"         \
                    : "=r" (ret__)                      \
                    : "m" (var));                       \
                break;                                  \
@@ -162,13 +110,30 @@ do {                                                      \
        ret__;                                          \
 })
 
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var)       percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val)   percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val)   percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val)   percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val)    percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val)   percpu_to_op("xor", per_cpu__##var, val)
+
+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define x86_test_and_clear_bit_percpu(bit, var)                                \
+({                                                                     \
+       int old__;                                                      \
+       asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0"           \
+                    : "=r" (old__), "+m" (per_cpu__##var)              \
+                    : "dIr" (bit));                                    \
+       old__;                                                          \
+})
+
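
A usage sketch of the unified accessors (cpu_number and this_cpu_off are declared in this series; any other variable name here is an assumption):

	int cpu = percpu_read(cpu_number);	/* one insn: mov %gs:per_cpu__cpu_number,%eax
						 * (%fs: instead of %gs: on 32-bit)          */
	percpu_add(irq_stat_count, 1);		/* hypothetical counter: add directly in the
						 * per-cpu segment, no address computation   */
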
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
 #endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */
 
 #ifdef CONFIG_SMP
 
@@ -195,9 +160,9 @@ do {                                                        \
 #define        early_per_cpu_ptr(_name) (_name##_early_ptr)
 #define        early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
 #define        early_per_cpu(_name, _cpu)                              \
-       (early_per_cpu_ptr(_name) ?                             \
-               early_per_cpu_ptr(_name)[_cpu] :                \
-               per_cpu(_name, _cpu))
+       *(early_per_cpu_ptr(_name) ?                            \
+               &early_per_cpu_ptr(_name)[_cpu] :               \
+               &per_cpu(_name, _cpu))
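
The added address-of/dereference makes early_per_cpu() usable as an lvalue, so early boot code can store through it before the real per-cpu areas are set up (sketch; cpu and apicid are free variables here, x86_cpu_to_apicid is one of the existing early maps):

	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;	/* hits the early map or the
							 * per-cpu copy, whichever is
							 * live at this point */
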
 
 #else  /* !CONFIG_SMP */
 #define        DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)          \
index 1782053..860f1b6 100644 (file)
@@ -242,64 +242,78 @@ static inline int pmd_large(pmd_t pte)
                (_PAGE_PSE | _PAGE_PRESENT);
 }
 
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+       pteval_t v = native_pte_val(pte);
+
+       return native_make_pte(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+       pteval_t v = native_pte_val(pte);
+
+       return native_make_pte(v & ~clear);
+}
+
 static inline pte_t pte_mkclean(pte_t pte)
 {
-       return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+       return pte_clear_flags(pte, _PAGE_DIRTY);
 }
 
 static inline pte_t pte_mkold(pte_t pte)
 {
-       return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+       return pte_clear_flags(pte, _PAGE_ACCESSED);
 }
 
 static inline pte_t pte_wrprotect(pte_t pte)
 {
-       return __pte(pte_val(pte) & ~_PAGE_RW);
+       return pte_clear_flags(pte, _PAGE_RW);
 }
 
 static inline pte_t pte_mkexec(pte_t pte)
 {
-       return __pte(pte_val(pte) & ~_PAGE_NX);
+       return pte_clear_flags(pte, _PAGE_NX);
 }
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_DIRTY);
+       return pte_set_flags(pte, _PAGE_DIRTY);
 }
 
 static inline pte_t pte_mkyoung(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_ACCESSED);
+       return pte_set_flags(pte, _PAGE_ACCESSED);
 }
 
 static inline pte_t pte_mkwrite(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_RW);
+       return pte_set_flags(pte, _PAGE_RW);
 }
 
 static inline pte_t pte_mkhuge(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_PSE);
+       return pte_set_flags(pte, _PAGE_PSE);
 }
 
 static inline pte_t pte_clrhuge(pte_t pte)
 {
-       return __pte(pte_val(pte) & ~_PAGE_PSE);
+       return pte_clear_flags(pte, _PAGE_PSE);
 }
 
 static inline pte_t pte_mkglobal(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_GLOBAL);
+       return pte_set_flags(pte, _PAGE_GLOBAL);
 }
 
 static inline pte_t pte_clrglobal(pte_t pte)
 {
-       return __pte(pte_val(pte) & ~_PAGE_GLOBAL);
+       return pte_clear_flags(pte, _PAGE_GLOBAL);
 }
 
 static inline pte_t pte_mkspecial(pte_t pte)
 {
-       return __pte(pte_val(pte) | _PAGE_SPECIAL);
+       return pte_set_flags(pte, _PAGE_SPECIAL);
 }
 
 extern pteval_t __supported_pte_mask;
index 100ac48..1c4e247 100644 (file)
@@ -11,7 +11,6 @@
 #include <asm/processor.h>
 #include <linux/bitops.h>
 #include <linux/threads.h>
-#include <asm/pda.h>
 
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
index 091cd88..656d02e 100644 (file)
@@ -378,6 +378,22 @@ union thread_xstate {
 
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
+
+union irq_stack_union {
+       char irq_stack[IRQ_STACK_SIZE];
+       /*
+        * GCC hardcodes the stack canary as %gs:40.  Since the
+        * irq_stack is the object at %gs:0, we reserve the bottom
+        * 48 bytes of the irq stack for the canary.
+        */
+       struct {
+               char gs_base[40];
+               unsigned long stack_canary;
+       };
+};
+
+DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
+DECLARE_PER_CPU(char *, irq_stack_ptr);
 #endif
 
 extern void print_cpu_info(struct cpuinfo_x86 *);
@@ -752,9 +768,9 @@ extern int sysenter_setup(void);
 extern struct desc_ptr         early_gdt_descr;
 
 extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
 extern void cpu_init(void);
-extern void init_gdt(int cpu);
 
 static inline unsigned long get_debugctlmsr(void)
 {
index ebe858c..5369497 100644 (file)
@@ -100,7 +100,6 @@ extern unsigned long init_pg_tables_start;
 extern unsigned long init_pg_tables_end;
 
 #else
-void __init x86_64_init_pda(void);
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
index 19953df..45ef8a1 100644 (file)
 #  include <asm/io_apic.h>
 # endif
 #endif
-#include <asm/pda.h>
 #include <asm/thread_info.h>
-
-#ifdef CONFIG_X86_64
-
-extern cpumask_var_t cpu_callin_mask;
-extern cpumask_var_t cpu_callout_mask;
-extern cpumask_var_t cpu_initialized_mask;
-extern cpumask_var_t cpu_sibling_setup_mask;
-
-#else /* CONFIG_X86_32 */
-
-extern cpumask_t cpu_callin_map;
-extern cpumask_t cpu_callout_map;
-extern cpumask_t cpu_initialized;
-extern cpumask_t cpu_sibling_setup_map;
-
-#define cpu_callin_mask                ((struct cpumask *)&cpu_callin_map)
-#define cpu_callout_mask       ((struct cpumask *)&cpu_callout_map)
-#define cpu_initialized_mask   ((struct cpumask *)&cpu_initialized)
-#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
-
-#endif /* CONFIG_X86_32 */
-
-extern void (*mtrr_hook)(void);
-extern void zap_low_mappings(void);
-
-extern int __cpuinit get_local_pda(int cpu);
+#include <asm/cpumask.h>
 
 extern int smp_num_siblings;
 extern unsigned int num_processors;
@@ -50,9 +24,7 @@ extern unsigned int num_processors;
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(int, cpu_number);
-#endif
 
 static inline struct cpumask *cpu_sibling_mask(int cpu)
 {
@@ -167,8 +139,6 @@ void play_dead_common(void);
 void native_send_call_func_ipi(const struct cpumask *mask);
 void native_send_call_func_single_ipi(int cpu);
 
-extern void prefill_possible_map(void);
-
 void smp_store_cpu_info(int id);
 #define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
 
@@ -177,10 +147,6 @@ static inline int num_booting_cpus(void)
 {
        return cpumask_weight(cpu_callout_mask);
 }
-#else
-static inline void prefill_possible_map(void)
-{
-}
 #endif /* CONFIG_SMP */
 
 extern unsigned disabled_cpus __cpuinitdata;
@@ -191,11 +157,11 @@ extern unsigned disabled_cpus __cpuinitdata;
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
 extern int safe_smp_processor_id(void);
 
 #elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() (percpu_read(cpu_number))
 
 #define stack_smp_processor_id()                                       \
 ({                                                             \
@@ -205,10 +171,6 @@ extern int safe_smp_processor_id(void);
 })
 #define safe_smp_processor_id()                smp_processor_id()
 
-#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
-#define cpu_physical_id(cpu)           boot_cpu_physical_apicid
-#define safe_smp_processor_id()                0
-#define stack_smp_processor_id()       0
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -251,11 +213,5 @@ static inline int hard_smp_processor_id(void)
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
-#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
-extern unsigned char boot_cpu_id;
-#else
-#define boot_cpu_id    0
-#endif
-
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_SMP_H */
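
With the hunks above, 32-bit and 64-bit both resolve raw_smp_processor_id() to a single per-cpu read of cpu_number instead of a pda field. A rough userspace analogy, with thread-local storage standing in for per-cpu data (illustrative only, not kernel code):

#include <stdio.h>

static __thread int cpu_number;               /* stands in for DECLARE_PER_CPU(int, cpu_number) */

#define raw_smp_processor_id() (cpu_number)   /* the percpu_read() form used above */

int main(void)
{
        cpu_number = 2;                       /* set once during per-cpu setup */
        printf("running on cpu %d\n", raw_smp_processor_id());
        return 0;
}
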
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
new file mode 100644 (file)
index 0000000..36a700a
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H 1
+
+#include <asm/tsc.h>
+#include <asm/processor.h>
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+       u64 canary;
+       u64 tsc;
+
+       /*
+        * Build time only check to make sure the stack_canary is at
+        * offset 40 in the per-cpu irq_stack_union; a gcc ABI requirement
+        */
+       BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
+
+       /*
+        * We use both the random pool and the current TSC as sources
+        * of randomness. The TSC only matters for very early init,
+        * where it already has some randomness on most systems. Later
+        * on during bootup the random pool has true entropy too.
+        */
+       get_random_bytes(&canary, sizeof(canary));
+       tsc = __native_read_tsc();
+       canary += tsc + (tsc << 32UL);
+
+       current->stack_canary = canary;
+       percpu_write(irq_stack_union.stack_canary, canary);
+}
+
+#endif
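
The canary construction above can be exercised outside the kernel. A hedged userspace sketch, with getentropy() and __rdtsc() standing in for get_random_bytes() and __native_read_tsc() (glibc on x86 assumed; illustrative only):

#include <stdint.h>
#include <unistd.h>
#include <x86intrin.h>

static uint64_t make_canary(void)
{
        uint64_t canary = 0;
        uint64_t tsc;

        if (getentropy(&canary, sizeof(canary)) != 0)   /* "random pool" */
                canary = 0;                             /* fall back to TSC alone */
        tsc = __rdtsc();                                /* extra entropy very early in boot */
        canary += tsc + (tsc << 32);                    /* fold the TSC into both halves */
        return canary;
}

int main(void)
{
        return make_canary() ? 0 : 1;
}
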
index 8e626ea..2fcc70b 100644 (file)
@@ -86,27 +86,44 @@ do {                                                                        \
        , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
          "r12", "r13", "r14", "r15"
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary                                                          \
+       "movq %P[task_canary](%%rsi),%%r8\n\t"                            \
+       "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
+#define __switch_canary_oparam                                           \
+       , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+#define __switch_canary_iparam                                           \
+       , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else  /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
 /* Save restore flags to clear handle leaking NT */
 #define switch_to(prev, next, last) \
-       asm volatile(SAVE_CONTEXT                                                   \
+       asm volatile(SAVE_CONTEXT                                         \
             "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
             "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
             "call __switch_to\n\t"                                       \
             ".globl thread_return\n"                                     \
             "thread_return:\n\t"                                         \
-            "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
+            "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
+            __switch_canary                                              \
             "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
             LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
             "movq %%rax,%%rdi\n\t"                                       \
             "jc   ret_from_fork\n\t"                                     \
             RESTORE_CONTEXT                                              \
             : "=a" (last)                                                \
+              __switch_canary_oparam                                     \
             : [next] "S" (next), [prev] "D" (prev),                      \
               [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
               [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
               [tif_fork] "i" (TIF_FORK),                                 \
               [thread_info] "i" (offsetof(struct task_struct, stack)),   \
-              [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))  \
+              [current_task] "m" (per_cpu_var(current_task))             \
+              __switch_canary_iparam                                     \
             : "memory", "cc" __EXTRA_CLOBBER)
 #endif
 
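
The __switch_canary asm added above keeps one invariant: gcc's -fstack-protector prologues on x86-64 compare against a fixed %gs:40 slot, so the scheduler must publish the incoming task's canary there before running its code. A minimal C model of that single step (not the real switch path):

#include <assert.h>

struct task { unsigned long stack_canary; };

static unsigned long gs_slot_40;                /* models irq_stack_union.stack_canary */

static void switch_canary(struct task *next)
{
        gs_slot_40 = next->stack_canary;        /* the movq ...,%gs:... in the macro */
}

int main(void)
{
        struct task a = { .stack_canary = 0x1111 }, b = { .stack_canary = 0x2222 };

        switch_canary(&a);
        assert(gs_slot_40 == a.stack_canary);
        switch_canary(&b);
        assert(gs_slot_40 == b.stack_canary);
        return 0;
}
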
index 9878964..b46f8ca 100644 (file)
@@ -194,25 +194,21 @@ static inline struct thread_info *current_thread_info(void)
 
 #else /* X86_32 */
 
-#include <asm/pda.h>
+#include <asm/percpu.h>
+#define KERNEL_STACK_OFFSET (5*8)
 
 /*
  * macros/functions for gaining access to the thread information structure
  * preempt_count needs to be 1 initially, until the scheduler is functional.
  */
 #ifndef __ASSEMBLY__
-static inline struct thread_info *current_thread_info(void)
-{
-       struct thread_info *ti;
-       ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
-       return ti;
-}
+DECLARE_PER_CPU(unsigned long, kernel_stack);
 
-/* do not use in interrupt context */
-static inline struct thread_info *stack_thread_info(void)
+static inline struct thread_info *current_thread_info(void)
 {
        struct thread_info *ti;
-       asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
+       ti = (void *)(percpu_read(kernel_stack) +
+                     KERNEL_STACK_OFFSET - THREAD_SIZE);
        return ti;
 }
 
@@ -220,8 +216,8 @@ static inline struct thread_info *stack_thread_info(void)
 
 /* how to get the thread information struct from ASM */
 #define GET_THREAD_INFO(reg) \
-       movq %gs:pda_kernelstack,reg ; \
-       subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
+       movq PER_CPU_VAR(kernel_stack),reg ; \
+       subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
 
 #endif
 
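
current_thread_info() above recovers the bottom of the stack allocation from the saved kernel_stack value. A self-contained sketch of the arithmetic; the THREAD_SIZE value and the address are illustrative assumptions:

#include <assert.h>
#include <stdint.h>

#define THREAD_SIZE         (8 * 1024)          /* illustrative */
#define KERNEL_STACK_OFFSET (5 * 8)

int main(void)
{
        uintptr_t stack_bottom = 0x10000000UL;  /* hypothetical allocation */
        uintptr_t stack_top    = stack_bottom + THREAD_SIZE;
        uintptr_t kernel_stack = stack_top - KERNEL_STACK_OFFSET;

        /* the same expression current_thread_info() uses above */
        uintptr_t thread_info  = kernel_stack + KERNEL_STACK_OFFSET - THREAD_SIZE;

        assert(thread_info == stack_bottom);    /* thread_info sits at the stack bottom */
        return 0;
}
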
index 0e7bbb5..d3539f9 100644 (file)
@@ -113,7 +113,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
                __flush_tlb();
 }
 
-static inline void native_flush_tlb_others(const cpumask_t *cpumask,
+static inline void native_flush_tlb_others(const struct cpumask *cpumask,
                                           struct mm_struct *mm,
                                           unsigned long va)
 {
@@ -142,31 +142,28 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
        flush_tlb_mm(vma->vm_mm);
 }
 
-void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
-                            unsigned long va);
+void native_flush_tlb_others(const struct cpumask *cpumask,
+                            struct mm_struct *mm, unsigned long va);
 
 #define TLBSTATE_OK    1
 #define TLBSTATE_LAZY  2
 
-#ifdef CONFIG_X86_32
 struct tlb_state {
        struct mm_struct *active_mm;
        int state;
-       char __cacheline_padding[L1_CACHE_BYTES-8];
 };
 DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
 
-void reset_lazy_tlbstate(void);
-#else
 static inline void reset_lazy_tlbstate(void)
 {
+       percpu_write(cpu_tlbstate.state, 0);
+       percpu_write(cpu_tlbstate.active_mm, &init_mm);
 }
-#endif
 
 #endif /* SMP */
 
 #ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
+#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
 #endif
 
 static inline void flush_tlb_kernel_range(unsigned long start,
@@ -175,4 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
        flush_tlb_all();
 }
 
+extern void zap_low_mappings(void);
+
 #endif /* _ASM_X86_TLBFLUSH_H */
index 4e2f2e0..77cfb2c 100644 (file)
@@ -74,6 +74,8 @@ static inline const struct cpumask *cpumask_of_node(int node)
        return &node_to_cpumask_map[node];
 }
 
+static inline void setup_node_to_cpumask_map(void) { }
+
 #else /* CONFIG_X86_64 */
 
 /* Mappings between node number and cpus on that node. */
@@ -83,7 +85,8 @@ extern cpumask_t *node_to_cpumask_map;
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
 /* Returns the number of the current Node. */
-#define numa_node_id()         read_pda(nodenumber)
+DECLARE_PER_CPU(int, node_number);
+#define numa_node_id()         percpu_read(node_number)
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 extern int cpu_to_node(int cpu);
@@ -102,10 +105,7 @@ static inline int cpu_to_node(int cpu)
 /* Same function but used if called before per_cpu areas are setup */
 static inline int early_cpu_to_node(int cpu)
 {
-       if (early_per_cpu_ptr(x86_cpu_to_node_map))
-               return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
-       return per_cpu(x86_cpu_to_node_map, cpu);
+       return early_per_cpu(x86_cpu_to_node_map, cpu);
 }
 
 /* Returns a pointer to the cpumask of CPUs on Node 'node'. */
@@ -122,6 +122,8 @@ static inline cpumask_t node_to_cpumask(int node)
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
+extern void setup_node_to_cpumask_map(void);
+
 /*
  * Replace default node_to_cpumask_ptr with optimized version
  * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
@@ -192,9 +194,20 @@ extern int __node_distance(int, int);
 
 #else /* !CONFIG_NUMA */
 
-#define numa_node_id()         0
-#define        cpu_to_node(cpu)        0
-#define        early_cpu_to_node(cpu)  0
+static inline int numa_node_id(void)
+{
+       return 0;
+}
+
+static inline int cpu_to_node(int cpu)
+{
+       return 0;
+}
+
+static inline int early_cpu_to_node(int cpu)
+{
+       return 0;
+}
 
 static inline const cpumask_t *cpumask_of_node(int node)
 {
@@ -209,6 +222,8 @@ static inline int node_to_first_cpu(int node)
        return first_cpu(cpu_online_map);
 }
 
+static inline void setup_node_to_cpumask_map(void) { }
+
 /*
  * Replace default node_to_cpumask_ptr with optimized version
  * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
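
The !CONFIG_NUMA stubs above change from object-like macros to static inlines so the argument is still typed and counts as used; callers then see the same warnings with and without NUMA. A tiny illustration of the difference (hypothetical stub name):

static inline int cpu_to_node_stub(int cpu)
{
        (void)cpu;              /* unlike "#define cpu_to_node(cpu) 0", the argument is evaluated and typed */
        return 0;
}

int main(void)
{
        return cpu_to_node_stub(3);
}
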
index 780ba0a..90f06c2 100644 (file)
@@ -13,6 +13,7 @@ extern unsigned char *trampoline_base;
 
 extern unsigned long init_rsp;
 extern unsigned long initial_code;
+extern unsigned long initial_gs;
 
 #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
 #define TRAMPOLINE_BASE 0x6000
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
new file mode 100644 (file)
index 0000000..8ac1d7e
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef _ASM_X86_UV_UV_H
+#define _ASM_X86_UV_UV_H
+
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+
+#ifdef CONFIG_X86_UV
+
+extern enum uv_system_type get_uv_system_type(void);
+extern int is_uv_system(void);
+extern void uv_cpu_init(void);
+extern void uv_system_init(void);
+extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
+extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+                                                struct mm_struct *mm,
+                                                unsigned long va,
+                                                unsigned int cpu);
+
+#else  /* X86_UV */
+
+static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
+static inline int is_uv_system(void)   { return 0; }
+static inline void uv_cpu_init(void)   { }
+static inline void uv_system_init(void)        { }
+static inline int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
+{ return 1; }
+static inline const struct cpumask *
+uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
+                   unsigned long va, unsigned int cpu)
+{ return cpumask; }
+
+#endif /* X86_UV */
+
+#endif /* _ASM_X86_UV_UV_H */
index 50423c7..9b0e61b 100644 (file)
@@ -325,7 +325,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 #define cpubit_isset(cpu, bau_local_cpumask) \
        test_bit((cpu), (bau_local_cpumask).bits)
 
-extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
 extern void uv_bau_message_intr1(void);
 extern void uv_bau_timeout_intr1(void);
 
index d364df0..37fa30b 100644 (file)
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o   := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o          := $(nostackp)
 CFLAGS_tsc.o           := $(nostackp)
+CFLAGS_paravirt.o      := $(nostackp)
 
 obj-y                  := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y                  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y                  += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y                  += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y                  += setup.o i8259.o irqinit_$(BITS).o
 obj-$(CONFIG_X86_VISWS)        += visws_quirks.o
 obj-$(CONFIG_X86_32)   += probe_roms_32.o
 obj-$(CONFIG_X86_32)   += sys_i386_32.o i386_ksyms_32.o
@@ -57,9 +58,9 @@ obj-$(CONFIG_PCI)             += early-quirks.o
 apm-y                          := apm_32.o
 obj-$(CONFIG_APM)              += apm.o
 obj-$(CONFIG_X86_SMP)          += smp.o
-obj-$(CONFIG_X86_SMP)          += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP)       += smpcommon.o
-obj-$(CONFIG_X86_64_SMP)       += tsc_sync.o smpcommon.o
+obj-$(CONFIG_X86_SMP)          += smpboot.o tsc_sync.o ipi.o
+obj-$(CONFIG_SMP)              += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP)       += tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE)      += mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o nmi.o
@@ -114,10 +115,11 @@ obj-$(CONFIG_SWIOTLB)                     += pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y                          += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-       obj-y                           += bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y                          += genapic_64.o genapic_flat_64.o
         obj-y                          += genx2apic_cluster.o
         obj-y                          += genx2apic_phys.o
+       obj-$(CONFIG_X86_UV)            += genx2apic_uv_x.o tlb_uv.o
+       obj-$(CONFIG_X86_UV)            += bios_uv.o uv_irq.o uv_sysfs.o
         obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
         obj-$(CONFIG_AUDIT)            += audit_64.o
 
index d37593c..4cb5964 100644 (file)
@@ -912,8 +912,8 @@ static u8 __init uniq_ioapic_id(u8 id)
        DECLARE_BITMAP(used, 256);
        bitmap_zero(used, 256);
        for (i = 0; i < nr_ioapics; i++) {
-               struct mp_config_ioapic *ia = &mp_ioapics[i];
-               __set_bit(ia->mp_apicid, used);
+               struct mpc_ioapic *ia = &mp_ioapics[i];
+               __set_bit(ia->apicid, used);
        }
        if (!test_bit(id, used))
                return id;
@@ -945,47 +945,47 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
        idx = nr_ioapics;
 
-       mp_ioapics[idx].mp_type = MP_IOAPIC;
-       mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
-       mp_ioapics[idx].mp_apicaddr = address;
+       mp_ioapics[idx].type = MP_IOAPIC;
+       mp_ioapics[idx].flags = MPC_APIC_USABLE;
+       mp_ioapics[idx].apicaddr = address;
 
        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-       mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+       mp_ioapics[idx].apicid = uniq_ioapic_id(id);
 #ifdef CONFIG_X86_32
-       mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+       mp_ioapics[idx].apicver = io_apic_get_version(idx);
 #else
-       mp_ioapics[idx].mp_apicver = 0;
+       mp_ioapics[idx].apicver = 0;
 #endif
        /*
         * Build basic GSI lookup table to facilitate gsi->io_apic lookups
         * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
         */
-       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
        mp_ioapic_routing[idx].gsi_base = gsi_base;
        mp_ioapic_routing[idx].gsi_end = gsi_base +
            io_apic_get_redir_entries(idx);
 
-       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
-              "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
-              mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+              "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
+              mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
               mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
 
        nr_ioapics++;
 }
 
-static void assign_to_mp_irq(struct mp_config_intsrc *m,
-                                   struct mp_config_intsrc *mp_irq)
+static void assign_to_mp_irq(struct mpc_intsrc *m,
+                                   struct mpc_intsrc *mp_irq)
 {
-       memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+       memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
 }
 
-static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
-                               struct mp_config_intsrc *m)
+static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
+                               struct mpc_intsrc *m)
 {
-       return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+       return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
 }
 
-static void save_mp_irq(struct mp_config_intsrc *m)
+static void save_mp_irq(struct mpc_intsrc *m)
 {
        int i;
 
@@ -1003,7 +1003,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
        int ioapic;
        int pin;
-       struct mp_config_intsrc mp_irq;
+       struct mpc_intsrc mp_irq;
 
        /*
         * Convert 'gsi' to 'ioapic.pin'.
@@ -1021,13 +1021,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
        if ((bus_irq == 0) && (trigger == 3))
                trigger = 1;
 
-       mp_irq.mp_type = MP_INTSRC;
-       mp_irq.mp_irqtype = mp_INT;
-       mp_irq.mp_irqflag = (trigger << 2) | polarity;
-       mp_irq.mp_srcbus = MP_ISA_BUS;
-       mp_irq.mp_srcbusirq = bus_irq;  /* IRQ */
-       mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
-       mp_irq.mp_dstirq = pin; /* INTIN# */
+       mp_irq.type = MP_INTSRC;
+       mp_irq.irqtype = mp_INT;
+       mp_irq.irqflag = (trigger << 2) | polarity;
+       mp_irq.srcbus = MP_ISA_BUS;
+       mp_irq.srcbusirq = bus_irq;     /* IRQ */
+       mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+       mp_irq.dstirq = pin;    /* INTIN# */
 
        save_mp_irq(&mp_irq);
 }
@@ -1037,7 +1037,7 @@ void __init mp_config_acpi_legacy_irqs(void)
        int i;
        int ioapic;
        unsigned int dstapic;
-       struct mp_config_intsrc mp_irq;
+       struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
        /*
@@ -1062,7 +1062,7 @@ void __init mp_config_acpi_legacy_irqs(void)
        ioapic = mp_find_ioapic(0);
        if (ioapic < 0)
                return;
-       dstapic = mp_ioapics[ioapic].mp_apicid;
+       dstapic = mp_ioapics[ioapic].apicid;
 
        /*
         * Use the default configuration for the IRQs 0-15.  Unless
@@ -1072,16 +1072,14 @@ void __init mp_config_acpi_legacy_irqs(void)
                int idx;
 
                for (idx = 0; idx < mp_irq_entries; idx++) {
-                       struct mp_config_intsrc *irq = mp_irqs + idx;
+                       struct mpc_intsrc *irq = mp_irqs + idx;
 
                        /* Do we already have a mapping for this ISA IRQ? */
-                       if (irq->mp_srcbus == MP_ISA_BUS
-                           && irq->mp_srcbusirq == i)
+                       if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
                                break;
 
                        /* Do we already have a mapping for this IOAPIC pin */
-                       if (irq->mp_dstapic == dstapic &&
-                           irq->mp_dstirq == i)
+                       if (irq->dstapic == dstapic && irq->dstirq == i)
                                break;
                }
 
@@ -1090,13 +1088,13 @@ void __init mp_config_acpi_legacy_irqs(void)
                        continue;       /* IRQ already used */
                }
 
-               mp_irq.mp_type = MP_INTSRC;
-               mp_irq.mp_irqflag = 0;  /* Conforming */
-               mp_irq.mp_srcbus = MP_ISA_BUS;
-               mp_irq.mp_dstapic = dstapic;
-               mp_irq.mp_irqtype = mp_INT;
-               mp_irq.mp_srcbusirq = i; /* Identity mapped */
-               mp_irq.mp_dstirq = i;
+               mp_irq.type = MP_INTSRC;
+               mp_irq.irqflag = 0;     /* Conforming */
+               mp_irq.srcbus = MP_ISA_BUS;
+               mp_irq.dstapic = dstapic;
+               mp_irq.irqtype = mp_INT;
+               mp_irq.srcbusirq = i; /* Identity mapped */
+               mp_irq.dstirq = i;
 
                save_mp_irq(&mp_irq);
        }
@@ -1207,22 +1205,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
                        u32 gsi, int triggering, int polarity)
 {
 #ifdef CONFIG_X86_MPPARSE
-       struct mp_config_intsrc mp_irq;
+       struct mpc_intsrc mp_irq;
        int ioapic;
 
        if (!acpi_ioapic)
                return 0;
 
        /* print the entry should happen on mptable identically */
-       mp_irq.mp_type = MP_INTSRC;
-       mp_irq.mp_irqtype = mp_INT;
-       mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+       mp_irq.type = MP_INTSRC;
+       mp_irq.irqtype = mp_INT;
+       mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
                                (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-       mp_irq.mp_srcbus = number;
-       mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+       mp_irq.srcbus = number;
+       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
        ioapic = mp_find_ioapic(gsi);
-       mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
-       mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+       mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+       mp_irq.dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
 
        save_mp_irq(&mp_irq);
 #endif
index a60c1f3..7c243a2 100644 (file)
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
        stack_start.sp = temp_stack + sizeof(temp_stack);
        early_gdt_descr.address =
                        (unsigned long)get_cpu_gdt_table(smp_processor_id());
+       initial_gs = per_cpu_offset(smp_processor_id());
 #endif
        initial_code = (unsigned long)wakeup_long64;
        saved_magic = 0x123456789abcdef0;
index 115449f..383d827 100644 (file)
 # error SPURIOUS_APIC_VECTOR definition error
 #endif
 
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the bootup */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+unsigned int max_physical_apicid;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
 #ifdef CONFIG_X86_32
 /*
  * Knob to control our willingness to enable the local APIC.
@@ -1130,6 +1148,13 @@ void __cpuinit setup_local_APIC(void)
        unsigned int value;
        int i, j;
 
+       if (disable_apic) {
+#ifdef CONFIG_X86_IO_APIC
+               disable_ioapic_setup();
+#endif
+               return;
+       }
+
 #ifdef CONFIG_X86_32
        /* Pound the ESR really hard over the head with a big hammer - mbligh */
        if (lapic_is_integrated() && esr_disable) {
@@ -1570,11 +1595,11 @@ int apic_version[MAX_APICS];
 
 int __init APIC_init_uniprocessor(void)
 {
-#ifdef CONFIG_X86_64
        if (disable_apic) {
                pr_info("Apic disabled\n");
                return -1;
        }
+#ifdef CONFIG_X86_64
        if (!cpu_has_apic) {
                disable_apic = 1;
                pr_info("Apic disabled by BIOS\n");
@@ -1877,17 +1902,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
 #endif
 
 #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
-       /* are we being called early in kernel startup? */
-       if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
-               u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-               u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
-               cpu_to_apicid[cpu] = apicid;
-               bios_cpu_apicid[cpu] = apicid;
-       } else {
-               per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-               per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-       }
+       early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+       early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 #endif
 
        set_cpu_possible(cpu, true);
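
generic_processor_info() above collapses the "early or not" branches into a single early_per_cpu() assignment. A userspace model of that dual-path accessor (names, types and sizes made up for illustration, not the real macro):

#include <stdio.h>

#define NR_CPUS 4

static unsigned short early_map[NR_CPUS];       /* boot-time storage */
static unsigned short *early_ptr = early_map;   /* cleared once per-cpu areas exist */
static unsigned short percpu_map[NR_CPUS];      /* stands in for per_cpu() storage */

/* lvalue accessor: early array while early_ptr is set, per-cpu copy after */
#define early_per_cpu_apicid(cpu) \
        (*(early_ptr ? &early_ptr[(cpu)] : &percpu_map[(cpu)]))

int main(void)
{
        early_per_cpu_apicid(1) = 0x11;         /* early write goes to the boot array */
        early_ptr = NULL;                       /* "per-cpu areas are up" */
        early_per_cpu_apicid(1) = 0x21;         /* now hits the per-cpu copy */
        printf("early=%#x percpu=%#x\n", (unsigned)early_map[1], (unsigned)percpu_map[1]);
        return 0;
}
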
index 1d41d3f..8793ab3 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/hardirq.h>
 #include <linux/suspend.h>
 #include <linux/kbuild.h>
-#include <asm/pda.h>
 #include <asm/processor.h>
 #include <asm/segment.h>
 #include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
 #endif
        BLANK();
 #undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
-       ENTRY(kernelstack); 
-       ENTRY(oldrsp); 
-       ENTRY(pcurrent); 
-       ENTRY(irqcount);
-       ENTRY(cpunumber);
-       ENTRY(irqstackptr);
-       ENTRY(data_offset);
-       BLANK();
-#undef ENTRY
 #ifdef CONFIG_PARAVIRT
        BLANK();
        OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
index 83492b1..0f73ea4 100644 (file)
 #include <asm/asm.h>
 #include <asm/numa.h>
 #include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/cpumask.h>
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/mpspec.h>
 #include <asm/apic.h>
 #include <mach_apic.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 #endif
 
-#include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
@@ -50,6 +52,15 @@ cpumask_var_t cpu_initialized_mask;
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
 
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+       alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+       alloc_bootmem_cpumask_var(&cpu_callin_mask);
+       alloc_bootmem_cpumask_var(&cpu_callout_mask);
+       alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
 #else /* CONFIG_X86_32 */
 
 cpumask_t cpu_callin_map;
@@ -62,23 +73,23 @@ cpumask_t cpu_sibling_setup_map;
 
 static struct cpu_dev *this_cpu __cpuinitdata;
 
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types  kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+       /*
+        * We need valid kernel segments for data and code in long mode too
+        * IRET will check the segment types  kkeil 2000/10/28
+        * Also sysret mandates a special GDT layout
+        *
+        * The TLS descriptors are currently at a different place compared to i386.
+        * Hopefully nobody expects them at a fixed place (Wine?)
+        */
        [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
        [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
        [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
        [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
        [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
        [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
 #else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
        [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
        [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
        [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -110,9 +121,9 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
        [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
        [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-       [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+       [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
 #endif
+} };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
 #ifdef CONFIG_X86_32
@@ -242,18 +253,28 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
 
 __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
 
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+       loadsegment(fs, __KERNEL_PERCPU);
+#else
+       loadsegment(gs, 0);
+       wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#endif
+}
+
 /* Current gdt points %fs at the "master" per-cpu area: after this,
  * it's on the real one. */
-void switch_to_new_gdt(void)
+void switch_to_new_gdt(int cpu)
 {
        struct desc_ptr gdt_descr;
 
-       gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+       gdt_descr.address = (long)get_cpu_gdt_table(cpu);
        gdt_descr.size = GDT_SIZE - 1;
        load_gdt(&gdt_descr);
-#ifdef CONFIG_X86_32
-       asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-#endif
+       /* Reload the per-cpu base */
+
+       load_percpu_segment(cpu);
 }
 
 static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -877,54 +898,26 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+                    irq_stack_union) __aligned(PAGE_SIZE);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(char *, irq_stack_ptr); /* will be set during per cpu init */
+#else
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+       per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+#endif
 
-void __cpuinit pda_init(int cpu)
-{
-       struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+       (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
-       /* Setup up data that may be needed in __get_free_pages early */
-       loadsegment(fs, 0);
-       loadsegment(gs, 0);
-       /* Memory clobbers used to order PDA accessed */
-       mb();
-       wrmsrl(MSR_GS_BASE, pda);
-       mb();
-
-       pda->cpunumber = cpu;
-       pda->irqcount = -1;
-       pda->kernelstack = (unsigned long)stack_thread_info() -
-                                PDA_STACKOFFSET + THREAD_SIZE;
-       pda->active_mm = &init_mm;
-       pda->mmu_state = 0;
-
-       if (cpu == 0) {
-               /* others are initialized in smpboot.c */
-               pda->pcurrent = &init_task;
-               pda->irqstackptr = boot_cpu_stack;
-               pda->irqstackptr += IRQSTACKSIZE - 64;
-       } else {
-               if (!pda->irqstackptr) {
-                       pda->irqstackptr = (char *)
-                               __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-                       if (!pda->irqstackptr)
-                               panic("cannot allocate irqstack for cpu %d",
-                                     cpu);
-                       pda->irqstackptr += IRQSTACKSIZE - 64;
-               }
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
-               if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-                       pda->nodenumber = cpu_to_node(cpu);
-       }
-}
-
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-                                 DEBUG_STKSZ] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+       __aligned(PAGE_SIZE);
 
 extern asmlinkage void ignore_sysret(void);
 
@@ -982,15 +975,14 @@ void __cpuinit cpu_init(void)
        struct tss_struct *t = &per_cpu(init_tss, cpu);
        struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
        unsigned long v;
-       char *estacks = NULL;
        struct task_struct *me;
        int i;
 
-       /* CPU 0 is initialised in head64.c */
-       if (cpu != 0)
-               pda_init(cpu);
-       else
-               estacks = boot_exception_stacks;
+#ifdef CONFIG_NUMA
+       if (cpu != 0 && percpu_read(node_number) == 0 &&
+           cpu_to_node(cpu) != NUMA_NO_NODE)
+               percpu_write(node_number, cpu_to_node(cpu));
+#endif
 
        me = current;
 
@@ -1006,7 +998,9 @@ void __cpuinit cpu_init(void)
         * and set up the GDT descriptor:
         */
 
-       switch_to_new_gdt();
+       switch_to_new_gdt(cpu);
+       loadsegment(fs, 0);
+
        load_idt((const struct desc_ptr *)&idt_descr);
 
        memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1024,18 +1018,13 @@ void __cpuinit cpu_init(void)
         * set up and load the per-CPU TSS
         */
        if (!orig_ist->ist[0]) {
-               static const unsigned int order[N_EXCEPTION_STACKS] = {
-                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-                 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+               static const unsigned int sizes[N_EXCEPTION_STACKS] = {
+                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+                 [DEBUG_STACK - 1] = DEBUG_STKSZ
                };
+               char *estacks = per_cpu(exception_stacks, cpu);
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-                       if (cpu) {
-                               estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-                               if (!estacks)
-                                       panic("Cannot allocate exception "
-                                             "stack %ld %d\n", v, cpu);
-                       }
-                       estacks += PAGE_SIZE << order[v];
+                       estacks += sizes[v];
                        orig_ist->ist[v] = t->x86_tss.ist[v] =
                                        (unsigned long)estacks;
                }
@@ -1114,7 +1103,7 @@ void __cpuinit cpu_init(void)
                clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
        load_idt(&idt_descr);
-       switch_to_new_gdt();
+       switch_to_new_gdt(cpu);
 
        /*
         * Set up and load the per-CPU TSS and LDT
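
cpu_init() above now carves the per-CPU exception stacks out of one statically sized per-cpu array instead of allocating pages per CPU. A runnable sketch of the carving loop, with hypothetical stack counts and sizes (gcc range initializers assumed, as in the kernel):

#include <stdio.h>

#define N_EXCEPTION_STACKS 4                    /* hypothetical */
#define EXCEPTION_STKSZ    4096
#define DEBUG_STKSZ        8192
#define DEBUG_STACK        4                    /* hypothetical 1-based index */

static char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
                             DEBUG_STKSZ];

int main(void)
{
        static const unsigned int sizes[N_EXCEPTION_STACKS] = {
                [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
                [DEBUG_STACK - 1] = DEBUG_STKSZ,
        };
        char *estacks = exception_stacks;
        unsigned long ist[N_EXCEPTION_STACKS];

        for (int v = 0; v < N_EXCEPTION_STACKS; v++) {
                estacks += sizes[v];            /* each IST points at the top of its slice */
                ist[v] = (unsigned long)estacks;
                printf("IST[%d] = +%lu\n", v, ist[v] - (unsigned long)exception_stacks);
        }
        return 0;
}
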
index da299eb..7293508 100644 (file)
@@ -147,7 +147,16 @@ struct _cpuid4_info {
        union _cpuid4_leaf_ecx ecx;
        unsigned long size;
        unsigned long can_disable;
-       cpumask_t shared_cpu_map;       /* future?: only cpus/node is needed */
+       DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
+};
+
+/* subset of above _cpuid4_info w/o shared_cpu_map */
+struct _cpuid4_info_regs {
+       union _cpuid4_leaf_eax eax;
+       union _cpuid4_leaf_ebx ebx;
+       union _cpuid4_leaf_ecx ecx;
+       unsigned long size;
+       unsigned long can_disable;
 };
 
 #ifdef CONFIG_PCI
@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 }
 
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
        if (index < 3)
                return;
@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
 }
 
 static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+__cpuinit cpuid4_cache_lookup_regs(int index,
+                                  struct _cpuid4_info_regs *this_leaf)
 {
        union _cpuid4_leaf_eax  eax;
        union _cpuid4_leaf_ebx  ebx;
@@ -314,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
        return 0;
 }
 
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+{
+       struct _cpuid4_info_regs *leaf_regs =
+               (struct _cpuid4_info_regs *)this_leaf;
+
+       return cpuid4_cache_lookup_regs(index, leaf_regs);
+}
+
 static int __cpuinit find_num_cache_leaves(void)
 {
        unsigned int            eax, ebx, ecx, edx;
@@ -353,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
                 * parameters cpuid leaf to find the cache details
                 */
                for (i = 0; i < num_cache_leaves; i++) {
-                       struct _cpuid4_info this_leaf;
-
+                       struct _cpuid4_info_regs this_leaf;
                        int retval;
 
-                       retval = cpuid4_cache_lookup(i, &this_leaf);
+                       retval = cpuid4_cache_lookup_regs(i, &this_leaf);
                        if (retval >= 0) {
                                switch(this_leaf.eax.split.level) {
                                    case 1:
@@ -506,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
        num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
 
        if (num_threads_sharing == 1)
-               cpu_set(cpu, this_leaf->shared_cpu_map);
+               cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
        else {
                index_msb = get_count_order(num_threads_sharing);
 
                for_each_online_cpu(i) {
                        if (cpu_data(i).apicid >> index_msb ==
                            c->apicid >> index_msb) {
-                               cpu_set(i, this_leaf->shared_cpu_map);
+                               cpumask_set_cpu(i,
+                                       to_cpumask(this_leaf->shared_cpu_map));
                                if (i != cpu && per_cpu(cpuid4_info, i))  {
-                                       sibling_leaf = CPUID4_INFO_IDX(i, index);
-                                       cpu_set(cpu, sibling_leaf->shared_cpu_map);
+                                       sibling_leaf =
+                                               CPUID4_INFO_IDX(i, index);
+                                       cpumask_set_cpu(cpu, to_cpumask(
+                                               sibling_leaf->shared_cpu_map));
                                }
                        }
                }
@@ -528,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
        int sibling;
 
        this_leaf = CPUID4_INFO_IDX(cpu, index);
-       for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
+       for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
                sibling_leaf = CPUID4_INFO_IDX(sibling, index);
-               cpu_clear(cpu, sibling_leaf->shared_cpu_map);
+               cpumask_clear_cpu(cpu,
+                                 to_cpumask(sibling_leaf->shared_cpu_map));
        }
 }
 #else
@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
        int n = 0;
 
        if (len > 1) {
-               cpumask_t *mask = &this_leaf->shared_cpu_map;
+               const struct cpumask *mask;
 
+               mask = to_cpumask(this_leaf->shared_cpu_map);
                n = type?
                        cpulist_scnprintf(buf, len-2, mask) :
                        cpumask_scnprintf(buf, len-2, mask);
@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 {
-       int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+       int node = cpu_to_node(cpumask_first(mask));
        struct pci_dev *dev = NULL;
        ssize_t ret = 0;
        int i;
@@ -733,7 +757,8 @@ static ssize_t
 store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
                    size_t count)
 {
-       int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+       int node = cpu_to_node(cpumask_first(mask));
        struct pci_dev *dev = NULL;
        unsigned int ret, index, val;
 
@@ -878,7 +903,7 @@ err_out:
        return -ENOMEM;
 }
 
-static cpumask_t cache_dev_map = CPU_MASK_NONE;
+static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
 
 /* Add/Remove cache interface for CPU device */
 static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
                }
                kobject_uevent(&(this_object->kobj), KOBJ_ADD);
        }
-       cpu_set(cpu, cache_dev_map);
+       cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
 
        kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
        return 0;
@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
 
        if (per_cpu(cpuid4_info, cpu) == NULL)
                return;
-       if (!cpu_isset(cpu, cache_dev_map))
+       if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
                return;
-       cpu_clear(cpu, cache_dev_map);
+       cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
 
        for (i = 0; i < num_cache_leaves; i++)
                kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
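
shared_cpu_map above becomes a DECLARE_BITMAP() sized to NR_CPUS bits, converted with to_cpumask() only where a mask pointer is needed. A userspace model of that storage choice (simplified set/test helpers, not the kernel bitops):

#include <stdio.h>
#include <limits.h>

#define NR_CPUS 64
#define BITS_PER_LONG (CHAR_BIT * sizeof(long))
#define DECLARE_BITMAP(name, bits) \
        unsigned long name[((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG]

static DECLARE_BITMAP(shared_cpu_map, NR_CPUS); /* raw longs, not a full cpumask_t */

static void set_cpu(unsigned long *map, int cpu)
{
        map[cpu / BITS_PER_LONG] |= 1UL << (cpu % BITS_PER_LONG);
}

static int test_cpu(const unsigned long *map, int cpu)
{
        return (map[cpu / BITS_PER_LONG] >> (cpu % BITS_PER_LONG)) & 1;
}

int main(void)
{
        set_cpu(shared_cpu_map, 3);
        printf("cpu3 shared: %d\n", test_cpu(shared_cpu_map, 3));
        return 0;
}
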
index 8ae8c4f..4772e91 100644 (file)
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
 struct threshold_bank {
        struct kobject *kobj;
        struct threshold_block *blocks;
-       cpumask_t cpus;
+       cpumask_var_t cpus;
 };
 static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
 
@@ -481,7 +481,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 #ifdef CONFIG_SMP
        if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
-               i = first_cpu(per_cpu(cpu_core_map, cpu));
+               i = cpumask_first(&per_cpu(cpu_core_map, cpu));
 
                /* first core not up yet */
                if (cpu_data(i).cpu_core_id)
@@ -501,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
                if (err)
                        goto out;
 
-               b->cpus = per_cpu(cpu_core_map, cpu);
+               cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
                per_cpu(threshold_banks, cpu)[bank] = b;
                goto out;
        }
@@ -512,15 +512,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
                err = -ENOMEM;
                goto out;
        }
+       if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+               kfree(b);
+               err = -ENOMEM;
+               goto out;
+       }
 
        b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
        if (!b->kobj)
                goto out_free;
 
 #ifndef CONFIG_SMP
-       b->cpus = CPU_MASK_ALL;
+       cpumask_setall(b->cpus);
 #else
-       b->cpus = per_cpu(cpu_core_map, cpu);
+       cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
 #endif
 
        per_cpu(threshold_banks, cpu)[bank] = b;
@@ -529,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
        if (err)
                goto out_free;
 
-       for_each_cpu_mask_nr(i, b->cpus) {
+       for_each_cpu(i, b->cpus) {
                if (i == cpu)
                        continue;
 
@@ -545,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 out_free:
        per_cpu(threshold_banks, cpu)[bank] = NULL;
+       free_cpumask_var(b->cpus);
        kfree(b);
 out:
        return err;
@@ -619,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #endif
 
        /* remove all sibling symlinks before unregistering */
-       for_each_cpu_mask_nr(i, b->cpus) {
+       for_each_cpu(i, b->cpus) {
                if (i == cpu)
                        continue;
 
@@ -632,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 free_out:
        kobject_del(b->kobj);
        kobject_put(b->kobj);
+       free_cpumask_var(b->cpus);
        kfree(b);
        per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
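
The threshold_bank conversion above turns cpus into a separately allocated cpumask_var_t, so every exit path taken after the allocation must release it, which is what the new free_cpumask_var() calls do. A userspace model of that error-path discipline (plain calloc/free as stand-ins):

#include <stdlib.h>

struct bank {
        unsigned long *cpus;
};

static struct bank *bank_create(void)
{
        struct bank *b = calloc(1, sizeof(*b));

        if (!b)
                return NULL;
        b->cpus = calloc(1, sizeof(unsigned long));     /* alloc_cpumask_var() stand-in */
        if (!b->cpus) {
                free(b);                                /* the added error path */
                return NULL;
        }
        return b;
}

static void bank_destroy(struct bank *b)
{
        free(b->cpus);                                  /* free_cpumask_var() stand-in */
        free(b);
}

int main(void)
{
        struct bank *b = bank_create();

        if (b)
                bank_destroy(b);
        return 0;
}
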
index 4b48f25..5e8c79e 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
+#include <asm/apic.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
index c689d19..11b93ca 100644 (file)
@@ -24,7 +24,7 @@
 #include <asm/apic.h>
 #include <asm/hpet.h>
 #include <linux/kdebug.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
 #include <asm/reboot.h>
 #include <asm/virtext.h>
 
index c302d07..d35db59 100644 (file)
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                const struct stacktrace_ops *ops, void *data)
 {
        const unsigned cpu = get_cpu();
-       unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+       unsigned long *irq_stack_end =
+               (unsigned long *)per_cpu(irq_stack_ptr, cpu);
        unsigned used = 0;
        struct thread_info *tinfo;
        int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                        stack = (unsigned long *) estack_end[-2];
                        continue;
                }
-               if (irqstack_end) {
-                       unsigned long *irqstack;
-                       irqstack = irqstack_end -
-                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
+               if (irq_stack_end) {
+                       unsigned long *irq_stack;
+                       irq_stack = irq_stack_end -
+                               (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
 
-                       if (stack >= irqstack && stack < irqstack_end) {
+                       if (stack >= irq_stack && stack < irq_stack_end) {
                                if (ops->stack(data, "IRQ") < 0)
                                        break;
                                bp = print_context_stack(tinfo, stack, bp,
-                                       ops, data, irqstack_end, &graph);
+                                       ops, data, irq_stack_end, &graph);
                                /*
                                 * We link to the next stack (which would be
                                 * the process stack normally) the last
                                 * pointer (index -1 to end) in the IRQ stack:
                                 */
-                               stack = (unsigned long *) (irqstack_end[-1]);
-                               irqstack_end = NULL;
+                               stack = (unsigned long *) (irq_stack_end[-1]);
+                               irq_stack_end = NULL;
                                ops->stack(data, "EOI");
                                continue;
                        }
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        unsigned long *stack;
        int i;
        const int cpu = smp_processor_id();
-       unsigned long *irqstack_end =
-               (unsigned long *) (cpu_pda(cpu)->irqstackptr);
-       unsigned long *irqstack =
-               (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+       unsigned long *irq_stack_end =
+               (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+       unsigned long *irq_stack =
+               (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
 
        /*
         * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
        stack = sp;
        for (i = 0; i < kstack_depth_to_print; i++) {
-               if (stack >= irqstack && stack <= irqstack_end) {
-                       if (stack == irqstack_end) {
-                               stack = (unsigned long *) (irqstack_end[-1]);
+               if (stack >= irq_stack && stack <= irq_stack_end) {
+                       if (stack == irq_stack_end) {
+                               stack = (unsigned long *) (irq_stack_end[-1]);
                                printk(" <EOI> ");
                        }
                } else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
        int i;
        unsigned long sp;
        const int cpu = smp_processor_id();
-       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+       struct task_struct *cur = current;
 
        sp = regs->sp;
        printk("CPU %d ", cpu);
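
The unwinder above now derives the IRQ stack bounds from per_cpu(irq_stack_ptr) instead of the pda. A sketch of the range test it performs, using the 64-byte top gap seen elsewhere in this series and an otherwise hypothetical address:

#include <stdio.h>
#include <stdint.h>

#define IRQ_STACK_SIZE (16 * 1024)

static int on_irq_stack(uintptr_t sp, uintptr_t irq_stack_end)
{
        /* window of (IRQ_STACK_SIZE - 64) bytes ending at irq_stack_ptr */
        uintptr_t irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64);

        return sp >= irq_stack && sp < irq_stack_end;
}

int main(void)
{
        uintptr_t end = 0x4000;                 /* hypothetical irq_stack_ptr value */

        printf("%d %d\n", on_irq_stack(end - 8, end), on_irq_stack(end + 8, end));
        return 0;
}
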
index 1119d24..b205272 100644 (file)
@@ -366,10 +366,12 @@ void __init efi_init(void)
                                        SMBIOS_TABLE_GUID)) {
                        efi.smbios = config_tables[i].table;
                        printk(" SMBIOS=0x%lx ", config_tables[i].table);
+#ifdef CONFIG_X86_UV
                } else if (!efi_guidcmp(config_tables[i].guid,
                                        UV_SYSTEM_TABLE_GUID)) {
                        efi.uv_systab = config_tables[i].table;
                        printk(" UVsystab=0x%lx ", config_tables[i].table);
+#endif
                } else if (!efi_guidcmp(config_tables[i].guid,
                                        HCDP_TABLE_GUID)) {
                        efi.hcdp = config_tables[i].table;
index 652c528..a4ee291 100644 (file)
@@ -36,6 +36,7 @@
 #include <asm/proto.h>
 #include <asm/efi.h>
 #include <asm/cacheflush.h>
+#include <asm/fixmap.h>
 
 static pgd_t save_pgd __initdata;
 static unsigned long efi_flags __initdata;
index 4646902..a0b91aa 100644 (file)
@@ -672,7 +672,7 @@ common_interrupt:
 ENDPROC(common_interrupt)
        CFI_ENDPROC
 
-#define BUILD_INTERRUPT(name, nr)      \
+#define BUILD_INTERRUPT3(name, nr, fn) \
 ENTRY(name)                            \
        RING0_INT_FRAME;                \
        pushl $~(nr);                   \
@@ -680,11 +680,13 @@ ENTRY(name)                               \
        SAVE_ALL;                       \
        TRACE_IRQS_OFF                  \
        movl %esp,%eax;                 \
-       call smp_##name;                \
+       call fn;                        \
        jmp ret_from_intr;              \
        CFI_ENDPROC;                    \
 ENDPROC(name)
 
+#define BUILD_INTERRUPT(name, nr)      BUILD_INTERRUPT3(name, nr, smp_##name)
+
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
index a134621..586bed6 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/irqflags.h>
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
+#include <asm/percpu.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -209,7 +210,7 @@ ENTRY(native_usergs_sysret64)
 
        /* %rsp:at FRAMEEND */
        .macro FIXUP_TOP_OF_STACK tmp offset=0
-       movq %gs:pda_oldrsp,\tmp
+       movq PER_CPU_VAR(old_rsp),\tmp
        movq \tmp,RSP+\offset(%rsp)
        movq $__USER_DS,SS+\offset(%rsp)
        movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +221,7 @@ ENTRY(native_usergs_sysret64)
 
        .macro RESTORE_TOP_OF_STACK tmp offset=0
        movq RSP+\offset(%rsp),\tmp
-       movq \tmp,%gs:pda_oldrsp
+       movq \tmp,PER_CPU_VAR(old_rsp)
        movq EFLAGS+\offset(%rsp),\tmp
        movq \tmp,R11+\offset(%rsp)
        .endm
@@ -336,15 +337,15 @@ ENTRY(save_args)
        je 1f
        SWAPGS
        /*
-        * irqcount is used to check if a CPU is already on an interrupt stack
+        * irq_count is used to check if a CPU is already on an interrupt stack
         * or not. While this is essentially redundant with preempt_count it is
         * a little cheaper to use a separate counter in the PDA (short of
         * moving irq_enter into assembly, which would be too much work)
         */
-1:     incl %gs:pda_irqcount
+1:     incl PER_CPU_VAR(irq_count)
        jne 2f
        popq_cfi %rax                   /* move return address... */
-       mov %gs:pda_irqstackptr,%rsp
+       mov PER_CPU_VAR(irq_stack_ptr),%rsp
        EMPTY_FRAME 0
        pushq_cfi %rbp                  /* backlink for unwinder */
        pushq_cfi %rax                  /* ... to the new stack */
@@ -468,7 +469,7 @@ END(ret_from_fork)
 ENTRY(system_call)
        CFI_STARTPROC   simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
+       CFI_DEF_CFA     rsp,KERNEL_STACK_OFFSET
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
@@ -479,8 +480,8 @@ ENTRY(system_call)
         */
 ENTRY(system_call_after_swapgs)
 
-       movq    %rsp,%gs:pda_oldrsp
-       movq    %gs:pda_kernelstack,%rsp
+       movq    %rsp,PER_CPU_VAR(old_rsp)
+       movq    PER_CPU_VAR(kernel_stack),%rsp
        /*
         * No need to follow this irqs off/on section - it's straight
         * and short:
@@ -523,7 +524,7 @@ sysret_check:
        CFI_REGISTER    rip,rcx
        RESTORE_ARGS 0,-ARG_SKIP,1
        /*CFI_REGISTER  rflags,r11*/
-       movq    %gs:pda_oldrsp, %rsp
+       movq    PER_CPU_VAR(old_rsp), %rsp
        USERGS_SYSRET64
 
        CFI_RESTORE_STATE
@@ -833,11 +834,11 @@ common_interrupt:
        XCPT_FRAME
        addq $-0x80,(%rsp)              /* Adjust vector to [-256,-1] range */
        interrupt do_IRQ
-       /* 0(%rsp): oldrsp-ARGOFFSET */
+       /* 0(%rsp): old_rsp-ARGOFFSET */
 ret_from_intr:
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       decl %gs:pda_irqcount
+       decl PER_CPU_VAR(irq_count)
        leaveq
        CFI_DEF_CFA_REGISTER    rsp
        CFI_ADJUST_CFA_OFFSET   -8
@@ -982,8 +983,10 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
        irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
 #endif
 
+#ifdef CONFIG_X86_UV
 apicinterrupt UV_BAU_MESSAGE \
        uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
 apicinterrupt LOCAL_TIMER_VECTOR \
        apic_timer_interrupt smp_apic_timer_interrupt
 
@@ -1073,10 +1076,10 @@ ENTRY(\sym)
        TRACE_IRQS_OFF
        movq %rsp,%rdi          /* pt_regs pointer */
        xorl %esi,%esi          /* no error code */
-       movq %gs:pda_data_offset, %rbp
-       subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+       PER_CPU(init_tss, %rbp)
+       subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
        call \do_sym
-       addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+       addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
        jmp paranoid_exit       /* %ebx: no swapgs flag */
        CFI_ENDPROC
 END(\sym)
@@ -1138,7 +1141,7 @@ ENTRY(native_load_gs_index)
        CFI_STARTPROC
        pushf
        CFI_ADJUST_CFA_OFFSET 8
-       DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+       DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
        SWAPGS
 gs_change:
        movl %edi,%gs
@@ -1260,14 +1263,14 @@ ENTRY(call_softirq)
        CFI_REL_OFFSET rbp,0
        mov  %rsp,%rbp
        CFI_DEF_CFA_REGISTER rbp
-       incl %gs:pda_irqcount
-       cmove %gs:pda_irqstackptr,%rsp
+       incl PER_CPU_VAR(irq_count)
+       cmove PER_CPU_VAR(irq_stack_ptr),%rsp
        push  %rbp                      # backlink for old unwinder
        call __do_softirq
        leaveq
        CFI_DEF_CFA_REGISTER    rsp
        CFI_ADJUST_CFA_OFFSET   -8
-       decl %gs:pda_irqcount
+       decl PER_CPU_VAR(irq_count)
        ret
        CFI_ENDPROC
 END(call_softirq)
@@ -1297,15 +1300,15 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
        movq %rdi, %rsp            # we don't return, adjust the stack frame
        CFI_ENDPROC
        DEFAULT_FRAME
-11:    incl %gs:pda_irqcount
+11:    incl PER_CPU_VAR(irq_count)
        movq %rsp,%rbp
        CFI_DEF_CFA_REGISTER rbp
-       cmovzq %gs:pda_irqstackptr,%rsp
+       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
        pushq %rbp                      # backlink for old unwinder
        call xen_evtchn_do_upcall
        popq %rsp
        CFI_DEF_CFA_REGISTER rsp
-       decl %gs:pda_irqcount
+       decl PER_CPU_VAR(irq_count)
        jmp  error_exit
        CFI_ENDPROC
 END(do_hypervisor_callback)
index 2bced78..e656c27 100644 (file)
@@ -32,7 +32,9 @@ extern struct genapic apic_x2apic_cluster;
 struct genapic __read_mostly *genapic = &apic_flat;
 
 static struct genapic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_UV
        &apic_x2apic_uv_x,
+#endif
        &apic_x2apic_phys,
        &apic_x2apic_cluster,
        &apic_physflat,
index b193e08..bfe3624 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/ipi.h>
 #include <asm/genapic.h>
 #include <asm/pgtable.h>
+#include <asm/uv/uv.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/bios.h>
index b9a4d8c..f5b2722 100644 (file)
 #include <asm/bios_ebda.h>
 #include <asm/trampoline.h>
 
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
-       _cpu_pda = __cpu_pda;
-       cpu_pda(0) = &_boot_cpu_pda;
-       pda_init(0);
-}
-
 static void __init zap_identity_mappings(void)
 {
        pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
        if (console_loglevel == 10)
                early_printk("Kernel alive\n");
 
-       x86_64_init_pda();
-
        x86_64_start_reservations(real_mode_data);
 }
 
index e835b4e..24c0e5c 100644 (file)
@@ -429,12 +429,14 @@ is386:    movl $2,%ecx            # set MP
        ljmp $(__KERNEL_CS),$1f
 1:     movl $(__KERNEL_DS),%eax        # reload all the segment registers
        movl %eax,%ss                   # after changing gdt.
-       movl %eax,%fs                   # gets reset once there's real percpu
 
        movl $(__USER_DS),%eax          # DS/ES contains default USER segment
        movl %eax,%ds
        movl %eax,%es
 
+       movl $(__KERNEL_PERCPU), %eax
+       movl %eax,%fs                   # set this cpu's percpu
+
        xorl %eax,%eax                  # Clear GS and LDT
        movl %eax,%gs
        lldt %ax
@@ -446,8 +448,6 @@ is386:      movl $2,%ecx            # set MP
        movb $1, ready
        cmpb $0,%cl             # the first CPU calls start_kernel
        je   1f
-       movl $(__KERNEL_PERCPU), %eax
-       movl %eax,%fs           # set this cpu's percpu
        movl (stack_start), %esp
 1:
 #endif /* CONFIG_SMP */
index 0e275d4..a0a2b5c 100644 (file)
@@ -19,6 +19,7 @@
 #include <asm/msr.h>
 #include <asm/cache.h>
 #include <asm/processor-flags.h>
+#include <asm/percpu.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
@@ -204,6 +205,19 @@ ENTRY(secondary_startup_64)
        pushq $0
        popfq
 
+#ifdef CONFIG_SMP
+       /*
+        * Fix up static pointers that need __per_cpu_load added.  The assembler
+        * is unable to do this directly.  This is only needed for the boot cpu.
+        * These values are set up with the correct base addresses by C code for
+        * secondary cpus.
+        */
+       movq    initial_gs(%rip), %rax
+       cmpl    $0, per_cpu__cpu_number(%rax)
+       jne     1f
+       addq    %rax, early_gdt_descr_base(%rip)
+1:
+#endif
        /*
         * We must switch to a new descriptor in kernel space for the GDT
         * because soon the kernel won't have access anymore to the userspace
@@ -226,12 +240,15 @@ ENTRY(secondary_startup_64)
        movl %eax,%fs
        movl %eax,%gs
 
-       /* 
-        * Setup up a dummy PDA. this is just for some early bootup code
-        * that does in_interrupt() 
-        */ 
+       /* Set up %gs.
+        *
+        * The base of %gs always points to the bottom of the irqstack
+        * union.  If the stack protector canary is enabled, it is
+        * located at %gs:40.  Note that, on SMP, the boot cpu uses
+        * init data section till per cpu areas are set up.
+        */
        movl    $MSR_GS_BASE,%ecx
-       movq    $empty_zero_page,%rax
+       movq    initial_gs(%rip),%rax
        movq    %rax,%rdx
        shrq    $32,%rdx
        wrmsr   
@@ -257,6 +274,12 @@ ENTRY(secondary_startup_64)
        .align  8
        ENTRY(initial_code)
        .quad   x86_64_start_kernel
+       ENTRY(initial_gs)
+#ifdef CONFIG_SMP
+       .quad   __per_cpu_load
+#else
+       .quad   PER_CPU_VAR(irq_stack_union)
+#endif
        __FINITDATA
 
        ENTRY(stack_start)
@@ -401,7 +424,8 @@ NEXT_PAGE(level2_spare_pgt)
        .globl early_gdt_descr
 early_gdt_descr:
        .word   GDT_ENTRIES*8-1
-       .quad   per_cpu__gdt_page
+early_gdt_descr_base:
+       .quad   per_cpu__gdt_page
 
 ENTRY(phys_base)
        /* This must match the first entry in level2_kernel_pgt */
index 9b0c480..0a7f6d6 100644 (file)
@@ -46,6 +46,7 @@
 #include <asm/idle.h>
 #include <asm/io.h>
 #include <asm/smp.h>
+#include <asm/cpu.h>
 #include <asm/desc.h>
 #include <asm/proto.h>
 #include <asm/acpi.h>
@@ -82,11 +83,11 @@ static DEFINE_SPINLOCK(vector_lock);
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
@@ -356,7 +357,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
 
        if (!cfg->move_in_progress) {
                /* it means that domain is not changed */
-               if (!cpumask_intersects(&desc->affinity, mask))
+               if (!cpumask_intersects(desc->affinity, mask))
                        cfg->move_desc_pending = 1;
        }
 }
@@ -386,7 +387,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
        return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-               + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
+               + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -579,9 +580,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
        if (assign_irq_vector(irq, cfg, mask))
                return BAD_APICID;
 
-       cpumask_and(&desc->affinity, cfg->domain, mask);
+       cpumask_and(desc->affinity, cfg->domain, mask);
        set_extra_move_desc(desc, mask);
-       return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+       return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
 }
 
 static void
@@ -944,10 +945,10 @@ static int find_irq_entry(int apic, int pin, int type)
        int i;
 
        for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mp_irqtype == type &&
-                   (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
-                    mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
-                   mp_irqs[i].mp_dstirq == pin)
+               if (mp_irqs[i].irqtype == type &&
+                   (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+                    mp_irqs[i].dstapic == MP_APIC_ALL) &&
+                   mp_irqs[i].dstirq == pin)
                        return i;
 
        return -1;
@@ -961,13 +962,13 @@ static int __init find_isa_irq_pin(int irq, int type)
        int i;
 
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mp_srcbus;
+               int lbus = mp_irqs[i].srcbus;
 
                if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mp_irqtype == type) &&
-                   (mp_irqs[i].mp_srcbusirq == irq))
+                   (mp_irqs[i].irqtype == type) &&
+                   (mp_irqs[i].srcbusirq == irq))
 
-                       return mp_irqs[i].mp_dstirq;
+                       return mp_irqs[i].dstirq;
        }
        return -1;
 }
@@ -977,17 +978,17 @@ static int __init find_isa_irq_apic(int irq, int type)
        int i;
 
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mp_srcbus;
+               int lbus = mp_irqs[i].srcbus;
 
                if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mp_irqtype == type) &&
-                   (mp_irqs[i].mp_srcbusirq == irq))
+                   (mp_irqs[i].irqtype == type) &&
+                   (mp_irqs[i].srcbusirq == irq))
                        break;
        }
        if (i < mp_irq_entries) {
                int apic;
                for(apic = 0; apic < nr_ioapics; apic++) {
-                       if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
+                       if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
                                return apic;
                }
        }
@@ -1012,23 +1013,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
                return -1;
        }
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mp_srcbus;
+               int lbus = mp_irqs[i].srcbus;
 
                for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
-                           mp_irqs[i].mp_dstapic == MP_APIC_ALL)
+                       if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+                           mp_irqs[i].dstapic == MP_APIC_ALL)
                                break;
 
                if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].mp_irqtype &&
+                   !mp_irqs[i].irqtype &&
                    (bus == lbus) &&
-                   (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
+                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
 
                        if (!(apic || IO_APIC_IRQ(irq)))
                                continue;
 
-                       if (pin == (mp_irqs[i].mp_srcbusirq & 3))
+                       if (pin == (mp_irqs[i].srcbusirq & 3))
                                return irq;
                        /*
                         * Use the first all-but-pin matching entry as a
@@ -1071,7 +1072,7 @@ static int EISA_ELCR(unsigned int irq)
  * EISA conforming in the MP table, that means its trigger type must
  * be read in from the ELCR */
 
-#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].srcbusirq))
 #define default_EISA_polarity(idx)     default_ISA_polarity(idx)
 
 /* PCI interrupts are always polarity one level triggered,
@@ -1088,13 +1089,13 @@ static int EISA_ELCR(unsigned int irq)
 
 static int MPBIOS_polarity(int idx)
 {
-       int bus = mp_irqs[idx].mp_srcbus;
+       int bus = mp_irqs[idx].srcbus;
        int polarity;
 
        /*
         * Determine IRQ line polarity (high active or low active):
         */
-       switch (mp_irqs[idx].mp_irqflag & 3)
+       switch (mp_irqs[idx].irqflag & 3)
        {
                case 0: /* conforms, ie. bus-type dependent polarity */
                        if (test_bit(bus, mp_bus_not_pci))
@@ -1130,13 +1131,13 @@ static int MPBIOS_polarity(int idx)
 
 static int MPBIOS_trigger(int idx)
 {
-       int bus = mp_irqs[idx].mp_srcbus;
+       int bus = mp_irqs[idx].srcbus;
        int trigger;
 
        /*
         * Determine IRQ trigger mode (edge or level sensitive):
         */
-       switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
+       switch ((mp_irqs[idx].irqflag>>2) & 3)
        {
                case 0: /* conforms, ie. bus-type dependent */
                        if (test_bit(bus, mp_bus_not_pci))
@@ -1214,16 +1215,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
        int irq, i;
-       int bus = mp_irqs[idx].mp_srcbus;
+       int bus = mp_irqs[idx].srcbus;
 
        /*
         * Debugging check, we are in big trouble if this message pops up!
         */
-       if (mp_irqs[idx].mp_dstirq != pin)
+       if (mp_irqs[idx].dstirq != pin)
                printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
        if (test_bit(bus, mp_bus_not_pci)) {
-               irq = mp_irqs[idx].mp_srcbusirq;
+               irq = mp_irqs[idx].srcbusirq;
        } else {
                /*
                 * PCI IRQs are mapped in order
@@ -1566,14 +1567,14 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
        apic_printk(APIC_VERBOSE,KERN_DEBUG
                    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
                    "IRQ %d Mode:%i Active:%i)\n",
-                   apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
+                   apic, mp_ioapics[apic].apicid, pin, cfg->vector,
                    irq, trigger, polarity);
 
 
-       if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
+       if (setup_ioapic_entry(mp_ioapics[apic].apicid, irq, &entry,
                               dest, trigger, polarity, cfg->vector)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-                      mp_ioapics[apic].mp_apicid, pin);
+                      mp_ioapics[apic].apicid, pin);
                __clear_irq_vector(irq, cfg);
                return;
        }
@@ -1604,12 +1605,10 @@ static void __init setup_IO_APIC_irqs(void)
                                        notcon = 1;
                                        apic_printk(APIC_VERBOSE,
                                                KERN_DEBUG " %d-%d",
-                                               mp_ioapics[apic].mp_apicid,
-                                               pin);
+                                               mp_ioapics[apic].apicid, pin);
                                } else
                                        apic_printk(APIC_VERBOSE, " %d-%d",
-                                               mp_ioapics[apic].mp_apicid,
-                                               pin);
+                                               mp_ioapics[apic].apicid, pin);
                                continue;
                        }
                        if (notcon) {
@@ -1699,7 +1698,7 @@ __apicdebuginit(void) print_IO_APIC(void)
        printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
        for (i = 0; i < nr_ioapics; i++)
                printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-                      mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
+                      mp_ioapics[i].apicid, nr_ioapic_registers[i]);
 
        /*
         * We are a bit conservative about what we expect.  We have to
@@ -1719,7 +1718,7 @@ __apicdebuginit(void) print_IO_APIC(void)
        spin_unlock_irqrestore(&ioapic_lock, flags);
 
        printk("\n");
-       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
+       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
        printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
        printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
        printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -2121,14 +2120,14 @@ static void __init setup_ioapic_ids_from_mpc(void)
                reg_00.raw = io_apic_read(apic, 0);
                spin_unlock_irqrestore(&ioapic_lock, flags);
 
-               old_id = mp_ioapics[apic].mp_apicid;
+               old_id = mp_ioapics[apic].apicid;
 
-               if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+               if (mp_ioapics[apic].apicid >= get_physical_broadcast()) {
                        printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-                               apic, mp_ioapics[apic].mp_apicid);
+                               apic, mp_ioapics[apic].apicid);
                        printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
                                reg_00.bits.ID);
-                       mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+                       mp_ioapics[apic].apicid = reg_00.bits.ID;
                }
 
                /*
@@ -2137,9 +2136,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
                 * 'stuck on smp_invalidate_needed IPI wait' messages.
                 */
                if (check_apicid_used(phys_id_present_map,
-                                       mp_ioapics[apic].mp_apicid)) {
+                                       mp_ioapics[apic].apicid)) {
                        printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-                               apic, mp_ioapics[apic].mp_apicid);
+                               apic, mp_ioapics[apic].apicid);
                        for (i = 0; i < get_physical_broadcast(); i++)
                                if (!physid_isset(i, phys_id_present_map))
                                        break;
@@ -2148,13 +2147,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
                        printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
                                i);
                        physid_set(i, phys_id_present_map);
-                       mp_ioapics[apic].mp_apicid = i;
+                       mp_ioapics[apic].apicid = i;
                } else {
                        physid_mask_t tmp;
-                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+                       tmp = apicid_to_cpu_present(mp_ioapics[apic].apicid);
                        apic_printk(APIC_VERBOSE, "Setting %d in the "
                                        "phys_id_present_map\n",
-                                       mp_ioapics[apic].mp_apicid);
+                                       mp_ioapics[apic].apicid);
                        physids_or(phys_id_present_map, phys_id_present_map, tmp);
                }
 
@@ -2163,11 +2162,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
                 * We need to adjust the IRQ routing table
                 * if the ID changed.
                 */
-               if (old_id != mp_ioapics[apic].mp_apicid)
+               if (old_id != mp_ioapics[apic].apicid)
                        for (i = 0; i < mp_irq_entries; i++)
-                               if (mp_irqs[i].mp_dstapic == old_id)
-                                       mp_irqs[i].mp_dstapic
-                                               = mp_ioapics[apic].mp_apicid;
+                               if (mp_irqs[i].dstapic == old_id)
+                                       mp_irqs[i].dstapic
+                                               = mp_ioapics[apic].apicid;
 
                /*
                 * Read the right value from the MPC table and
@@ -2175,9 +2174,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
                 */
                apic_printk(APIC_VERBOSE, KERN_INFO
                        "...changing IO-APIC physical APIC ID to %d ...",
-                       mp_ioapics[apic].mp_apicid);
+                       mp_ioapics[apic].apicid);
 
-               reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+               reg_00.bits.ID = mp_ioapics[apic].apicid;
                spin_lock_irqsave(&ioapic_lock, flags);
                io_apic_write(apic, 0, reg_00.raw);
                spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2188,7 +2187,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
                spin_lock_irqsave(&ioapic_lock, flags);
                reg_00.raw = io_apic_read(apic, 0);
                spin_unlock_irqrestore(&ioapic_lock, flags);
-               if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+               if (reg_00.bits.ID != mp_ioapics[apic].apicid)
                        printk("could not set ID!\n");
                else
                        apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2383,7 +2382,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
        if (cfg->move_in_progress)
                send_cleanup_vector(cfg);
 
-       cpumask_copy(&desc->affinity, mask);
+       cpumask_copy(desc->affinity, mask);
 }
 
 static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2404,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
        }
 
        /* everything is clear. we have right of way */
-       migrate_ioapic_irq_desc(desc, &desc->pending_mask);
+       migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
        ret = 0;
        desc->status &= ~IRQ_MOVE_PENDING;
-       cpumask_clear(&desc->pending_mask);
+       cpumask_clear(desc->pending_mask);
 
 unmask:
        unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2433,7 @@ static void ir_irq_migration(struct work_struct *work)
                                continue;
                        }
 
-                       desc->chip->set_affinity(irq, &desc->pending_mask);
+                       desc->chip->set_affinity(irq, desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
@@ -2448,7 +2447,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 {
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
-               cpumask_copy(&desc->pending_mask, mask);
+               cpumask_copy(desc->pending_mask, mask);
                migrate_irq_remapped_level_desc(desc);
                return;
        }
@@ -2516,7 +2515,7 @@ static void irq_complete_move(struct irq_desc **descp)
 
                /* domain has not changed, but affinity did */
                me = smp_processor_id();
-               if (cpu_isset(me, desc->affinity)) {
+               if (cpumask_test_cpu(me, desc->affinity)) {
                        *descp = desc = move_irq_desc(desc, me);
                        /* get the new one */
                        cfg = desc->chip_data;
@@ -3118,8 +3117,8 @@ static int ioapic_resume(struct sys_device *dev)
 
        spin_lock_irqsave(&ioapic_lock, flags);
        reg_00.raw = io_apic_read(dev->id, 0);
-       if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
-               reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
+       if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
+               reg_00.bits.ID = mp_ioapics[dev->id].apicid;
                io_apic_write(dev->id, 0, reg_00.raw);
        }
        spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3184,7 +3183,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
        irq = 0;
        spin_lock_irqsave(&vector_lock, flags);
-       for (new = irq_want; new < NR_IRQS; new++) {
+       for (new = irq_want; new < nr_irqs; new++) {
                if (platform_legacy_irq(new))
                        continue;
 
@@ -3259,6 +3258,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
        int err;
        unsigned dest;
 
+       if (disable_apic)
+               return -ENXIO;
+
        cfg = irq_cfg(irq);
        err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (err)
@@ -3727,6 +3729,9 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
        struct irq_cfg *cfg;
        int err;
 
+       if (disable_apic)
+               return -ENXIO;
+
        cfg = irq_cfg(irq);
        err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (!err) {
@@ -3761,7 +3766,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 }
 #endif /* CONFIG_HT_IRQ */
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
 /*
  * Re-target the irq to the specified CPU and enable the specified MMR located
  * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3851,6 +3856,22 @@ void __init probe_nr_irqs_gsi(void)
                nr_irqs_gsi = nr;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+       int nr;
+
+       nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
+               (NR_VECTORS + (8 * nr_cpu_ids)) :
+               (NR_VECTORS + (32 * nr_ioapics)));
+
+       if (nr < nr_irqs && nr > nr_irqs_gsi)
+               nr_irqs = nr;
+
+       return 0;
+}
+#endif
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -3985,8 +4006,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
                return -1;
 
        for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mp_irqtype == mp_INT &&
-                   mp_irqs[i].mp_srcbusirq == bus_irq)
+               if (mp_irqs[i].irqtype == mp_INT &&
+                   mp_irqs[i].srcbusirq == bus_irq)
                        break;
        if (i >= mp_irq_entries)
                return -1;
@@ -4040,7 +4061,7 @@ void __init setup_ioapic_dest(void)
                         */
                        if (desc->status &
                            (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                               mask = &desc->affinity;
+                               mask = desc->affinity;
                        else
                                mask = TARGET_CPUS;
 
@@ -4101,7 +4122,7 @@ void __init ioapic_init_mappings(void)
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
-                       ioapic_phys = mp_ioapics[i].mp_apicaddr;
+                       ioapic_phys = mp_ioapics[i].apicaddr;
 #ifdef CONFIG_X86_32
                        if (!ioapic_phys) {
                                printk(KERN_ERR
index 3973e2d..8b30d0c 100644 (file)
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
 #endif
 }
 
-#ifdef CONFIG_X86_32
-# define irq_stats(x)          (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x)          cpu_pda(x)
-#endif
+#define irq_stats(x)           (&per_cpu(irq_stat, x))
 /*
  * /proc/interrupts printing:
  */
index 74b9ff7..e0f29be 100644 (file)
@@ -248,7 +248,7 @@ void fixup_irqs(void)
                if (irq == 2)
                        continue;
 
-               affinity = &desc->affinity;
+               affinity = desc->affinity;
                if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        printk("Breaking affinity for irq %i\n", irq);
                        affinity = cpu_all_mask;
index 63c88e6..018963a 100644 (file)
 #include <linux/smp.h>
 #include <asm/io_apic.h>
 #include <asm/idle.h>
+#include <asm/apic.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
 
 /*
  * Probabilistic stack overflow check:
@@ -100,7 +107,7 @@ void fixup_irqs(void)
                /* interrupt's are disabled at this point */
                spin_lock(&desc->lock);
 
-               affinity = &desc->affinity;
+               affinity = desc->affinity;
                if (!irq_has_action(irq) ||
                    cpumask_equal(affinity, cpu_online_mask)) {
                        spin_unlock(&desc->lock);
index 10a09c2..22608eb 100644 (file)
@@ -140,8 +140,15 @@ void __init native_init_IRQ(void)
         */
        alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-       /* IPI for invalidation */
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+       /* IPIs for invalidation */
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
        /* IPI for generic function call */
        alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
index b7f4c92..5e9f4fc 100644 (file)
@@ -87,9 +87,9 @@
 #include <linux/cpu.h>
 #include <linux/firmware.h>
 #include <linux/platform_device.h>
+#include <linux/uaccess.h>
 
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/microcode.h>
 
@@ -196,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
        return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
 }
 
-static inline int 
+static inline int
 update_match_revision(struct microcode_header_intel *mc_header,        int rev)
 {
        return (mc_header->rev <= rev) ? 0 : 1;
@@ -442,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
                return ret;
        }
 
-       ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
-                       &get_ucode_fw);
+       ret = generic_load_microcode(cpu, (void *)firmware->data,
+                                    firmware->size, &get_ucode_fw);
 
        release_firmware(firmware);
 
@@ -460,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
        /* We should bind the task to the CPU */
        BUG_ON(cpu != raw_smp_processor_id());
 
-       return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
+       return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
 static void microcode_fini_cpu(int cpu)
index 3db0a54..0edd819 100644 (file)
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
 {
        vfree(module_region);
        /* FIXME: If module_region == mod->init_region, trim exception
-           table entries. */
+          table entries. */
 }
 
 /* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
                *para = NULL;
        char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
-       for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
+       for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
                if (!strcmp(".text", secstrings + s->sh_name))
                        text = s;
                if (!strcmp(".altinstructions", secstrings + s->sh_name))
                        alt = s;
                if (!strcmp(".smp_locks", secstrings + s->sh_name))
-                       locks= s;
+                       locks = s;
                if (!strcmp(".parainstructions", secstrings + s->sh_name))
                        para = s;
        }
index 6ba8783..c23880b 100644 (file)
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
-#define DEBUGP(fmt...) 
+#define DEBUGP(fmt...)
 
 #ifndef CONFIG_UML
 void module_free(struct module *mod, void *module_region)
 {
        vfree(module_region);
        /* FIXME: If module_region == mod->init_region, trim exception
-           table entries. */
+          table entries. */
 }
 
 void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
        Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
        Elf64_Sym *sym;
        void *loc;
-       u64 val; 
+       u64 val;
 
        DEBUGP("Applying relocate section %u to %u\n", relsec,
               sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
                        + ELF64_R_SYM(rel[i].r_info);
 
-               DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
-                      (int)ELF64_R_TYPE(rel[i].r_info), 
-                      sym->st_value, rel[i].r_addend, (u64)loc);
+               DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+                       (int)ELF64_R_TYPE(rel[i].r_info),
+                       sym->st_value, rel[i].r_addend, (u64)loc);
 
-               val = sym->st_value + rel[i].r_addend; 
+               val = sym->st_value + rel[i].r_addend;
 
                switch (ELF64_R_TYPE(rel[i].r_info)) {
                case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                        if ((s64)val != *(s32 *)loc)
                                goto overflow;
                        break;
-               case R_X86_64_PC32: 
+               case R_X86_64_PC32:
                        val -= (u64)loc;
                        *(u32 *)loc = val;
 #if 0
                        if ((s64)val != *(s32 *)loc)
-                               goto overflow; 
+                               goto overflow;
 #endif
                        break;
                default:
-                       printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
+                       printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
                               me->name, ELF64_R_TYPE(rel[i].r_info));
                        return -ENOEXEC;
                }
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
        return 0;
 
 overflow:
-       printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 
+       printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
               (int)ELF64_R_TYPE(rel[i].r_info), val);
        printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
               me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
                   unsigned int relsec,
                   struct module *me)
 {
-       printk("non add relocation not supported\n");
+       printk(KERN_ERR "non add relocation not supported\n");
        return -ENOSYS;
-} 
+}
 
 int module_finalize(const Elf_Ehdr *hdr,
-                    const Elf_Shdr *sechdrs,
-                    struct module *me)
+                   const Elf_Shdr *sechdrs,
+                   struct module *me)
 {
        const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
                *para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
                if (!strcmp(".altinstructions", secstrings + s->sh_name))
                        alt = s;
                if (!strcmp(".smp_locks", secstrings + s->sh_name))
-                       locks= s;
+                       locks = s;
                if (!strcmp(".parainstructions", secstrings + s->sh_name))
                        para = s;
        }
index a649a4c..fa6bb26 100644 (file)
@@ -144,11 +144,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
        if (bad_ioapic(m->apicaddr))
                return;
 
-       mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr;
-       mp_ioapics[nr_ioapics].mp_apicid = m->apicid;
-       mp_ioapics[nr_ioapics].mp_type = m->type;
-       mp_ioapics[nr_ioapics].mp_apicver = m->apicver;
-       mp_ioapics[nr_ioapics].mp_flags = m->flags;
+       mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
+       mp_ioapics[nr_ioapics].apicid = m->apicid;
+       mp_ioapics[nr_ioapics].type = m->type;
+       mp_ioapics[nr_ioapics].apicver = m->apicver;
+       mp_ioapics[nr_ioapics].flags = m->flags;
        nr_ioapics++;
 }
 
@@ -160,55 +160,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)
                m->srcbusirq, m->dstapic, m->dstirq);
 }
 
-static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
 {
        apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
                " IRQ %02x, APIC ID %x, APIC INT %02x\n",
-               mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
-               (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
-               mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+               mp_irq->irqtype, mp_irq->irqflag & 3,
+               (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+               mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
 }
 
 static void __init assign_to_mp_irq(struct mpc_intsrc *m,
-                                   struct mp_config_intsrc *mp_irq)
+                                   struct mpc_intsrc *mp_irq)
 {
-       mp_irq->mp_dstapic = m->dstapic;
-       mp_irq->mp_type = m->type;
-       mp_irq->mp_irqtype = m->irqtype;
-       mp_irq->mp_irqflag = m->irqflag;
-       mp_irq->mp_srcbus = m->srcbus;
-       mp_irq->mp_srcbusirq = m->srcbusirq;
-       mp_irq->mp_dstirq = m->dstirq;
+       mp_irq->dstapic = m->dstapic;
+       mp_irq->type = m->type;
+       mp_irq->irqtype = m->irqtype;
+       mp_irq->irqflag = m->irqflag;
+       mp_irq->srcbus = m->srcbus;
+       mp_irq->srcbusirq = m->srcbusirq;
+       mp_irq->dstirq = m->dstirq;
 }
 
-static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
                                        struct mpc_intsrc *m)
 {
-       m->dstapic = mp_irq->mp_dstapic;
-       m->type = mp_irq->mp_type;
-       m->irqtype = mp_irq->mp_irqtype;
-       m->irqflag = mp_irq->mp_irqflag;
-       m->srcbus = mp_irq->mp_srcbus;
-       m->srcbusirq = mp_irq->mp_srcbusirq;
-       m->dstirq = mp_irq->mp_dstirq;
+       m->dstapic = mp_irq->dstapic;
+       m->type = mp_irq->type;
+       m->irqtype = mp_irq->irqtype;
+       m->irqflag = mp_irq->irqflag;
+       m->srcbus = mp_irq->srcbus;
+       m->srcbusirq = mp_irq->srcbusirq;
+       m->dstirq = mp_irq->dstirq;
 }
 
-static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
                                        struct mpc_intsrc *m)
 {
-       if (mp_irq->mp_dstapic != m->dstapic)
+       if (mp_irq->dstapic != m->dstapic)
                return 1;
-       if (mp_irq->mp_type != m->type)
+       if (mp_irq->type != m->type)
                return 2;
-       if (mp_irq->mp_irqtype != m->irqtype)
+       if (mp_irq->irqtype != m->irqtype)
                return 3;
-       if (mp_irq->mp_irqflag != m->irqflag)
+       if (mp_irq->irqflag != m->irqflag)
                return 4;
-       if (mp_irq->mp_srcbus != m->srcbus)
+       if (mp_irq->srcbus != m->srcbus)
                return 5;
-       if (mp_irq->mp_srcbusirq != m->srcbusirq)
+       if (mp_irq->srcbusirq != m->srcbusirq)
                return 6;
-       if (mp_irq->mp_dstirq != m->dstirq)
+       if (mp_irq->dstirq != m->dstirq)
                return 7;
 
        return 0;
@@ -417,7 +417,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
        intsrc.type = MP_INTSRC;
        intsrc.irqflag = 0;     /* conforming */
        intsrc.srcbus = 0;
-       intsrc.dstapic = mp_ioapics[0].mp_apicid;
+       intsrc.dstapic = mp_ioapics[0].apicid;
 
        intsrc.irqtype = mp_INT;
 
@@ -570,14 +570,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
        }
 }
 
-static struct intel_mp_floating *mpf_found;
+static struct mpf_intel *mpf_found;
 
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
 static void __init __get_smp_config(unsigned int early)
 {
-       struct intel_mp_floating *mpf = mpf_found;
+       struct mpf_intel *mpf = mpf_found;
 
        if (!mpf)
                return;
@@ -598,9 +598,9 @@ static void __init __get_smp_config(unsigned int early)
        }
 
        printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
-              mpf->mpf_specification);
+              mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
-       if (mpf->mpf_feature2 & (1 << 7)) {
+       if (mpf->feature2 & (1 << 7)) {
                printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
                pic_mode = 1;
        } else {
@@ -611,7 +611,7 @@ static void __init __get_smp_config(unsigned int early)
        /*
         * Now see if we need to read further.
         */
-       if (mpf->mpf_feature1 != 0) {
+       if (mpf->feature1 != 0) {
                if (early) {
                        /*
                         * local APIC has default address
@@ -621,16 +621,16 @@ static void __init __get_smp_config(unsigned int early)
                }
 
                printk(KERN_INFO "Default MP configuration #%d\n",
-                      mpf->mpf_feature1);
-               construct_default_ISA_mptable(mpf->mpf_feature1);
+                      mpf->feature1);
+               construct_default_ISA_mptable(mpf->feature1);
 
-       } else if (mpf->mpf_physptr) {
+       } else if (mpf->physptr) {
 
                /*
                 * Read the physical hardware table.  Anything here will
                 * override the defaults.
                 */
-               if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+               if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
 #ifdef CONFIG_X86_LOCAL_APIC
                        smp_found_config = 0;
 #endif
@@ -688,19 +688,19 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
                                  unsigned reserve)
 {
        unsigned int *bp = phys_to_virt(base);
-       struct intel_mp_floating *mpf;
+       struct mpf_intel *mpf;
 
        apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
                        bp, length);
        BUILD_BUG_ON(sizeof(*mpf) != 16);
 
        while (length > 0) {
-               mpf = (struct intel_mp_floating *)bp;
+               mpf = (struct mpf_intel *)bp;
                if ((*bp == SMP_MAGIC_IDENT) &&
-                   (mpf->mpf_length == 1) &&
+                   (mpf->length == 1) &&
                    !mpf_checksum((unsigned char *)bp, 16) &&
-                   ((mpf->mpf_specification == 1)
-                    || (mpf->mpf_specification == 4))) {
+                   ((mpf->specification == 1)
+                    || (mpf->specification == 4))) {
 #ifdef CONFIG_X86_LOCAL_APIC
                        smp_found_config = 1;
 #endif
@@ -713,7 +713,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
                                return 1;
                        reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
                                        BOOTMEM_DEFAULT);
-                       if (mpf->mpf_physptr) {
+                       if (mpf->physptr) {
                                unsigned long size = PAGE_SIZE;
 #ifdef CONFIG_X86_32
                                /*
@@ -722,14 +722,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
                                 * the bottom is mapped now.
                                 * PC-9800's MPC table places on the very last
                                 * of physical memory; so that simply reserving
-                                * PAGE_SIZE from mpg->mpf_physptr yields BUG()
+                                * PAGE_SIZE from mpf->physptr yields BUG()
                                 * in reserve_bootmem.
                                 */
                                unsigned long end = max_low_pfn * PAGE_SIZE;
-                               if (mpf->mpf_physptr + size > end)
-                                       size = end - mpf->mpf_physptr;
+                               if (mpf->physptr + size > end)
+                                       size = end - mpf->physptr;
 #endif
-                               reserve_bootmem_generic(mpf->mpf_physptr, size,
+                               reserve_bootmem_generic(mpf->physptr, size,
                                                BOOTMEM_DEFAULT);
                        }
 
@@ -809,15 +809,15 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
        /* not legacy */
 
        for (i = 0; i < mp_irq_entries; i++) {
-               if (mp_irqs[i].mp_irqtype != mp_INT)
+               if (mp_irqs[i].irqtype != mp_INT)
                        continue;
 
-               if (mp_irqs[i].mp_irqflag != 0x0f)
+               if (mp_irqs[i].irqflag != 0x0f)
                        continue;
 
-               if (mp_irqs[i].mp_srcbus != m->srcbus)
+               if (mp_irqs[i].srcbus != m->srcbus)
                        continue;
-               if (mp_irqs[i].mp_srcbusirq != m->srcbusirq)
+               if (mp_irqs[i].srcbusirq != m->srcbusirq)
                        continue;
                if (irq_used[i]) {
                        /* already claimed */
@@ -922,10 +922,10 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
                if (irq_used[i])
                        continue;
 
-               if (mp_irqs[i].mp_irqtype != mp_INT)
+               if (mp_irqs[i].irqtype != mp_INT)
                        continue;
 
-               if (mp_irqs[i].mp_irqflag != 0x0f)
+               if (mp_irqs[i].irqflag != 0x0f)
                        continue;
 
                if (nr_m_spare > 0) {
@@ -1001,7 +1001,7 @@ static int __init update_mp_table(void)
 {
        char str[16];
        char oem[10];
-       struct intel_mp_floating *mpf;
+       struct mpf_intel *mpf;
        struct mpc_table *mpc, *mpc_new;
 
        if (!enable_update_mptable)
@@ -1014,19 +1014,19 @@ static int __init update_mp_table(void)
        /*
         * Now see if we need to go further.
         */
-       if (mpf->mpf_feature1 != 0)
+       if (mpf->feature1 != 0)
                return 0;
 
-       if (!mpf->mpf_physptr)
+       if (!mpf->physptr)
                return 0;
 
-       mpc = phys_to_virt(mpf->mpf_physptr);
+       mpc = phys_to_virt(mpf->physptr);
 
        if (!smp_check_mpc(mpc, oem, str))
                return 0;
 
        printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
-       printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
+       printk(KERN_INFO "physptr: %x\n", mpf->physptr);
 
        if (mpc_new_phys && mpc->length > mpc_new_length) {
                mpc_new_phys = 0;
@@ -1047,23 +1047,23 @@ static int __init update_mp_table(void)
                }
                printk(KERN_INFO "use in-positon replacing\n");
        } else {
-               mpf->mpf_physptr = mpc_new_phys;
+               mpf->physptr = mpc_new_phys;
                mpc_new = phys_to_virt(mpc_new_phys);
                memcpy(mpc_new, mpc, mpc->length);
                mpc = mpc_new;
                /* check if we can modify that */
-               if (mpc_new_phys - mpf->mpf_physptr) {
-                       struct intel_mp_floating *mpf_new;
+               if (mpc_new_phys - mpf->physptr) {
+                       struct mpf_intel *mpf_new;
                        /* steal 16 bytes from [0, 1k) */
                        printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
                        mpf_new = phys_to_virt(0x400 - 16);
                        memcpy(mpf_new, mpf, 16);
                        mpf = mpf_new;
-                       mpf->mpf_physptr = mpc_new_phys;
+                       mpf->physptr = mpc_new_phys;
                }
-               mpf->mpf_checksum = 0;
-               mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
-               printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
+               mpf->checksum = 0;
+               mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+               printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
        }
 
        /*
index 7262666..3cf3413 100644 (file)
 #include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/uaccess.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/system.h>
 
 static struct class *msr_class;
index 7228979..23b6d9e 100644 (file)
@@ -61,11 +61,7 @@ static int endflag __initdata;
 
 static inline unsigned int get_nmi_count(int cpu)
 {
-#ifdef CONFIG_X86_64
-       return cpu_pda(cpu)->__nmi_count;
-#else
-       return nmi_count(cpu);
-#endif
+       return per_cpu(irq_stat, cpu).__nmi_count;
 }
 
 static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
  */
 static inline unsigned int get_timer_irqs(int cpu)
 {
-#ifdef CONFIG_X86_64
-       return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-#else
        return per_cpu(irq_stat, cpu).apic_timer_irqs +
                per_cpu(irq_stat, cpu).irq0_irqs;
-#endif
 }
 
 #ifdef CONFIG_SMP
index e4c8fb6..cea11c8 100644 (file)
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
 {
 }
 
+/* identity function, which can be inlined */
+u32 _paravirt_ident_32(u32 x)
+{
+       return x;
+}
+
+u64 _paravirt_ident_64(u64 x)
+{
+       return x;
+}
+
 static void __init default_banner(void)
 {
        printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
        if (opfunc == NULL)
                /* If there's no function, patch it with a ud2a (BUG) */
                ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
-       else if (opfunc == paravirt_nop)
+       else if (opfunc == _paravirt_nop)
                /* If the operation is a nop, then nop the callsite */
                ret = paravirt_patch_nop();
+
+       /* identity functions just return their single argument */
+       else if (opfunc == _paravirt_ident_32)
+               ret = paravirt_patch_ident_32(insnbuf, len);
+       else if (opfunc == _paravirt_ident_64)
+               ret = paravirt_patch_ident_64(insnbuf, len);
+
        else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
                 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
                 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
 
 struct pv_irq_ops pv_irq_ops = {
        .init_IRQ = native_init_IRQ,
-       .save_fl = native_save_fl,
-       .restore_fl = native_restore_fl,
-       .irq_disable = native_irq_disable,
-       .irq_enable = native_irq_enable,
+       .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+       .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+       .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+       .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
        .safe_halt = native_safe_halt,
        .halt = native_halt,
 #ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
 #endif
 };
 
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT      __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT      __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
 struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
        .pagetable_setup_start = native_pagetable_setup_start,
@@ -424,22 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
        .pmd_clear = native_pmd_clear,
 #endif
        .set_pud = native_set_pud,
-       .pmd_val = native_pmd_val,
-       .make_pmd = native_make_pmd,
+
+       .pmd_val = PTE_IDENT,
+       .make_pmd = PTE_IDENT,
 
 #if PAGETABLE_LEVELS == 4
-       .pud_val = native_pud_val,
-       .make_pud = native_make_pud,
+       .pud_val = PTE_IDENT,
+       .make_pud = PTE_IDENT,
+
        .set_pgd = native_set_pgd,
 #endif
 #endif /* PAGETABLE_LEVELS >= 3 */
 
-       .pte_val = native_pte_val,
-       .pte_flags = native_pte_flags,
-       .pgd_val = native_pgd_val,
+       .pte_val = PTE_IDENT,
+       .pgd_val = PTE_IDENT,
 
-       .make_pte = native_make_pte,
-       .make_pgd = native_make_pgd,
+       .make_pte = PTE_IDENT,
+       .make_pgd = PTE_IDENT,
 
        .dup_mmap = paravirt_nop,
        .exit_mmap = paravirt_nop,
index 9fe644f..d9f32e6 100644 (file)
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+       /* arg in %eax, return in %eax */
+       return 0;
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+       /* arg in %edx:%eax, return in %edx:%eax */
+       return 0;
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                      unsigned long addr, unsigned len)
 {
index 061d01d..3f08f34 100644 (file)
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
 DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+       return paravirt_patch_insns(insnbuf, len,
+                                   start__mov32, end__mov32);
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+       return paravirt_patch_insns(insnbuf, len,
+                                   start__mov64, end__mov64);
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                      unsigned long addr, unsigned len)
 {
index a546f55..1a1ae8e 100644 (file)
@@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
 /*
  * Return saved PC of a blocked thread.
  */
@@ -111,7 +108,6 @@ void cpu_idle(void)
                                play_dead();
 
                        local_irq_disable();
-                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
@@ -591,7 +587,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        if (prev->gs | next->gs)
                loadsegment(gs, next->gs);
 
-       x86_write_percpu(current_task, next_p);
+       percpu_write(current_task, next_p);
 
        return prev_p;
 }
index 416fb92..c422eeb 100644 (file)
@@ -16,6 +16,7 @@
 
 #include <stdarg.h>
 
+#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -46,7 +47,6 @@
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/mmu_context.h>
-#include <asm/pda.h>
 #include <asm/prctl.h>
 #include <asm/desc.h>
 #include <asm/proto.h>
 
 asmlinkage extern void ret_from_fork(void);
 
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(unsigned long, old_rsp);
+static DEFINE_PER_CPU(unsigned char, is_idle);
+
 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
 
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -75,13 +81,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 
 void enter_idle(void)
 {
-       write_pda(isidle, 1);
+       percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
 
 static void __exit_idle(void)
 {
-       if (test_and_clear_bit_pda(0, isidle) == 0)
+       if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
@@ -111,6 +117,17 @@ static inline void play_dead(void)
 void cpu_idle(void)
 {
        current_thread_info()->status |= TS_POLLING;
+
+       /*
+        * If we're the non-boot CPU, nothing has set up the PDA stack
+        * canary for us - and if we are the boot CPU we have
+        * a 0 stack canary. This is a good place for updating
+        * it, as we won't ever return from this function (so the
+        * invalid canaries already on the stack won't ever
+        * trigger):
+        */
+       boot_init_stack_canary();
+
        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
@@ -392,7 +409,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
-       write_pda(oldrsp, new_sp);
+       percpu_write(old_rsp, new_sp);
        regs->cs                = __USER_CS;
        regs->ss                = __USER_DS;
        regs->flags             = 0x200;
@@ -613,21 +630,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /*
         * Switch the PDA and FPU contexts.
         */
-       prev->usersp = read_pda(oldrsp);
-       write_pda(oldrsp, next->usersp);
-       write_pda(pcurrent, next_p);
+       prev->usersp = percpu_read(old_rsp);
+       percpu_write(old_rsp, next->usersp);
+       percpu_write(current_task, next_p);
 
-       write_pda(kernelstack,
+       percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
-                 THREAD_SIZE - PDA_STACKOFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
-       write_pda(stack_canary, next_p->stack_canary);
-       /*
-        * Build time only check to make sure the stack_canary is at
-        * offset 40 in the pda; this is a gcc ABI requirement
-        */
-       BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
+                 THREAD_SIZE - KERNEL_STACK_OFFSET);
 
        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
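
In short, the x8664_pda fields this file used to touch become ordinary per-cpu variables; the correspondence, collected from the hunks above:

/* read_pda(oldrsp)           ->  percpu_read(old_rsp)           */
/* write_pda(pcurrent, p)     ->  percpu_write(current_task, p)  */
/* write_pda(isidle, 1)       ->  percpu_write(is_idle, 1)       */
/* write_pda(kernelstack, v)  ->  percpu_write(kernel_stack, v)  */

/* e.g. the per-task stack top that entry_64.S picks up at syscall entry: */
percpu_write(kernel_stack,
	     (unsigned long)task_stack_page(next_p) +
	     THREAD_SIZE - KERNEL_STACK_OFFSET);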
index 2b46eb4..f8536fe 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/reboot.h>
 #include <asm/pci_x86.h>
 #include <asm/virtext.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_32
 # include <linux/dmi.h>
index ae0d804..f41c448 100644 (file)
@@ -89,7 +89,7 @@
 
 #include <asm/system.h>
 #include <asm/vsyscall.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
 #include <asm/desc.h>
 #include <asm/dma.h>
 #include <asm/iommu.h>
index 0116107..ef91747 100644 (file)
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
+#include <asm/proto.h>
+#include <asm/cpumask.h>
+#include <asm/cpu.h>
 
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
-unsigned int max_physical_apicid;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-#endif
-
-/* map cpu index to physical APIC ID */
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define        X86_64_NUMA     1
-
-/* map cpu index to node index */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
-/* which logical CPUs are on which nodes */
-cpumask_t *node_to_cpumask_map;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-/* setup node_to_cpumask_map */
-static void __init setup_node_to_cpumask_map(void);
-
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
 #else
-static inline void setup_node_to_cpumask_map(void) { }
+# define DBG(x...)
 #endif
 
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas.  These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
- */
-static void __init setup_per_cpu_maps(void)
-{
-       int cpu;
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
 
-       for_each_possible_cpu(cpu) {
-               per_cpu(x86_cpu_to_apicid, cpu) =
-                               early_per_cpu_map(x86_cpu_to_apicid, cpu);
-               per_cpu(x86_bios_cpu_apicid, cpu) =
-                               early_per_cpu_map(x86_bios_cpu_apicid, cpu);
-#ifdef X86_64_NUMA
-               per_cpu(x86_cpu_to_node_map, cpu) =
-                               early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#ifdef CONFIG_X86_64
+#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
+#else
+#define BOOT_PERCPU_OFFSET 0
 #endif
-       }
 
-       /* indicate the early static arrays will soon be gone */
-       early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
-       early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
-#ifdef X86_64_NUMA
-       early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
-#endif
-}
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+       [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
+};
 EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
-       char *pda;
-       struct x8664_pda **new_cpu_pda;
-       unsigned long size;
-       int cpu;
-
-       size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-       /* allocate cpu_pda array and pointer table */
-       {
-               unsigned long tsize = nr_cpu_ids * sizeof(void *);
-               unsigned long asize = size * (nr_cpu_ids - 1);
 
-               tsize = roundup(tsize, cache_line_size());
-               new_cpu_pda = alloc_bootmem(tsize + asize);
-               pda = (char *)new_cpu_pda + tsize;
-       }
-
-       /* initialize pointer table to static pda's */
-       for_each_possible_cpu(cpu) {
-               if (cpu == 0) {
-                       /* leave boot cpu pda in place */
-                       new_cpu_pda[0] = cpu_pda(0);
-                       continue;
-               }
-               new_cpu_pda[cpu] = (struct x8664_pda *)pda;
-               new_cpu_pda[cpu]->in_bootmem = 1;
-               pda += size;
-       }
-
-       /* point to new pointer table */
-       _cpu_pda = new_cpu_pda;
-}
-
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_64
-
-/* correctly size the local cpu masks */
-static void __init setup_cpu_local_masks(void)
+static inline void setup_percpu_segment(int cpu)
 {
-       alloc_bootmem_cpumask_var(&cpu_initialized_mask);
-       alloc_bootmem_cpumask_var(&cpu_callin_mask);
-       alloc_bootmem_cpumask_var(&cpu_callout_mask);
-       alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
-}
-
-#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_32
+       struct desc_struct gdt;
 
-static inline void setup_cpu_local_masks(void)
-{
+       pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
+                       0x2 | DESCTYPE_S, 0x8);
+       gdt.s = 1;
+       write_gdt_entry(get_cpu_gdt_table(cpu),
+                       GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+#endif
 }
 
-#endif /* CONFIG_X86_32 */
-
 /*
  * Great future plan:
  * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
@@ -159,18 +60,12 @@ static inline void setup_cpu_local_masks(void)
  */
 void __init setup_per_cpu_areas(void)
 {
-       ssize_t size, old_size;
+       ssize_t size;
        char *ptr;
        int cpu;
-       unsigned long align = 1;
-
-       /* Setup cpu_pda map */
-       setup_cpu_pda_map();
 
        /* Copy section for each CPU (we discard the original) */
-       old_size = PERCPU_ENOUGH_ROOM;
-       align = max_t(unsigned long, PAGE_SIZE, align);
-       size = roundup(old_size, align);
+       size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
 
        pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
@@ -179,30 +74,67 @@ void __init setup_per_cpu_areas(void)
 
        for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-               ptr = __alloc_bootmem(size, align,
-                                __pa(MAX_DMA_ADDRESS));
+               ptr = alloc_bootmem_pages(size);
 #else
                int node = early_cpu_to_node(cpu);
                if (!node_online(node) || !NODE_DATA(node)) {
-                       ptr = __alloc_bootmem(size, align,
-                                        __pa(MAX_DMA_ADDRESS));
+                       ptr = alloc_bootmem_pages(size);
                        pr_info("cpu %d has no node %d or node-local memory\n",
                                cpu, node);
                        pr_debug("per cpu data for cpu%d at %016lx\n",
                                 cpu, __pa(ptr));
                } else {
-                       ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
-                                                       __pa(MAX_DMA_ADDRESS));
+                       ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
                        pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
                                cpu, node, __pa(ptr));
                }
 #endif
+
+               memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
                per_cpu_offset(cpu) = ptr - __per_cpu_start;
-               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+               per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+               per_cpu(cpu_number, cpu) = cpu;
+               setup_percpu_segment(cpu);
+               /*
+                * Copy data used in early init routines from the
+                * initial arrays to the per cpu data areas.  These
+                * arrays then become expendable and the *_early_ptr's
+                * are zeroed indicating that the static arrays are
+                * gone.
+                */
+#ifdef CONFIG_X86_LOCAL_APIC
+               per_cpu(x86_cpu_to_apicid, cpu) =
+                       early_per_cpu_map(x86_cpu_to_apicid, cpu);
+               per_cpu(x86_bios_cpu_apicid, cpu) =
+                       early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#endif
+#ifdef CONFIG_X86_64
+               per_cpu(irq_stack_ptr, cpu) =
+                       per_cpu(irq_stack_union.irq_stack, cpu) +
+                       IRQ_STACK_SIZE - 64;
+#ifdef CONFIG_NUMA
+               per_cpu(x86_cpu_to_node_map, cpu) =
+                       early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#endif
+#endif
+               /*
+                * Up to this point, the boot CPU has been using .data.init
+                * area.  Reload any changed state for the boot CPU.
+                */
+               if (cpu == boot_cpu_id)
+                       switch_to_new_gdt(cpu);
+
+               DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
        }
 
-       /* Setup percpu data maps */
-       setup_per_cpu_maps();
+       /* indicate the early static arrays will soon be gone */
+#ifdef CONFIG_X86_LOCAL_APIC
+       early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+       early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+       early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
 
        /* Setup node to cpumask map */
        setup_node_to_cpumask_map();
@@ -210,199 +142,3 @@ void __init setup_per_cpu_areas(void)
        /* Setup cpu initialized, callin, callout masks */
        setup_cpu_local_masks();
 }
-
-#endif
-
-#ifdef X86_64_NUMA
-
-/*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
- *
- * Note: node_to_cpumask() is not valid until after this is done.
- */
-static void __init setup_node_to_cpumask_map(void)
-{
-       unsigned int node, num = 0;
-       cpumask_t *map;
-
-       /* setup nr_node_ids if not done yet */
-       if (nr_node_ids == MAX_NUMNODES) {
-               for_each_node_mask(node, node_possible_map)
-                       num = node;
-               nr_node_ids = num + 1;
-       }
-
-       /* allocate the map */
-       map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
-
-       pr_debug("Node to cpumask map at %p for %d nodes\n",
-                map, nr_node_ids);
-
-       /* node_to_cpumask() will now work */
-       node_to_cpumask_map = map;
-}
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
-       int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
-       if (cpu_pda(cpu) && node != NUMA_NO_NODE)
-               cpu_pda(cpu)->nodenumber = node;
-
-       if (cpu_to_node_map)
-               cpu_to_node_map[cpu] = node;
-
-       else if (per_cpu_offset(cpu))
-               per_cpu(x86_cpu_to_node_map, cpu) = node;
-
-       else
-               pr_debug("Setting node for non-present cpu %d\n", cpu);
-}
-
-void __cpuinit numa_clear_node(int cpu)
-{
-       numa_set_node(cpu, NUMA_NO_NODE);
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-
-void __cpuinit numa_add_cpu(int cpu)
-{
-       cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-       cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
-}
-
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-       int node = cpu_to_node(cpu);
-       cpumask_t *mask;
-       char buf[64];
-
-       if (node_to_cpumask_map == NULL) {
-               printk(KERN_ERR "node_to_cpumask_map NULL\n");
-               dump_stack();
-               return;
-       }
-
-       mask = &node_to_cpumask_map[node];
-       if (enable)
-               cpu_set(cpu, *mask);
-       else
-               cpu_clear(cpu, *mask);
-
-       cpulist_scnprintf(buf, sizeof(buf), mask);
-       printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-               enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
-}
-
-void __cpuinit numa_add_cpu(int cpu)
-{
-       numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-       numa_set_cpumask(cpu, 0);
-}
-
-int cpu_to_node(int cpu)
-{
-       if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
-               printk(KERN_WARNING
-                       "cpu_to_node(%d): usage too early!\n", cpu);
-               dump_stack();
-               return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-       }
-       return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(cpu_to_node);
-
-/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
- */
-int early_cpu_to_node(int cpu)
-{
-       if (early_per_cpu_ptr(x86_cpu_to_node_map))
-               return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
-       if (!per_cpu_offset(cpu)) {
-               printk(KERN_WARNING
-                       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-               dump_stack();
-               return NUMA_NO_NODE;
-       }
-       return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
-
-/* empty cpumask */
-static const cpumask_t cpu_mask_none;
-
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-const cpumask_t *cpumask_of_node(int node)
-{
-       if (node_to_cpumask_map == NULL) {
-               printk(KERN_WARNING
-                       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
-                       node);
-               dump_stack();
-               return (const cpumask_t *)&cpu_online_map;
-       }
-       if (node >= nr_node_ids) {
-               printk(KERN_WARNING
-                       "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
-                       node, nr_node_ids);
-               dump_stack();
-               return &cpu_mask_none;
-       }
-       return &node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(cpumask_of_node);
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- *
- * Side note: this function creates the returned cpumask on the stack
- * so with a high NR_CPUS count, excessive stack space is used.  The
- * node_to_cpumask_ptr function should be used whenever possible.
- */
-cpumask_t node_to_cpumask(int node)
-{
-       if (node_to_cpumask_map == NULL) {
-               printk(KERN_WARNING
-                       "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
-               dump_stack();
-               return cpu_online_map;
-       }
-       if (node >= nr_node_ids) {
-               printk(KERN_WARNING
-                       "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
-                       node, nr_node_ids);
-               dump_stack();
-               return cpu_mask_none;
-       }
-       return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(node_to_cpumask);
-
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-#endif /* X86_64_NUMA */
-
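
Stepping back from the rewrite above: each possible CPU now gets a full copy of the __per_cpu_load image, and the bookkeeping (this_cpu_off, cpu_number) is seeded inside that copy before the per-cpu GDT segment is installed. A sketch of the resulting addressing model (the macro is illustrative only; the real accessors live in <asm/percpu.h>):

/* cpu N's instance of a per-cpu variable = its link-time address + per_cpu_offset(N) */
#define per_cpu_instance(var, cpu) \
	(*(typeof(var) *)((char *)&(var) + per_cpu_offset(cpu)))

/* the same offset is cached per cpu, so %fs/%gs-relative access needs no table lookup */
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu)   = cpu;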
index bb1a3b1..612d3c7 100644 (file)
@@ -53,7 +53,6 @@
 #include <asm/nmi.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
-#include <asm/smp.h>
 #include <asm/trampoline.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
@@ -63,6 +62,7 @@
 #include <asm/vmi.h>
 #include <asm/genapic.h>
 #include <asm/setup.h>
+#include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
 
 #include <mach_apic.h>
@@ -745,52 +745,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
        complete(&c_idle->done);
 }
 
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
-       if (!after_bootmem)
-               free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
-       struct x8664_pda *oldpda, *newpda;
-       unsigned long size = sizeof(struct x8664_pda);
-       int node = cpu_to_node(cpu);
-
-       if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
-               return 0;
-
-       oldpda = cpu_pda(cpu);
-       newpda = kmalloc_node(size, GFP_ATOMIC, node);
-       if (!newpda) {
-               printk(KERN_ERR "Could not allocate node local PDA "
-                       "for CPU %d on node %d\n", cpu, node);
-
-               if (oldpda)
-                       return 0;       /* have a usable pda */
-               else
-                       return -1;
-       }
-
-       if (oldpda) {
-               memcpy(newpda, oldpda, size);
-               free_bootmem_pda(oldpda);
-       }
-
-       newpda->in_bootmem = 0;
-       cpu_pda(cpu) = newpda;
-       return 0;
-}
-#endif /* CONFIG_X86_64 */
-
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -808,16 +762,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
        };
        INIT_WORK(&c_idle.work, do_fork_idle);
 
-#ifdef CONFIG_X86_64
-       /* Allocate node local memory for AP pdas */
-       if (cpu > 0) {
-               boot_error = get_local_pda(cpu);
-               if (boot_error)
-                       goto restore_state;
-                       /* if can't get pda memory, can't start cpu */
-       }
-#endif
-
        alternatives_smp_switch(1);
 
        c_idle.idle = get_idle_for_cpu(cpu);
@@ -847,14 +791,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 
        set_idle_for_cpu(cpu, c_idle.idle);
 do_rest:
-#ifdef CONFIG_X86_32
        per_cpu(current_task, cpu) = c_idle.idle;
-       init_gdt(cpu);
+#ifdef CONFIG_X86_32
        /* Stack for startup_32 can be just as for start_secondary onwards */
        irq_ctx_init(cpu);
 #else
-       cpu_pda(cpu)->pcurrent = c_idle.idle;
        clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+       initial_gs = per_cpu_offset(cpu);
+       per_cpu(kernel_stack, cpu) =
+               (unsigned long)task_stack_page(c_idle.idle) -
+               KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
@@ -931,9 +877,7 @@ do_rest:
                                inquire_remote_apic(apicid);
                }
        }
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
+
        if (boot_error) {
                /* Try to put things back the way they were before ... */
                numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -1125,6 +1069,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
                printk(KERN_ERR "... forcing use of dummy APIC emulation."
                                "(tell your hw vendor)\n");
                smpboot_clear_io_apic();
+               disable_ioapic_setup();
                return -1;
        }
 
@@ -1240,10 +1185,7 @@ out:
 void __init native_smp_prepare_boot_cpu(void)
 {
        int me = smp_processor_id();
-#ifdef CONFIG_X86_32
-       init_gdt(me);
-#endif
-       switch_to_new_gdt();
+       switch_to_new_gdt(me);
        /* already set me in cpu_online_mask in boot_cpu_init() */
        cpumask_set_cpu(me, cpu_callout_mask);
        per_cpu(cpu_state, me) = CPU_ONLINE;
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644 (file)
index 397e309..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * SMP stuff which is common to all sub-architectures.
- */
-#include <linux/module.h>
-#include <asm/smp.h>
-
-#ifdef CONFIG_X86_32
-DEFINE_PER_CPU(unsigned long, this_cpu_off);
-EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-
-/*
- * Initialize the CPU's GDT.  This is either the boot CPU doing itself
- * (still using the master per-cpu area), or a CPU doing it for a
- * secondary which will soon come up.
- */
-__cpuinit void init_gdt(int cpu)
-{
-       struct desc_struct gdt;
-
-       pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
-                       0x2 | DESCTYPE_S, 0x8);
-       gdt.s = 1;
-
-       write_gdt_entry(get_cpu_gdt_table(cpu),
-                       GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
-
-       per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
-       per_cpu(cpu_number, cpu) = cpu;
-}
-#endif
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644 (file)
index ce50546..0000000
+++ /dev/null
@@ -1,256 +0,0 @@
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-
-#include <asm/tlbflush.h>
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
-                       ____cacheline_aligned = { &init_mm, 0, };
-
-/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
-
-/*
- *     Smarter SMP flushing macros.
- *             c/o Linus Torvalds.
- *
- *     These mean you can really definitely utterly forget about
- *     writing to user space from interrupts. (Its not allowed anyway).
- *
- *     Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct *flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
-       BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
-       cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
-       load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *     Stop ipi delivery for the old mm. This is not synchronized with
- *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *     for the wrong mm, and in the worst case we perform a superfluous
- *     tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *     was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- *     Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- *     Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
- *     flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *     Atomically set the bit [other cpus will start sending flush ipis],
- *     and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- *   runs in kernel space, the cpu could load tlb entries for user space
- *   pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-void smp_invalidate_interrupt(struct pt_regs *regs)
-{
-       unsigned long cpu;
-
-       cpu = get_cpu();
-
-       if (!cpu_isset(cpu, flush_cpumask))
-               goto out;
-               /*
-                * This was a BUG() but until someone can quote me the
-                * line from the intel manual that guarantees an IPI to
-                * multiple CPUs is retried _only_ on the erroring CPUs
-                * its staying as a return
-                *
-                * BUG();
-                */
-
-       if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
-               if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
-                       if (flush_va == TLB_FLUSH_ALL)
-                               local_flush_tlb();
-                       else
-                               __flush_tlb_one(flush_va);
-               } else
-                       leave_mm(cpu);
-       }
-       ack_APIC_irq();
-       smp_mb__before_clear_bit();
-       cpu_clear(cpu, flush_cpumask);
-       smp_mb__after_clear_bit();
-out:
-       put_cpu_no_resched();
-       inc_irq_stat(irq_tlb_count);
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
-                            unsigned long va)
-{
-       cpumask_t cpumask = *cpumaskp;
-
-       /*
-        * A couple of (to be removed) sanity checks:
-        *
-        * - current CPU must not be in mask
-        * - mask must exist :)
-        */
-       BUG_ON(cpus_empty(cpumask));
-       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-       BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
-       /* If a CPU which we ran on has gone down, OK. */
-       cpus_and(cpumask, cpumask, cpu_online_map);
-       if (unlikely(cpus_empty(cpumask)))
-               return;
-#endif
-
-       /*
-        * i'm not happy about this global shared spinlock in the
-        * MM hot path, but we'll see how contended it is.
-        * AK: x86-64 has a faster method that could be ported.
-        */
-       spin_lock(&tlbstate_lock);
-
-       flush_mm = mm;
-       flush_va = va;
-       cpus_or(flush_cpumask, cpumask, flush_cpumask);
-
-       /*
-        * Make the above memory operations globally visible before
-        * sending the IPI.
-        */
-       smp_mb();
-       /*
-        * We have to send the IPI only to
-        * CPUs affected.
-        */
-       send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
-
-       while (!cpus_empty(flush_cpumask))
-               /* nothing. lockup detection does not belong here */
-               cpu_relax();
-
-       flush_mm = NULL;
-       flush_va = 0;
-       spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
-       struct mm_struct *mm = current->mm;
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
-
-       local_flush_tlb();
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-       preempt_enable();
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
-
-       if (current->active_mm == mm) {
-               if (current->mm)
-                       local_flush_tlb();
-               else
-                       leave_mm(smp_processor_id());
-       }
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
-       preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
-
-       if (current->active_mm == mm) {
-               if (current->mm)
-                       __flush_tlb_one(va);
-                else
-                       leave_mm(smp_processor_id());
-       }
-
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, va);
-
-       preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void *info)
-{
-       unsigned long cpu = smp_processor_id();
-
-       __flush_tlb_all();
-       if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
-               leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
-       on_each_cpu(do_flush_tlb_all, NULL, 1);
-}
-
-void reset_lazy_tlbstate(void)
-{
-       int cpu = raw_smp_processor_id();
-
-       per_cpu(cpu_tlbstate, cpu).state = 0;
-       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-
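
This file goes away because the IPI shootdown path is unified into the common arch/x86/mm/tlb.c (added to the mm Makefile later in this diff); the switch_mm() ordering its long comment documents still applies there. A condensed sketch of that ordering, with the steps numbered as in the comment (simplified helpers, illustration only):

static void switch_mm_order_sketch(struct mm_struct *prev,
				   struct mm_struct *next, int cpu)
{
	cpu_clear(cpu, prev->cpu_vm_mask);		/* 1a1: stop flush IPIs for the old mm */
	percpu_write(cpu_tlbstate.state, TLBSTATE_OK);	/* 1a2: leave lazy TLB mode */
	percpu_write(cpu_tlbstate.active_mm, next);	/* 1a3: accept flushes for the new mm */
	cpu_set(cpu, next->cpu_vm_mask);		/* 1a4: other cpus start sending IPIs */
	load_cr3(next->pgd);				/* finally switch the page tables */
}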
index 6812b82..f4b2f27 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 
 #include <asm/mmu_context.h>
+#include <asm/uv/uv.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_bau.h>
@@ -210,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
  *
  * Send a broadcast and wait for a broadcast message to complete.
  *
- * The cpumaskp mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast was sent to.
  *
- * Returns 1 if all remote flushing was done. The mask is zeroed.
- * Returns 0 if some remote flushing remains to be done. The mask is left
- * unchanged.
+ * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns @flush_mask if some remote flushing remains to be done. The
+ * mask will have some bits still set.
  */
-int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
-                          cpumask_t *cpumaskp)
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
+                                            struct bau_desc *bau_desc,
+                                            struct cpumask *flush_mask)
 {
        int completion_status = 0;
        int right_shift;
@@ -257,66 +259,76 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
                 * the cpu's, all of which are still in the mask.
                 */
                __get_cpu_var(ptcstats).ptc_i++;
-               return 0;
+               return flush_mask;
        }
 
        /*
         * Success, so clear the remote cpu's from the mask so we don't
         * use the IPI method of shootdown on them.
         */
-       for_each_cpu_mask(bit, *cpumaskp) {
+       for_each_cpu(bit, flush_mask) {
                blade = uv_cpu_to_blade_id(bit);
                if (blade == this_blade)
                        continue;
-               cpu_clear(bit, *cpumaskp);
+               cpumask_clear_cpu(bit, flush_mask);
        }
-       if (!cpus_empty(*cpumaskp))
-               return 0;
-       return 1;
+       if (!cpumask_empty(flush_mask))
+               return flush_mask;
+       return NULL;
 }
 
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
- * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @cpumask: mask of all cpu's in which the address is to be removed
  * @mm: mm_struct containing virtual address range
  * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ * @cpu: the current cpu
  *
  * This is the entry point for initiating any UV global TLB shootdown.
  *
  * Purges the translation caches of all specified processors of the given
  * virtual address, or purges all TLB's on specified processors.
  *
- * The caller has derived the cpumaskp from the mm_struct and has subtracted
- * the local cpu from the mask.  This function is called only if there
- * are bits set in the mask. (e.g. flush_tlb_page())
+ * The caller has derived the cpumask from the mm_struct.  This function
+ * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
  *
- * The cpumaskp is converted into a nodemask of the nodes containing
+ * The cpumask is converted into a nodemask of the nodes containing
  * the cpus.
  *
- * Returns 1 if all remote flushing was done.
- * Returns 0 if some remote flushing remains to be done.
+ * Note that this function should be called with preemption disabled.
+ *
+ * Returns NULL if all remote flushing was done.
+ * Returns pointer to cpumask if some remote flushing remains to be
+ * done.  The returned pointer is valid till preemption is re-enabled.
  */
-int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
-                       unsigned long va)
+const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+                                         struct mm_struct *mm,
+                                         unsigned long va, unsigned int cpu)
 {
+       static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
+       struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
        int i;
        int bit;
        int blade;
-       int cpu;
+       int uv_cpu;
        int this_blade;
        int locals = 0;
        struct bau_desc *bau_desc;
 
-       cpu = uv_blade_processor_id();
+       WARN_ON(!in_atomic());
+
+       cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
+       uv_cpu = uv_blade_processor_id();
        this_blade = uv_numa_blade_id();
        bau_desc = __get_cpu_var(bau_control).descriptor_base;
-       bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
+       bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
 
        bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 
        i = 0;
-       for_each_cpu_mask(bit, *cpumaskp) {
+       for_each_cpu(bit, flush_mask) {
                blade = uv_cpu_to_blade_id(bit);
                BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
                if (blade == this_blade) {
@@ -331,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
                 * no off_node flushing; return status for local node
                 */
                if (locals)
-                       return 0;
+                       return flush_mask;
                else
-                       return 1;
+                       return NULL;
        }
        __get_cpu_var(ptcstats).requestor++;
        __get_cpu_var(ptcstats).ntargeted += i;
 
        bau_desc->payload.address = va;
-       bau_desc->payload.sending_cpu = smp_processor_id();
+       bau_desc->payload.sending_cpu = cpu;
 
-       return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
+       return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
 }
 
 /*
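
The new return convention moves the remaining-cpus bookkeeping to the caller. A caller-side sketch (send_ipi_flush() is a hypothetical fallback name; the real caller sits in the common flush_tlb_others() path):

static void flush_remote_sketch(const struct cpumask *cpumask,
				struct mm_struct *mm, unsigned long va)
{
	unsigned int cpu = get_cpu();
	const struct cpumask *left;

	left = uv_flush_tlb_others(cpumask, mm, va, cpu);
	if (left)				/* BAU did not cover these cpus... */
		send_ipi_flush(left, mm, va);	/* ...fall back to IPIs (hypothetical helper) */
	put_cpu();
}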
index 98c2d05..ed5aee5 100644 (file)
@@ -59,7 +59,6 @@
 #ifdef CONFIG_X86_64
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
-#include <asm/pda.h>
 #else
 #include <asm/processor-flags.h>
 #include <asm/arch_hooks.h>
index 1d3302c..eb9e734 100644 (file)
@@ -670,10 +670,11 @@ static inline int __init activate_vmi(void)
        para_fill(pv_mmu_ops.write_cr2, SetCR2);
        para_fill(pv_mmu_ops.write_cr3, SetCR3);
        para_fill(pv_cpu_ops.write_cr4, SetCR4);
-       para_fill(pv_irq_ops.save_fl, GetInterruptMask);
-       para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
-       para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
-       para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+       para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
+       para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
+       para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
+       para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
 
        para_fill(pv_cpu_ops.wbinvd, WBINVD);
        para_fill(pv_cpu_ops.read_tsc, RDTSC);
index 82c6755..3eba7f7 100644 (file)
@@ -178,14 +178,7 @@ SECTIONS
        __initramfs_end = .;
   }
 #endif
-  . = ALIGN(PAGE_SIZE);
-  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
-       __per_cpu_start = .;
-       *(.data.percpu.page_aligned)
-       *(.data.percpu)
-       *(.data.percpu.shared_aligned)
-       __per_cpu_end = .;
-  }
+  PERCPU(PAGE_SIZE)
   . = ALIGN(PAGE_SIZE);
   /* freed after init ends here */
 
index 1a614c0..07f62d2 100644 (file)
@@ -5,6 +5,7 @@
 #define LOAD_OFFSET __START_KERNEL_map
 
 #include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
 #include <asm/page.h>
 
 #undef i386    /* in case the preprocessor is a 32bit one */
@@ -13,12 +14,15 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
-_proxy_pda = 1;
 PHDRS {
        text PT_LOAD FLAGS(5);  /* R_E */
        data PT_LOAD FLAGS(7);  /* RWE */
        user PT_LOAD FLAGS(7);  /* RWE */
        data.init PT_LOAD FLAGS(7);     /* RWE */
+#ifdef CONFIG_SMP
+       percpu PT_LOAD FLAGS(7);        /* RWE */
+#endif
+       data.init2 PT_LOAD FLAGS(7);    /* RWE */
        note PT_NOTE FLAGS(0);  /* ___ */
 }
 SECTIONS
@@ -208,14 +212,28 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
+#ifdef CONFIG_SMP
+  /*
+   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+   * output PHDR, so the next output section - __data_nosave - should
+   * start another section data.init2.  Also, pda should be at the head of
+   * percpu area.  Preallocate it and define the percpu offset symbol
+   * so that it can be accessed as a percpu variable.
+   */
+  . = ALIGN(PAGE_SIZE);
+  PERCPU_VADDR(0, :percpu)
+#else
   PERCPU(PAGE_SIZE)
+#endif
 
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
 
   . = ALIGN(PAGE_SIZE);
   __nosave_begin = .;
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+      *(.data.nosave)
+  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
@@ -244,3 +262,8 @@ SECTIONS
  */
 ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
        "kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+        "irq_stack_union is not at start of per-cpu area");
+#endif
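
A sketch of the layout this assertion protects (field names follow the irq_stack_union introduced by this series; sizes are assumptions): gcc's -fstack-protector reads the canary at a fixed %gs:40, so the union holding it must begin at per-cpu offset 0.

union irq_stack_union {
	char irq_stack[IRQ_STACK_SIZE];		/* per-cpu IRQ stack */
	struct {
		char gs_base[40];		/* padding so the canary lands at offset 40 */
		unsigned long stack_canary;	/* read by gcc-generated code as %gs:40 */
	};
};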
index a688f3b..c609205 100644 (file)
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
                flags &= ~X86_EFLAGS_IF;
        return flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
 static void vsmp_restore_fl(unsigned long flags)
 {
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
                flags |= X86_EFLAGS_AC;
        native_restore_fl(flags);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
 static void vsmp_irq_disable(void)
 {
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
 
        native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
 static void vsmp_irq_enable(void)
 {
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
 
        native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
 
 static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
                                  unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
               cap, ctl);
        if (cap & ctl & (1 << 4)) {
                /* Setup irq ops and turn on vSMP  IRQ fastpath handling */
-               pv_irq_ops.irq_disable = vsmp_irq_disable;
-               pv_irq_ops.irq_enable  = vsmp_irq_enable;
-               pv_irq_ops.save_fl  = vsmp_save_fl;
-               pv_irq_ops.restore_fl  = vsmp_restore_fl;
+               pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+               pv_irq_ops.irq_enable  = PV_CALLEE_SAVE(vsmp_irq_enable);
+               pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl);
+               pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);
                pv_init_ops.patch = vsmp_patch;
 
                ctl &= ~(1 << 4);
index 695e426..3909e3b 100644 (file)
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(init_level4_pgt);
 EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);
index 92f1c6f..19e33b6 100644 (file)
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
 {
        return lguest_data.irq_enabled;
 }
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 
 /* restore_flags() just sets the flags back to the value given. */
 static void restore_fl(unsigned long flags)
 {
        lguest_data.irq_enabled = flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
        lguest_data.irq_enabled = 0;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 
 /* Interrupts go on... */
 static void irq_enable(void)
 {
        lguest_data.irq_enabled = X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+
 /*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
@@ -984,10 +989,10 @@ __init void lguest_init(void)
 
        /* interrupt-related operations */
        pv_irq_ops.init_IRQ = lguest_init_IRQ;
-       pv_irq_ops.save_fl = save_fl;
-       pv_irq_ops.restore_fl = restore_fl;
-       pv_irq_ops.irq_disable = irq_disable;
-       pv_irq_ops.irq_enable = irq_enable;
+       pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+       pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+       pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+       pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
        pv_irq_ops.safe_halt = lguest_safe_halt;
 
        /* init-time operations */
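
The same three-step recipe shown here for lguest (and for vSMP above) applies to any pv_irq_ops hook; a generic sketch with placeholder names:

static unsigned long my_backend_flags;		/* hypothetical backend state */

static unsigned long my_save_fl(void)
{
	return my_backend_flags;
}
PV_CALLEE_SAVE_REGS_THUNK(my_save_fl);		/* emits the register-saving asm wrapper */

/* ...and at init time install the thunk, not the bare function: */
pv_irq_ops.save_fl = PV_CALLEE_SAVE(my_save_fl);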
index d914a79..66b7eb5 100644 (file)
@@ -9,6 +9,7 @@
 #include <asm/e820.h>
 #include <asm/io.h>
 #include <asm/setup.h>
+#include <asm/cpu.h>
 
 void __init pre_intr_init_hook(void)
 {
index 7ffcdee..98e3c2b 100644 (file)
@@ -400,7 +400,7 @@ void __init find_smp_config(void)
             VOYAGER_SUS_IN_CONTROL_PORT);
 
        current_thread_info()->cpu = boot_cpu_id;
-       x86_write_percpu(cpu_number, boot_cpu_id);
+       percpu_write(cpu_number, boot_cpu_id);
 }
 
 /*
@@ -528,7 +528,6 @@ static void __init do_boot_cpu(__u8 cpu)
        /* init_tasks (in sched.c) is indexed logically */
        stack_start.sp = (void *)idle->thread.sp;
 
-       init_gdt(cpu);
        per_cpu(current_task, cpu) = idle;
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        irq_ctx_init(cpu);
@@ -1745,13 +1744,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
 
 static void __cpuinit voyager_smp_prepare_boot_cpu(void)
 {
-       init_gdt(smp_processor_id());
-       switch_to_new_gdt();
+       int cpu = smp_processor_id();
+       switch_to_new_gdt(cpu);
 
-       cpu_online_map = cpumask_of_cpu(smp_processor_id());
-       cpu_callout_map = cpumask_of_cpu(smp_processor_id());
-       cpu_callin_map = CPU_MASK_NONE;
-       cpu_present_map = cpumask_of_cpu(smp_processor_id());
+       cpu_set(cpu, cpu_online_map);
+       cpu_set(cpu, cpu_callout_map);
+       cpu_set(cpu, cpu_possible_map);
+       cpu_set(cpu, cpu_present_map);
 
 }
 
@@ -1779,7 +1778,6 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
 void __init smp_setup_processor_id(void)
 {
        current_thread_info()->cpu = hard_smp_processor_id();
-       x86_write_percpu(cpu_number, hard_smp_processor_id());
 }
 
 static void voyager_send_call_func(const struct cpumask *callmask)
index d8cc96a..9f05157 100644 (file)
@@ -1,6 +1,8 @@
 obj-y  :=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
            pat.o pgtable.o gup.o
 
+obj-$(CONFIG_X86_SMP)          += tlb.o
+
 obj-$(CONFIG_X86_32)           += pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
index c76ef1d..976b5a7 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
  *
  * Opcode checker based on code by Richard Brunner
  */
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
-                      unsigned long error_code)
+static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
+                       unsigned long addr)
 {
        unsigned char *instr;
        int scan_more = 1;
@@ -409,15 +410,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 }
 
 #ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-                                unsigned long error_code)
+static noinline void pgtable_bad(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
 {
        unsigned long flags = oops_begin();
        int sig = SIGKILL;
-       struct task_struct *tsk;
+       struct task_struct *tsk = current;
 
        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-              current->comm, address);
+              tsk->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
@@ -429,6 +430,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 }
 #endif
 
+static noinline void no_context(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       struct task_struct *tsk = current;
+       unsigned long *stackend;
+
+#ifdef CONFIG_X86_64
+       unsigned long flags;
+       int sig;
+#endif
+
+       /* Are we prepared to handle this kernel fault?  */
+       if (fixup_exception(regs))
+               return;
+
+       /*
+        * X86_32
+        * Valid to do another page fault here, because if this fault
+        * had been triggered by is_prefetch fixup_exception would have
+        * handled it.
+        *
+        * X86_64
+        * Hall of shame of CPU/BIOS bugs.
+        */
+       if (is_prefetch(regs, error_code, address))
+               return;
+
+       if (is_errata93(regs, address))
+               return;
+
+       /*
+        * Oops. The kernel tried to access some bad page. We'll have to
+        * terminate things with extreme prejudice.
+        */
+#ifdef CONFIG_X86_32
+       bust_spinlocks(1);
+#else
+       flags = oops_begin();
+#endif
+
+       show_fault_oops(regs, error_code, address);
+
+       stackend = end_of_stack(tsk);
+       if (*stackend != STACK_END_MAGIC)
+               printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+       die("Oops", regs, error_code);
+       bust_spinlocks(0);
+       do_exit(SIGKILL);
+#else
+       sig = SIGKILL;
+       if (__die("Oops", regs, error_code))
+               sig = 0;
+       /* Executive summary in case the body of the oops scrolled away */
+       printk(KERN_EMERG "CR2: %016lx\n", address);
+       oops_end(flags, regs, sig);
+#endif
+}
+
+static void __bad_area_nosemaphore(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address,
+                       int si_code)
+{
+       struct task_struct *tsk = current;
+
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & PF_USER) {
+               /*
+                * It's possible to have interrupts off here.
+                */
+               local_irq_enable();
+
+               /*
+                * Valid to do another page fault here because this one came
+                * from user space.
+                */
+               if (is_prefetch(regs, error_code, address))
+                       return;
+
+               if (is_errata100(regs, address))
+                       return;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                   printk_ratelimit()) {
+                       printk(
+                       "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+                       tsk->comm, task_pid_nr(tsk), address,
+                       (void *) regs->ip, (void *) regs->sp, error_code);
+                       print_vma_addr(" in ", regs->ip);
+                       printk("\n");
+               }
+
+               tsk->thread.cr2 = address;
+               /* Kernel addresses are always protection faults */
+               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_no = 14;
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+               return;
+       }
+
+       if (is_f00f_bug(regs, address))
+               return;
+
+       no_context(regs, error_code, address);
+}
+
+static noinline void bad_area_nosemaphore(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void __bad_area(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address,
+                       int si_code)
+{
+       struct mm_struct *mm = current->mm;
+
+       /*
+        * Something tried to access memory that isn't in our memory map..
+        * Fix it, but check if it's kernel or user first..
+        */
+       up_read(&mm->mmap_sem);
+
+       __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void bad_area(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void bad_area_access_error(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+static void out_of_memory(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       /*
+        * We ran out of memory, call the OOM killer, and return to userspace
+        * (which will retry the fault, or kill us if we got oom-killed).
+        */
+       up_read(&current->mm->mmap_sem);
+       pagefault_out_of_memory();
+}
+
+static void do_sigbus(struct pt_regs *regs,
+                       unsigned long error_code, unsigned long address)
+{
+       struct task_struct *tsk = current;
+       struct mm_struct *mm = tsk->mm;
+
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die */
+       if (!(error_code & PF_USER))
+               no_context(regs, error_code, address);
+#ifdef CONFIG_X86_32
+       /* User space => ok to do another page fault */
+       if (is_prefetch(regs, error_code, address))
+               return;
+#endif
+       tsk->thread.cr2 = address;
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 14;
+       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline void mm_fault_error(struct pt_regs *regs,
+               unsigned long error_code, unsigned long address, unsigned int fault)
+{
+       if (fault & VM_FAULT_OOM)
+               out_of_memory(regs, error_code, address);
+       else if (fault & VM_FAULT_SIGBUS)
+               do_sigbus(regs, error_code, address);
+       else
+               BUG();
+}
+
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
        if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +639,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static int spurious_fault(unsigned long address,
-                         unsigned long error_code)
+static noinline int spurious_fault(unsigned long error_code,
+                               unsigned long address)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -494,7 +685,7 @@ static int spurious_fault(unsigned long address,
  *
  * This assumes no large pages in there.
  */
-static int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
 {
 #ifdef CONFIG_X86_32
        unsigned long pgd_paddr;
@@ -573,6 +764,25 @@ static int vmalloc_fault(unsigned long address)
 
 int show_unhandled_signals = 1;
 
+static inline int access_error(unsigned long error_code, int write,
+                               struct vm_area_struct *vma)
+{
+       if (write) {
+               /* write, present and write, not present */
+               if (unlikely(!(vma->vm_flags & VM_WRITE)))
+                       return 1;
+       } else if (unlikely(error_code & PF_PROT)) {
+               /* read, present */
+               return 1;
+       } else {
+               /* read, not present */
+               if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+                       return 1;
+       }
+
+       return 0;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +793,12 @@ asmlinkage
 #endif
 void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+       unsigned long address;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
-       unsigned long address;
-       int write, si_code;
+       int write;
        int fault;
-#ifdef CONFIG_X86_64
-       unsigned long flags;
-       int sig;
-#endif
 
        tsk = current;
        mm = tsk->mm;
@@ -601,8 +807,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
        /* get the address */
        address = read_cr2();
 
-       si_code = SEGV_MAPERR;
-
+       if (unlikely(notify_page_fault(regs)))
        if (unlikely(kmmio_fault(regs, address)))
                return;
 
@@ -629,7 +834,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                        return;
 
                /* Can handle a stale RO->RW TLB */
-               if (spurious_fault(address, error_code))
+               if (spurious_fault(error_code, address))
                        return;
 
                /* kprobes don't want to hook the spurious faults. */
@@ -639,13 +844,13 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
-               goto bad_area_nosemaphore;
+               bad_area_nosemaphore(regs, error_code, address);
+               return;
        }
 
        /* kprobes don't want to hook the spurious faults. */
        if (notify_page_fault(regs))
                return;
-
        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
@@ -661,15 +866,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 #ifdef CONFIG_X86_64
        if (unlikely(error_code & PF_RSVD))
-               pgtable_bad(address, regs, error_code);
+               pgtable_bad(regs, error_code, address);
 #endif
 
        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
-       if (unlikely(in_atomic() || !mm))
-               goto bad_area_nosemaphore;
+       if (unlikely(in_atomic() || !mm)) {
+               bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
 
        /*
         * When running in the kernel we expect faults to occur only to
@@ -687,20 +894,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
-       if (!down_read_trylock(&mm->mmap_sem)) {
+       if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if ((error_code & PF_USER) == 0 &&
-                   !search_exception_tables(regs->ip))
-                       goto bad_area_nosemaphore;
+                   !search_exception_tables(regs->ip)) {
+                       bad_area_nosemaphore(regs, error_code, address);
+                       return;
+               }
                down_read(&mm->mmap_sem);
        }
 
        vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (vma->vm_start <= address)
+       if (unlikely(!vma)) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+       if (likely(vma->vm_start <= address))
                goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
+       if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+               bad_area(regs, error_code, address);
+               return;
+       }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
@@ -708,31 +921,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                 * and pusha to work.  ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
-               if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
-                       goto bad_area;
+               if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+                       bad_area(regs, error_code, address);
+                       return;
+               }
        }
-       if (expand_stack(vma, address))
-               goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+       if (unlikely(expand_stack(vma, address))) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
 good_area:
-       si_code = SEGV_ACCERR;
-       write = 0;
-       switch (error_code & (PF_PROT|PF_WRITE)) {
-       default:        /* 3: write, present */
-               /* fall through */
-       case PF_WRITE:          /* write, not present */
-               if (!(vma->vm_flags & VM_WRITE))
-                       goto bad_area;
-               write++;
-               break;
-       case PF_PROT:           /* read, present */
-               goto bad_area;
-       case 0:                 /* read, not present */
-               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                       goto bad_area;
+       write = error_code & PF_WRITE;
+       if (unlikely(access_error(error_code, write, vma))) {
+               bad_area_access_error(regs, error_code, address);
+               return;
        }
 
        /*
@@ -742,11 +949,8 @@ good_area:
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
+               mm_fault_error(regs, error_code, address, fault);
+               return;
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
@@ -764,128 +968,6 @@ good_area:
        }
 #endif
        up_read(&mm->mmap_sem);
-       return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses just cause a SIGSEGV */
-       if (error_code & PF_USER) {
-               /*
-                * It's possible to have interrupts off here.
-                */
-               local_irq_enable();
-
-               /*
-                * Valid to do another page fault here because this one came
-                * from user space.
-                */
-               if (is_prefetch(regs, address, error_code))
-                       return;
-
-               if (is_errata100(regs, address))
-                       return;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit()) {
-                       printk(
-                       "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
-                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-                       tsk->comm, task_pid_nr(tsk), address,
-                       (void *) regs->ip, (void *) regs->sp, error_code);
-                       print_vma_addr(" in ", regs->ip);
-                       printk("\n");
-               }
-
-               tsk->thread.cr2 = address;
-               /* Kernel addresses are always protection faults */
-               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-               tsk->thread.trap_no = 14;
-               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-               return;
-       }
-
-       if (is_f00f_bug(regs, address))
-               return;
-
-no_context:
-       /* Are we prepared to handle this kernel fault?  */
-       if (fixup_exception(regs))
-               return;
-
-       /*
-        * X86_32
-        * Valid to do another page fault here, because if this fault
-        * had been triggered by is_prefetch fixup_exception would have
-        * handled it.
-        *
-        * X86_64
-        * Hall of shame of CPU/BIOS bugs.
-        */
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-       if (is_errata93(regs, address))
-               return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
-       bust_spinlocks(1);
-#else
-       flags = oops_begin();
-#endif
-
-       show_fault_oops(regs, error_code, address);
-
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
-       die("Oops", regs, error_code);
-       bust_spinlocks(0);
-       do_exit(SIGKILL);
-#else
-       sig = SIGKILL;
-       if (__die("Oops", regs, error_code))
-               sig = 0;
-       /* Executive summary in case the body of the oops scrolled away */
-       printk(KERN_EMERG "CR2: %016lx\n", address);
-       oops_end(flags, regs, sig);
-#endif
-
-out_of_memory:
-       /*
-        * We ran out of memory, call the OOM killer, and return the userspace
-        * (which will retry the fault, or kill us if we got oom-killed).
-        */
-       up_read(&mm->mmap_sem);
-       pagefault_out_of_memory();
-       return;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       if (!(error_code & PF_USER))
-               goto no_context;
-#ifdef CONFIG_X86_32
-       /* User space => ok to do another page fault */
-       if (is_prefetch(regs, address, error_code))
-               return;
-#endif
-       tsk->thread.cr2 = address;
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 14;
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 
 DEFINE_SPINLOCK(pgd_lock);
index 2cef050..00263bf 100644 (file)
@@ -49,7 +49,6 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
-#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
index 71a14f8..08d140f 100644 (file)
 #include <asm/acpi.h>
 #include <asm/k8.h>
 
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
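
DBG() above is compiled out entirely unless CONFIG_DEBUG_PER_CPU_MAPS is set, so the extra map-consistency prints cost nothing in production builds. A minimal stand-alone version of the same pattern (DEBUG_MAPS is a local stand-in for the Kconfig switch, and fprintf stands in for printk):

#include <stdio.h>

/* Set to 1, or build with -DDEBUG_MAPS, to enable the debug output. */
#ifndef DEBUG_MAPS
#define DEBUG_MAPS 0
#endif

#if DEBUG_MAPS
# define DBG(fmt, ...) fprintf(stderr, "debug: " fmt, ##__VA_ARGS__)
#else
# define DBG(fmt, ...) do { } while (0)   /* compiles away entirely */
#endif

int main(void)
{
        DBG("mapping cpu %d to node %d\n", 3, 1);
        printf("done\n");
        return 0;
}
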
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
@@ -33,6 +39,21 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
+DEFINE_PER_CPU(int, node_number) = 0;
+EXPORT_PER_CPU_SYMBOL(node_number);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
 #endif
 
 
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+       unsigned int node, num = 0;
+       cpumask_t *map;
+
+       /* setup nr_node_ids if not done yet */
+       if (nr_node_ids == MAX_NUMNODES) {
+               for_each_node_mask(node, node_possible_map)
+                       num = node;
+               nr_node_ids = num + 1;
+       }
+
+       /* allocate the map */
+       map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+       DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
+
+       pr_debug("Node to cpumask map at %p for %d nodes\n",
+                map, nr_node_ids);
+
+       /* node_to_cpumask() will now work */
+       node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+       int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+       /* early setting, no percpu area yet */
+       if (cpu_to_node_map) {
+               cpu_to_node_map[cpu] = node;
+               return;
+       }
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+       if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+               printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+               dump_stack();
+               return;
+       }
+#endif
+       per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+       if (node != NUMA_NO_NODE)
+               per_cpu(node_number, cpu) = node;
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+       numa_set_node(cpu, NUMA_NO_NODE);
+}
+
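
numa_set_node() above has to work both before and after the per-cpu areas exist: early in boot it writes into the static early map returned by early_per_cpu_ptr(), and once that pointer has been cleared it updates the real per-cpu variable. A stand-alone sketch of the fallback pattern, with a plain array and pointer standing in for the early map and the per-cpu machinery (all names below are local stand-ins):

#include <stdio.h>

#define NR_CPUS      8
#define NUMA_NO_NODE (-1)

/* Early boot map, used until the "per-cpu" storage is ready. */
static int early_cpu_to_node_map[NR_CPUS] = {
        [0 ... NR_CPUS - 1] = NUMA_NO_NODE,
};
static int *early_map = early_cpu_to_node_map; /* cleared once per-cpu is up */

/* Stand-in for the real per-cpu variable. */
static int percpu_node[NR_CPUS];

static void numa_set_node(int cpu, int node)
{
        if (early_map) {                /* early setting, no per-cpu area yet */
                early_map[cpu] = node;
                return;
        }
        percpu_node[cpu] = node;        /* normal path once per-cpu data exists */
}

int main(void)
{
        numa_set_node(2, 1);            /* lands in the early map */
        early_map = NULL;               /* pretend per-cpu areas are now set up */
        numa_set_node(3, 0);            /* lands in the per-cpu stand-in */

        printf("early: cpu2 -> node %d\n", early_cpu_to_node_map[2]);
        printf("percpu: cpu3 -> node %d\n", percpu_node[3]);
        return 0;
}
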
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+       cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+       cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+       int node = early_cpu_to_node(cpu);
+       cpumask_t *mask;
+       char buf[64];
+
+       if (node_to_cpumask_map == NULL) {
+               printk(KERN_ERR "node_to_cpumask_map NULL\n");
+               dump_stack();
+               return;
+       }
+
+       mask = &node_to_cpumask_map[node];
+       if (enable)
+               cpu_set(cpu, *mask);
+       else
+               cpu_clear(cpu, *mask);
+
+       cpulist_scnprintf(buf, sizeof(buf), mask);
+       printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+               enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+       numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+       numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+       if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+               printk(KERN_WARNING
+                       "cpu_to_node(%d): usage too early!\n", cpu);
+               dump_stack();
+               return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+       }
+       return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are set up.
+ */
+int early_cpu_to_node(int cpu)
+{
+       if (early_per_cpu_ptr(x86_cpu_to_node_map))
+               return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+       if (!per_cpu_offset(cpu)) {
+               printk(KERN_WARNING
+                       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+               dump_stack();
+               return NUMA_NO_NODE;
+       }
+       return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const cpumask_t *cpumask_of_node(int node)
+{
+       if (node_to_cpumask_map == NULL) {
+               printk(KERN_WARNING
+                       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
+                       node);
+               dump_stack();
+               return (const cpumask_t *)&cpu_online_map;
+       }
+       if (node >= nr_node_ids) {
+               printk(KERN_WARNING
+                       "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+                       node, nr_node_ids);
+               dump_stack();
+               return &cpu_mask_none;
+       }
+       return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used.  The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+       if (node_to_cpumask_map == NULL) {
+               printk(KERN_WARNING
+                       "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+               dump_stack();
+               return cpu_online_map;
+       }
+       if (node >= nr_node_ids) {
+               printk(KERN_WARNING
+                       "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+                       node, nr_node_ids);
+               dump_stack();
+               return cpu_mask_none;
+       }
+       return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
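
The debug accessors above trade speed for safety: they range-check the node index and fall back to a benign mask instead of indexing past node_to_cpumask_map. A compact user-space analogue of cpumask_of_node(), using one 64-bit word per node as the cpu mask (sizes and fallback values are illustrative only, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define MAX_NODES 4

static uint64_t node_to_cpumask_map[MAX_NODES];
static int nr_node_ids = 2;              /* nodes 0 and 1 are possible */
static const uint64_t cpu_mask_none = 0; /* empty fallback mask */

static const uint64_t *cpumask_of_node(int node)
{
        if (node >= nr_node_ids) {
                fprintf(stderr, "cpumask_of_node(%d): node >= nr_node_ids(%d)\n",
                        node, nr_node_ids);
                return &cpu_mask_none;   /* out of range: return an empty mask */
        }
        return &node_to_cpumask_map[node];
}

int main(void)
{
        node_to_cpumask_map[0] = 0x3;    /* CPUs 0,1 on node 0 */
        node_to_cpumask_map[1] = 0xc;    /* CPUs 2,3 on node 1 */

        printf("node 1 mask: %#llx\n", (unsigned long long)*cpumask_of_node(1));
        printf("node 9 mask: %#llx\n", (unsigned long long)*cpumask_of_node(9));
        return 0;
}
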
index 09737c8..15df1ba 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 
 int acpi_numa __initdata;
 
similarity index 67%
rename from arch/x86/kernel/tlb_64.c
rename to arch/x86/mm/tlb.c
index f8be6f1..72a6d4e 100644 (file)
@@ -1,22 +1,18 @@
 #include <linux/init.h>
 
 #include <linux/mm.h>
-#include <linux/delay.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
 #include <linux/interrupt.h>
+#include <linux/module.h>
 
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/apicdef.h>
-#include <asm/idle.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_bau.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+                       = { &init_mm, 0, };
 
 #include <mach_ipi.h>
 /*
@@ -33,7 +29,7 @@
  *     To avoid global state use 8 different call vectors.
  *     Each CPU uses a specific vector to trigger flushes on other
  *     CPUs. Depending on the received vector the target CPUs look into
- *     the right per cpu variable for the flush data.
+ *     the right array slot for the flush data.
  *
  *     With more than 8 CPUs they are hashed to the 8 available
  *     vectors. The limited global vector space forces us to this right now.
 
 union smp_flush_state {
        struct {
-               cpumask_t flush_cpumask;
                struct mm_struct *flush_mm;
                unsigned long flush_va;
                spinlock_t tlbstate_lock;
+               DECLARE_BITMAP(flush_cpumask, NR_CPUS);
        };
-       char pad[SMP_CACHE_BYTES];
-} ____cacheline_aligned;
+       char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+} ____cacheline_internodealigned_in_smp;
 
 /* State is put into the per CPU data section, but padded
    to a full cache line because other CPUs can access it and we don't
    want false sharing in the per cpu data segment. */
-static DEFINE_PER_CPU(union smp_flush_state, flush_state);
+static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
@@ -62,9 +58,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
  */
 void leave_mm(int cpu)
 {
-       if (read_pda(mmu_state) == TLBSTATE_OK)
+       if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
                BUG();
-       cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+       cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
        load_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
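
flush_state[] above pads each slot out to an internode cache line (____cacheline_internodealigned_in_smp) so that the sender spinning on one slot and remote CPUs acking another never bounce the same line. A small user-space sketch of the same padding idea, assuming a 64-byte line; the field names are local stand-ins:

#include <stdio.h>

#define CACHE_LINE 64   /* assumed line size; the kernel uses a config constant */

/* Hot per-slot state, padded so neighbouring slots never share a line. */
struct flush_slot {
        union {
                struct {
                        volatile unsigned long pending_mask; /* set by sender, cleared by targets */
                        unsigned long flush_va;
                };
                char pad[CACHE_LINE];    /* force the slot to occupy a full line */
        };
} __attribute__((aligned(CACHE_LINE)));

static struct flush_slot flush_state[8];

int main(void)
{
        printf("sizeof(struct flush_slot) = %zu\n", sizeof(struct flush_slot));
        printf("slot 0 at %p, slot 1 at %p\n",
               (void *)&flush_state[0], (void *)&flush_state[1]);
        return 0;
}
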
@@ -117,10 +113,20 @@ EXPORT_SYMBOL_GPL(leave_mm);
  * Interrupts are disabled.
  */
 
-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
+/*
+ * FIXME: the use of asmlinkage here is not consistent.  On x86_64 it is
+ * a noop, kept only for documentation purposes, but even that usage is
+ * slightly inconsistent.  On x86_32, asmlinkage means regparm(0), yet the
+ * interrupt entry code calls in with the first parameter in %eax.  Maybe
+ * define intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
 {
-       int cpu;
-       int sender;
+       unsigned int cpu;
+       unsigned int sender;
        union smp_flush_state *f;
 
        cpu = smp_processor_id();
@@ -129,9 +135,9 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
         * Use that to determine where the sender put the data.
         */
        sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
-       f = &per_cpu(flush_state, sender);
+       f = &flush_state[sender];
 
-       if (!cpu_isset(cpu, f->flush_cpumask))
+       if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
                goto out;
                /*
                 * This was a BUG() but until someone can quote me the
@@ -142,8 +148,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
                 * BUG();
                 */
 
-       if (f->flush_mm == read_pda(active_mm)) {
-               if (read_pda(mmu_state) == TLBSTATE_OK) {
+       if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+               if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                        if (f->flush_va == TLB_FLUSH_ALL)
                                local_flush_tlb();
                        else
@@ -153,23 +159,21 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
        }
 out:
        ack_APIC_irq();
-       cpu_clear(cpu, f->flush_cpumask);
+       smp_mb__before_clear_bit();
+       cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+       smp_mb__after_clear_bit();
        inc_irq_stat(irq_tlb_count);
 }
 
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
-                            unsigned long va)
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+                                struct mm_struct *mm, unsigned long va)
 {
-       int sender;
+       unsigned int sender;
        union smp_flush_state *f;
-       cpumask_t cpumask = *cpumaskp;
-
-       if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
-               return;
 
        /* Caller has disabled preemption */
        sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
-       f = &per_cpu(flush_state, sender);
+       f = &flush_state[sender];
 
        /*
         * Could avoid this lock when
@@ -180,7 +184,8 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
 
        f->flush_mm = mm;
        f->flush_va = va;
-       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
+       cpumask_andnot(to_cpumask(f->flush_cpumask),
+                      cpumask, cpumask_of(smp_processor_id()));
 
        /*
         * Make the above memory operations globally visible before
@@ -191,9 +196,10 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-       send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+       send_IPI_mask(to_cpumask(f->flush_cpumask),
+                     INVALIDATE_TLB_VECTOR_START + sender);
 
-       while (!cpus_empty(f->flush_cpumask))
+       while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
                cpu_relax();
 
        f->flush_mm = NULL;
@@ -201,12 +207,28 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
        spin_unlock(&f->tlbstate_lock);
 }
 
+void native_flush_tlb_others(const struct cpumask *cpumask,
+                            struct mm_struct *mm, unsigned long va)
+{
+       if (is_uv_system()) {
+               unsigned int cpu;
+
+               cpu = get_cpu();
+               cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+               if (cpumask)
+                       flush_tlb_others_ipi(cpumask, mm, va);
+               put_cpu();
+               return;
+       }
+       flush_tlb_others_ipi(cpumask, mm, va);
+}
+
 static int __cpuinit init_smp_flush(void)
 {
        int i;
 
-       for_each_possible_cpu(i)
-               spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
+       for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+               spin_lock_init(&flush_state[i].tlbstate_lock);
 
        return 0;
 }
@@ -215,25 +237,18 @@ core_initcall(init_smp_flush);
 void flush_tlb_current_task(void)
 {
        struct mm_struct *mm = current->mm;
-       cpumask_t cpu_mask;
 
        preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
 
        local_flush_tlb();
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+       if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+               flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
        preempt_enable();
 }
 
 void flush_tlb_mm(struct mm_struct *mm)
 {
-       cpumask_t cpu_mask;
-
        preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
 
        if (current->active_mm == mm) {
                if (current->mm)
@@ -241,8 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
                else
                        leave_mm(smp_processor_id());
        }
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+       if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+               flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
 
        preempt_enable();
 }
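
The new test cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids replaces the old copy-and-clear of a whole cpumask_t on the stack: it only asks whether any CPU other than the current one has used the mm, without copying the mask. A user-space analogue over a 64-bit mask (nr_cpu_ids and the helper below are local stand-ins):

#include <stdio.h>
#include <stdint.h>

static const int nr_cpu_ids = 64;

/* Return the index of any set bit other than 'cpu', or nr_cpu_ids if none. */
static int cpumask_any_but(uint64_t mask, int cpu)
{
        mask &= ~(1ULL << cpu);
        for (int i = 0; i < nr_cpu_ids; i++)
                if (mask & (1ULL << i))
                        return i;
        return nr_cpu_ids;
}

int main(void)
{
        uint64_t cpu_vm_mask = (1ULL << 2) | (1ULL << 5); /* mm ran on CPUs 2 and 5 */

        if (cpumask_any_but(cpu_vm_mask, 2) < nr_cpu_ids)
                printf("other CPUs may hold stale TLB entries -> send flush IPIs\n");

        if (cpumask_any_but(1ULL << 2, 2) == nr_cpu_ids)
                printf("only this CPU used the mm -> local flush is enough\n");
        return 0;
}
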
@@ -250,11 +265,8 @@ void flush_tlb_mm(struct mm_struct *mm)
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
        struct mm_struct *mm = vma->vm_mm;
-       cpumask_t cpu_mask;
 
        preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);
 
        if (current->active_mm == mm) {
                if (current->mm)
@@ -263,8 +275,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
                        leave_mm(smp_processor_id());
        }
 
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, va);
+       if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+               flush_tlb_others(&mm->cpu_vm_mask, mm, va);
 
        preempt_enable();
 }
@@ -274,7 +286,7 @@ static void do_flush_tlb_all(void *info)
        unsigned long cpu = smp_processor_id();
 
        __flush_tlb_all();
-       if (read_pda(mmu_state) == TLBSTATE_LAZY)
+       if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                leave_mm(cpu);
 }
 
index 6dcefba..3b767d0 100644 (file)
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
 endif
 
 obj-y          := enlighten.o setup.o multicalls.o mmu.o irq.o \
-                       time.o xen-asm_$(BITS).o grant-table.o suspend.o
+                       time.o xen-asm.o xen-asm_$(BITS).o \
+                       grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)              += smp.o spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)     += debugfs.o
\ No newline at end of file
index bea2152..3723034 100644 (file)
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
-/*
- * Identity map, in addition to plain kernel map.  This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
-#ifdef CONFIG_X86_64
-/* l3 pud for userspace vsyscall mapping */
-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
-
-/*
- * Note about cr3 (pagetable base) values:
- *
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3.  This may not be the current effective cr3, because
- * its update may be being lazily deferred.  However, a vcpu looking
- * at its own cr3 can use this value knowing that everything will
- * be self-consistent.
- *
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early).  If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
- */
-DEFINE_PER_CPU(unsigned long, xen_cr3);         /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3);         /* actual vcpu cr3 */
-
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
 struct shared_info xen_dummy_shared_info;
 
+void *xen_initial_gdt;
+
 /*
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  *
  * 0: not available, 1: available
  */
-static int have_vcpu_info_placement =
-#ifdef CONFIG_X86_32
-       1
-#else
-       0
-#endif
-       ;
-
+static int have_vcpu_info_placement = 1;
 
 static void xen_vcpu_setup(int cpu)
 {
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
        return HYPERVISOR_get_debugreg(reg);
 }
 
-static void xen_leave_lazy(void)
+void xen_leave_lazy(void)
 {
        paravirt_leave_lazy(paravirt_get_lazy_mode());
        xen_mc_flush();
@@ -598,83 +564,6 @@ static struct apic_ops xen_basic_apic_ops = {
 
 #endif
 
-static void xen_flush_tlb(void)
-{
-       struct mmuext_op *op;
-       struct multicall_space mcs;
-
-       preempt_disable();
-
-       mcs = xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-       preempt_enable();
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
-{
-       struct mmuext_op *op;
-       struct multicall_space mcs;
-
-       preempt_disable();
-
-       mcs = xen_mc_entry(sizeof(*op));
-       op = mcs.args;
-       op->cmd = MMUEXT_INVLPG_LOCAL;
-       op->arg1.linear_addr = addr & PAGE_MASK;
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-       preempt_enable();
-}
-
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
-                                unsigned long va)
-{
-       struct {
-               struct mmuext_op op;
-               cpumask_t mask;
-       } *args;
-       cpumask_t cpumask = *cpus;
-       struct multicall_space mcs;
-
-       /*
-        * A couple of (to be removed) sanity checks:
-        *
-        * - current CPU must not be in mask
-        * - mask must exist :)
-        */
-       BUG_ON(cpus_empty(cpumask));
-       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-       BUG_ON(!mm);
-
-       /* If a CPU which we ran on has gone down, OK. */
-       cpus_and(cpumask, cpumask, cpu_online_map);
-       if (cpus_empty(cpumask))
-               return;
-
-       mcs = xen_mc_entry(sizeof(*args));
-       args = mcs.args;
-       args->mask = cpumask;
-       args->op.arg2.vcpumask = &args->mask;
-
-       if (va == TLB_FLUSH_ALL) {
-               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-       } else {
-               args->op.cmd = MMUEXT_INVLPG_MULTI;
-               args->op.arg1.linear_addr = va;
-       }
-
-       MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-
-       xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
 
 static void xen_clts(void)
 {
@@ -700,21 +589,6 @@ static void xen_write_cr0(unsigned long cr0)
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
-static void xen_write_cr2(unsigned long cr2)
-{
-       x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
-       return x86_read_percpu(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
-       return x86_read_percpu(xen_vcpu_info.arch.cr2);
-}
-
 static void xen_write_cr4(unsigned long cr4)
 {
        cr4 &= ~X86_CR4_PGE;
@@ -723,71 +597,6 @@ static void xen_write_cr4(unsigned long cr4)
        native_write_cr4(cr4);
 }
 
-static unsigned long xen_read_cr3(void)
-{
-       return x86_read_percpu(xen_cr3);
-}
-
-static void set_current_cr3(void *v)
-{
-       x86_write_percpu(xen_current_cr3, (unsigned long)v);
-}
-
-static void __xen_write_cr3(bool kernel, unsigned long cr3)
-{
-       struct mmuext_op *op;
-       struct multicall_space mcs;
-       unsigned long mfn;
-
-       if (cr3)
-               mfn = pfn_to_mfn(PFN_DOWN(cr3));
-       else
-               mfn = 0;
-
-       WARN_ON(mfn == 0 && kernel);
-
-       mcs = __xen_mc_entry(sizeof(*op));
-
-       op = mcs.args;
-       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-       op->arg1.mfn = mfn;
-
-       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-       if (kernel) {
-               x86_write_percpu(xen_cr3, cr3);
-
-               /* Update xen_current_cr3 once the batch has actually
-                  been submitted. */
-               xen_mc_callback(set_current_cr3, (void *)cr3);
-       }
-}
-
-static void xen_write_cr3(unsigned long cr3)
-{
-       BUG_ON(preemptible());
-
-       xen_mc_batch();  /* disables interrupts */
-
-       /* Update while interrupts are disabled, so its atomic with
-          respect to ipis */
-       x86_write_percpu(xen_cr3, cr3);
-
-       __xen_write_cr3(true, cr3);
-
-#ifdef CONFIG_X86_64
-       {
-               pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
-               if (user_pgd)
-                       __xen_write_cr3(false, __pa(user_pgd));
-               else
-                       __xen_write_cr3(false, 0);
-       }
-#endif
-
-       xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
-}
-
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
        int ret;
@@ -829,185 +638,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
        return ret;
 }
 
-/* Early in boot, while setting up the initial pagetable, assume
-   everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
-{
-#ifdef CONFIG_FLATMEM
-       BUG_ON(mem_map);        /* should only be used early */
-#endif
-       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* Early release_pte assumes that all pts are pinned, since there's
-   only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
-{
-       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-}
-
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-       struct mmuext_op op;
-       op.cmd = cmd;
-       op.arg1.mfn = pfn_to_mfn(pfn);
-       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-               BUG();
-}
-
-/* This needs to make sure the new pte page is pinned iff its being
-   attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
-{
-       struct page *page = pfn_to_page(pfn);
-
-       if (PagePinned(virt_to_page(mm->pgd))) {
-               SetPagePinned(page);
-
-               vm_unmap_aliases();
-               if (!PageHighMem(page)) {
-                       make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
-                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                               pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
-               } else {
-                       /* make sure there are no stray mappings of
-                          this page */
-                       kmap_flush_unused();
-               }
-       }
-}
-
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
-{
-       xen_alloc_ptpage(mm, pfn, PT_PTE);
-}
-
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
-{
-       xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
-
-static int xen_pgd_alloc(struct mm_struct *mm)
-{
-       pgd_t *pgd = mm->pgd;
-       int ret = 0;
-
-       BUG_ON(PagePinned(virt_to_page(pgd)));
-
-#ifdef CONFIG_X86_64
-       {
-               struct page *page = virt_to_page(pgd);
-               pgd_t *user_pgd;
-
-               BUG_ON(page->private != 0);
-
-               ret = -ENOMEM;
-
-               user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-               page->private = (unsigned long)user_pgd;
-
-               if (user_pgd != NULL) {
-                       user_pgd[pgd_index(VSYSCALL_START)] =
-                               __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
-                       ret = 0;
-               }
-
-               BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
-       }
-#endif
-
-       return ret;
-}
-
-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
-       pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
-       if (user_pgd)
-               free_page((unsigned long)user_pgd);
-#endif
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
-{
-       struct page *page = pfn_to_page(pfn);
-
-       if (PagePinned(page)) {
-               if (!PageHighMem(page)) {
-                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-                       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-               }
-               ClearPagePinned(page);
-       }
-}
-
-static void xen_release_pte(unsigned long pfn)
-{
-       xen_release_ptpage(pfn, PT_PTE);
-}
-
-static void xen_release_pmd(unsigned long pfn)
-{
-       xen_release_ptpage(pfn, PT_PMD);
-}
-
-#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
-{
-       xen_alloc_ptpage(mm, pfn, PT_PUD);
-}
-
-static void xen_release_pud(unsigned long pfn)
-{
-       xen_release_ptpage(pfn, PT_PUD);
-}
-#endif
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-       pgprot_t prot = PAGE_KERNEL;
-
-       if (PagePinned(page))
-               prot = PAGE_KERNEL_RO;
-
-       if (0 && PageHighMem(page))
-               printk("mapping highpte %lx type %d prot %s\n",
-                      page_to_pfn(page), type,
-                      (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
-       return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-       /* If there's an existing pte, then don't allow _PAGE_RW to be set */
-       if (pte_val_ma(*ptep) & _PAGE_PRESENT)
-               pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
-                              pte_val_ma(pte));
-
-       return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
-       pte = mask_rw_pte(ptep, pte);
-
-       xen_set_pte(ptep, pte);
-}
-#endif
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
 void xen_setup_shared_info(void)
 {
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,37 +658,6 @@ void xen_setup_shared_info(void)
        xen_setup_mfn_list_list();
 }
 
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
-       xen_setup_shared_info();
-}
-
-static __init void xen_post_allocator_init(void)
-{
-       pv_mmu_ops.set_pte = xen_set_pte;
-       pv_mmu_ops.set_pmd = xen_set_pmd;
-       pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
-       pv_mmu_ops.set_pgd = xen_set_pgd;
-#endif
-
-       /* This will work as long as patching hasn't happened yet
-          (which it hasn't) */
-       pv_mmu_ops.alloc_pte = xen_alloc_pte;
-       pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-       pv_mmu_ops.release_pte = xen_release_pte;
-       pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
-       pv_mmu_ops.alloc_pud = xen_alloc_pud;
-       pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-
-#ifdef CONFIG_X86_64
-       SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
-       xen_mark_init_mm_pinned();
-}
-
 /* This is called once we have the cpu_possible_map */
 void xen_setup_vcpu_info_placement(void)
 {
@@ -1072,10 +671,10 @@ void xen_setup_vcpu_info_placement(void)
        if (have_vcpu_info_placement) {
                printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
-               pv_irq_ops.save_fl = xen_save_fl_direct;
-               pv_irq_ops.restore_fl = xen_restore_fl_direct;
-               pv_irq_ops.irq_disable = xen_irq_disable_direct;
-               pv_irq_ops.irq_enable = xen_irq_enable_direct;
+               pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+               pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+               pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+               pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
                pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
        }
 }
@@ -1133,49 +732,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
        return ret;
 }
 
-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
-{
-       pte_t pte;
-
-       phys >>= PAGE_SHIFT;
-
-       switch (idx) {
-       case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
-       case FIX_F00F_IDT:
-#endif
-#ifdef CONFIG_X86_32
-       case FIX_WP_TEST:
-       case FIX_VDSO:
-# ifdef CONFIG_HIGHMEM
-       case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#else
-       case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-       case FIX_APIC_BASE:     /* maps dummy local APIC */
-#endif
-               pte = pfn_pte(phys, prot);
-               break;
-
-       default:
-               pte = mfn_pte(phys, prot);
-               break;
-       }
-
-       __native_set_fixmap(idx, pte);
-
-#ifdef CONFIG_X86_64
-       /* Replicate changes to map the vsyscall page into the user
-          pagetable vsyscall mapping. */
-       if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
-               unsigned long vaddr = __fix_to_virt(idx);
-               set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
-       }
-#endif
-}
-
 static const struct pv_info xen_info __initdata = {
        .paravirt_enabled = 1,
        .shared_kernel_pmd = 0,
@@ -1271,87 +827,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
 #endif
 };
 
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
-       .pagetable_setup_start = xen_pagetable_setup_start,
-       .pagetable_setup_done = xen_pagetable_setup_done,
-
-       .read_cr2 = xen_read_cr2,
-       .write_cr2 = xen_write_cr2,
-
-       .read_cr3 = xen_read_cr3,
-       .write_cr3 = xen_write_cr3,
-
-       .flush_tlb_user = xen_flush_tlb,
-       .flush_tlb_kernel = xen_flush_tlb,
-       .flush_tlb_single = xen_flush_tlb_single,
-       .flush_tlb_others = xen_flush_tlb_others,
-
-       .pte_update = paravirt_nop,
-       .pte_update_defer = paravirt_nop,
-
-       .pgd_alloc = xen_pgd_alloc,
-       .pgd_free = xen_pgd_free,
-
-       .alloc_pte = xen_alloc_pte_init,
-       .release_pte = xen_release_pte_init,
-       .alloc_pmd = xen_alloc_pte_init,
-       .alloc_pmd_clone = paravirt_nop,
-       .release_pmd = xen_release_pte_init,
-
-#ifdef CONFIG_HIGHPTE
-       .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
-#ifdef CONFIG_X86_64
-       .set_pte = xen_set_pte,
-#else
-       .set_pte = xen_set_pte_init,
-#endif
-       .set_pte_at = xen_set_pte_at,
-       .set_pmd = xen_set_pmd_hyper,
-
-       .ptep_modify_prot_start = __ptep_modify_prot_start,
-       .ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
-       .pte_val = xen_pte_val,
-       .pte_flags = native_pte_flags,
-       .pgd_val = xen_pgd_val,
-
-       .make_pte = xen_make_pte,
-       .make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
-       .set_pte_atomic = xen_set_pte_atomic,
-       .set_pte_present = xen_set_pte_at,
-       .pte_clear = xen_pte_clear,
-       .pmd_clear = xen_pmd_clear,
-#endif /* CONFIG_X86_PAE */
-       .set_pud = xen_set_pud_hyper,
-
-       .make_pmd = xen_make_pmd,
-       .pmd_val = xen_pmd_val,
-
-#if PAGETABLE_LEVELS == 4
-       .pud_val = xen_pud_val,
-       .make_pud = xen_make_pud,
-       .set_pgd = xen_set_pgd_hyper,
-
-       .alloc_pud = xen_alloc_pte_init,
-       .release_pud = xen_release_pte_init,
-#endif /* PAGETABLE_LEVELS == 4 */
-
-       .activate_mm = xen_activate_mm,
-       .dup_mmap = xen_dup_mmap,
-       .exit_mmap = xen_exit_mmap,
-
-       .lazy_mode = {
-               .enter = paravirt_enter_lazy_mmu,
-               .leave = xen_leave_lazy,
-       },
-
-       .set_fixmap = xen_set_fixmap,
-};
-
 static void xen_reboot(int reason)
 {
        struct sched_shutdown r = { .reason = reason };
@@ -1394,223 +869,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 };
 
 
-static void __init xen_reserve_top(void)
-{
-#ifdef CONFIG_X86_32
-       unsigned long top = HYPERVISOR_VIRT_START;
-       struct xen_platform_parameters pp;
-
-       if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
-               top = pp.virt_start;
-
-       reserve_top_address(-top);
-#endif /* CONFIG_X86_32 */
-}
-
-/*
- * Like __va(), but returns an address in the kernel mapping (which is
- * all we have until the physical memory mapping has been set up).
- */
-static void *__ka(phys_addr_t paddr)
-{
-#ifdef CONFIG_X86_64
-       return (void *)(paddr + __START_KERNEL_map);
-#else
-       return __va(paddr);
-#endif
-}
-
-/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
-{
-       phys_addr_t paddr;
-
-       maddr &= PTE_PFN_MASK;
-       paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
-
-       return paddr;
-}
-
-/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
-{
-       return __ka(m2p(maddr));
-}
-
-static void set_page_prot(void *addr, pgprot_t prot)
-{
-       unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
-       pte_t pte = pfn_pte(pfn, prot);
-
-       if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
-               BUG();
-}
-
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
-       unsigned pmdidx, pteidx;
-       unsigned ident_pte;
-       unsigned long pfn;
-
-       ident_pte = 0;
-       pfn = 0;
-       for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
-               pte_t *pte_page;
-
-               /* Reuse or allocate a page of ptes */
-               if (pmd_present(pmd[pmdidx]))
-                       pte_page = m2v(pmd[pmdidx].pmd);
-               else {
-                       /* Check for free pte pages */
-                       if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
-                               break;
-
-                       pte_page = &level1_ident_pgt[ident_pte];
-                       ident_pte += PTRS_PER_PTE;
-
-                       pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
-               }
-
-               /* Install mappings */
-               for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
-                       pte_t pte;
-
-                       if (pfn > max_pfn_mapped)
-                               max_pfn_mapped = pfn;
-
-                       if (!pte_none(pte_page[pteidx]))
-                               continue;
-
-                       pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
-                       pte_page[pteidx] = pte;
-               }
-       }
-
-       for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
-               set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-
-       set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-
-#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
-{
-       pte_t *pte = v;
-       int i;
-
-       /* All levels are converted the same way, so just treat them
-          as ptes. */
-       for (i = 0; i < PTRS_PER_PTE; i++)
-               pte[i] = xen_make_pte(pte[i].pte);
-}
-
-/*
- * Set up the initial kernel pagetable.
- *
- * We can construct this by grafting the Xen provided pagetable into
- * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working.  We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- */
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-                                               unsigned long max_pfn)
-{
-       pud_t *l3;
-       pmd_t *l2;
-
-       /* Zap identity mapping */
-       init_level4_pgt[0] = __pgd(0);
-
-       /* Pre-constructed entries are in pfn, so convert to mfn */
-       convert_pfn_mfn(init_level4_pgt);
-       convert_pfn_mfn(level3_ident_pgt);
-       convert_pfn_mfn(level3_kernel_pgt);
-
-       l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
-       l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
-
-       memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-       memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
-       l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
-       l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
-       memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
-       /* Set up identity map */
-       xen_map_identity_early(level2_ident_pgt, max_pfn);
-
-       /* Make pagetable pieces RO */
-       set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
-       set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
-       set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
-       set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
-       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-       set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-
-       /* Pin down new L4 */
-       pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
-                         PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
-       /* Unpin Xen-provided one */
-       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-       /* Switch over */
-       pgd = init_level4_pgt;
-
-       /*
-        * At this stage there can be no user pgd, and no page
-        * structure to attach it to, so make sure we just set kernel
-        * pgd.
-        */
-       xen_mc_batch();
-       __xen_write_cr3(true, __pa(pgd));
-       xen_mc_issue(PARAVIRT_LAZY_CPU);
-
-       reserve_early(__pa(xen_start_info->pt_base),
-                     __pa(xen_start_info->pt_base +
-                          xen_start_info->nr_pt_frames * PAGE_SIZE),
-                     "XEN PAGETABLES");
-
-       return pgd;
-}
-#else  /* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-                                               unsigned long max_pfn)
-{
-       pmd_t *kernel_pmd;
-
-       init_pg_tables_start = __pa(pgd);
-       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-       max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
-
-       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-       memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-
-       xen_map_identity_early(level2_kernel_pgt, max_pfn);
-
-       memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
-       set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
-                       __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-
-       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-       set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
-       set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
-
-       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-       xen_write_cr3(__pa(swapper_pg_dir));
-
-       pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
-
-       return swapper_pg_dir;
-}
-#endif /* CONFIG_X86_64 */
-
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1650,10 +908,18 @@ asmlinkage void __init xen_start_kernel(void)
        machine_ops = xen_machine_ops;
 
 #ifdef CONFIG_X86_64
-       /* Disable until direct per-cpu data access. */
-       have_vcpu_info_placement = 0;
-       x86_64_init_pda();
+       /*
+        * Setup percpu state.  We only need to do this for 64-bit
+        * because 32-bit already has %fs set properly.
+        */
+       load_percpu_segment(0);
 #endif
+       /*
+        * The only reliable way to retain the initial address of the
+        * percpu gdt_page is to remember it here, so we can go and
+        * mark it RW later, when the initial percpu area is freed.
+        */
+       xen_initial_gdt = &per_cpu(gdt_page, 0);
 
        xen_smp_init();
 
index bb04260..5a07090 100644 (file)
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
        struct vcpu_info *vcpu;
        unsigned long flags;
 
-       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu = percpu_read(xen_vcpu);
 
        /* flag has opposite sense of mask */
        flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
        */
        return (-flags) & X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
 
 static void xen_restore_fl(unsigned long flags)
 {
@@ -62,7 +63,7 @@ static void xen_restore_fl(unsigned long flags)
           make sure we don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
-       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu = percpu_read(xen_vcpu);
        vcpu->evtchn_upcall_mask = flags;
        preempt_enable_no_resched();
 
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
                        xen_force_evtchn_callback();
        }
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
 
 static void xen_irq_disable(void)
 {
@@ -83,9 +85,10 @@ static void xen_irq_disable(void)
           make sure we don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
-       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+       percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
        preempt_enable_no_resched();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
 
 static void xen_irq_enable(void)
 {
@@ -96,7 +99,7 @@ static void xen_irq_enable(void)
           the caller is confused and is trying to re-enable interrupts
           on an indeterminate processor. */
 
-       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu = percpu_read(xen_vcpu);
        vcpu->evtchn_upcall_mask = 0;
 
        /* Doesn't matter if we get preempted here, because any
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
        if (unlikely(vcpu->evtchn_upcall_pending))
                xen_force_evtchn_callback();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
 
 static void xen_safe_halt(void)
 {
@@ -124,10 +128,12 @@ static void xen_halt(void)
 
 static const struct pv_irq_ops xen_irq_ops __initdata = {
        .init_IRQ = __xen_init_IRQ,
-       .save_fl = xen_save_fl,
-       .restore_fl = xen_restore_fl,
-       .irq_disable = xen_irq_disable,
-       .irq_enable = xen_irq_enable,
+
+       .save_fl = PV_CALLEE_SAVE(xen_save_fl),
+       .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+       .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+       .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
        .safe_halt = xen_safe_halt,
        .halt = xen_halt,
 #ifdef CONFIG_X86_64
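
The PV_CALLEE_SAVE() wrappers above install, for each irq op, a thunk generated by PV_CALLEE_SAVE_REGS_THUNK() that preserves the caller-clobbered registers around the C handler, so pvops call sites only have to assume a minimal clobber set. A much simplified, purely illustrative C sketch of the wrapping pattern follows; the real thunks are emitted in assembly, and every name below is a local stand-in rather than the kernel's macro:

/* Conceptual analogue of PV_CALLEE_SAVE(): each op slot stores a thunk
 * generated from the real handler.  In the kernel the thunk is assembly
 * that saves and restores the caller-clobbered registers around the call;
 * here it is plain C, so only the indirection pattern is shown. */
#include <stdio.h>

typedef unsigned long (*fl_thunk_t)(void);

struct callee_save {
        fl_thunk_t thunk;        /* what call sites actually invoke */
};

static unsigned long my_save_fl(void)
{
        return 0x200;            /* pretend interrupts are enabled */
}

/* Generate a wrapper for func; the real macro emits a register-saving thunk. */
#define CALLEE_SAVE_THUNK(func)                                 \
        static unsigned long func##_thunk(void)                \
        {                                                       \
                return func();                                  \
        }

#define CALLEE_SAVE(func)  ((struct callee_save){ .thunk = func##_thunk })

CALLEE_SAVE_THUNK(my_save_fl)

struct irq_ops {
        struct callee_save save_fl;
};

static struct irq_ops ops;

int main(void)
{
        ops.save_fl = CALLEE_SAVE(my_save_fl);
        printf("flags = %#lx\n", ops.save_fl.thunk());
        return 0;
}
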
index 503c240..d2e8ed1 100644 (file)
@@ -47,6 +47,7 @@
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 #include <asm/mmu_context.h>
+#include <asm/setup.h>
 #include <asm/paravirt.h>
 #include <asm/linkage.h>
 
@@ -55,6 +56,8 @@
 
 #include <xen/page.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/hvc-console.h>
 
 #include "multicalls.h"
 #include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
 
 #endif /* CONFIG_XEN_DEBUG_FS */
 
+
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);         /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);         /* actual vcpu cr3 */
+
+
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
  * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
 {
        return pte_mfn_to_pfn(pte.pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 
 pgdval_t xen_pgd_val(pgd_t pgd)
 {
        return pte_mfn_to_pfn(pgd.pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 
 pte_t xen_make_pte(pteval_t pte)
 {
        pte = pte_pfn_to_mfn(pte);
        return native_make_pte(pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
 pgd_t xen_make_pgd(pgdval_t pgd)
 {
        pgd = pte_pfn_to_mfn(pgd);
        return native_make_pgd(pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 
 pmdval_t xen_pmd_val(pmd_t pmd)
 {
        return pte_mfn_to_pfn(pmd.pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 
 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 {
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
        pmd = pte_pfn_to_mfn(pmd);
        return native_make_pmd(pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
 #if PAGETABLE_LEVELS == 4
 pudval_t xen_pud_val(pud_t pud)
 {
        return pte_mfn_to_pfn(pud.pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 
 pud_t xen_make_pud(pudval_t pud)
 {
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
 
        return native_make_pud(pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 
 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 {
@@ -1063,18 +1105,14 @@ static void drop_other_mm_ref(void *info)
        struct mm_struct *mm = info;
        struct mm_struct *active_mm;
 
-#ifdef CONFIG_X86_64
-       active_mm = read_pda(active_mm);
-#else
-       active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+       active_mm = percpu_read(cpu_tlbstate.active_mm);
 
        if (active_mm == mm)
                leave_mm(smp_processor_id());
 
        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
-       if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+       if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
                load_cr3(swapper_pg_dir);
                arch_flush_lazy_cpu_mode();
        }
@@ -1156,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
        spin_unlock(&mm->page_table_lock);
 }
 
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+       xen_setup_shared_info();
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+       percpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+       return percpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+       return percpu_read(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_flush_tlb(void)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs;
+
+       preempt_disable();
+
+       mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+       preempt_enable();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs;
+
+       preempt_disable();
+
+       mcs = xen_mc_entry(sizeof(*op));
+       op = mcs.args;
+       op->cmd = MMUEXT_INVLPG_LOCAL;
+       op->arg1.linear_addr = addr & PAGE_MASK;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+       preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+                                struct mm_struct *mm, unsigned long va)
+{
+       struct {
+               struct mmuext_op op;
+               DECLARE_BITMAP(mask, NR_CPUS);
+       } *args;
+       struct multicall_space mcs;
+
+       BUG_ON(cpumask_empty(cpus));
+       BUG_ON(!mm);
+
+       mcs = xen_mc_entry(sizeof(*args));
+       args = mcs.args;
+       args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+       /* Remove us, and any offline CPUS. */
+       cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+       cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+       if (unlikely(cpumask_empty(to_cpumask(args->mask))))
+               goto issue;
+
+       if (va == TLB_FLUSH_ALL) {
+               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+       } else {
+               args->op.cmd = MMUEXT_INVLPG_MULTI;
+               args->op.arg1.linear_addr = va;
+       }
+
+       MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+issue:
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+       return percpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+       percpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs;
+       unsigned long mfn;
+
+       if (cr3)
+               mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       else
+               mfn = 0;
+
+       WARN_ON(mfn == 0 && kernel);
+
+       mcs = __xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+       op->arg1.mfn = mfn;
+
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       if (kernel) {
+               percpu_write(xen_cr3, cr3);
+
+               /* Update xen_current_cr3 once the batch has actually
+                  been submitted. */
+               xen_mc_callback(set_current_cr3, (void *)cr3);
+       }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       xen_mc_batch();  /* disables interrupts */
+
+       /* Update while interrupts are disabled, so it's atomic with
+          respect to ipis */
+       percpu_write(xen_cr3, cr3);
+
+       __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+               if (user_pgd)
+                       __xen_write_cr3(false, __pa(user_pgd));
+               else
+                       __xen_write_cr3(false, 0);
+       }
+#endif
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
+}
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+       pgd_t *pgd = mm->pgd;
+       int ret = 0;
+
+       BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+       {
+               struct page *page = virt_to_page(pgd);
+               pgd_t *user_pgd;
+
+               BUG_ON(page->private != 0);
+
+               ret = -ENOMEM;
+
+               user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+               page->private = (unsigned long)user_pgd;
+
+               if (user_pgd != NULL) {
+                       user_pgd[pgd_index(VSYSCALL_START)] =
+                               __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+                       ret = 0;
+               }
+
+               BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+       }
+#endif
+
+       return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+       pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+       if (user_pgd)
+               free_page((unsigned long)user_pgd);
+#endif
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+       pgprot_t prot = PAGE_KERNEL;
+
+       if (PagePinned(page))
+               prot = PAGE_KERNEL_RO;
+
+       if (0 && PageHighMem(page))
+               printk("mapping highpte %lx type %d prot %s\n",
+                      page_to_pfn(page), type,
+                      (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+       return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+       /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+       if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+               pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+                              pte_val_ma(pte));
+
+       return pte;
+}
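The masking expression above is terse; a worked expansion of the two cases, as an illustrative comment (not part of the patch):

/*
 * Worked expansion of mask_rw_pte()'s mask (illustration only):
 *   existing pte has _PAGE_RW set:   (RW | ~RW) = ~0   -> new pte unchanged
 *   existing pte has _PAGE_RW clear: ( 0 | ~RW) = ~RW  -> RW stripped from new pte
 * i.e. a new mapping may only be writable if the existing one already was.
 */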
+
+/* Init-time set_pte while constructing initial pagetables, which
+   doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+       pte = mask_rw_pte(ptep, pte);
+
+       xen_set_pte(ptep, pte);
+}
+#endif
+
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+       BUG_ON(mem_map);        /* should only be used early */
+#endif
+       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+   only init_mm and anything attached to that is pinned. */
+static void xen_release_pte_init(unsigned long pfn)
+{
+       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+       struct mmuext_op op;
+       op.cmd = cmd;
+       op.arg1.mfn = pfn_to_mfn(pfn);
+       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+               BUG();
+}
+
+/* This needs to make sure the new pte page is pinned iff it's being
+   attached to a pinned pagetable. */
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+{
+       struct page *page = pfn_to_page(pfn);
+
+       if (PagePinned(virt_to_page(mm->pgd))) {
+               SetPagePinned(page);
+
+               vm_unmap_aliases();
+               if (!PageHighMem(page)) {
+                       make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                               pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+               } else {
+                       /* make sure there are no stray mappings of
+                          this page */
+                       kmap_flush_unused();
+               }
+       }
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+       xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+       xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+       struct page *page = pfn_to_page(pfn);
+
+       if (PagePinned(page)) {
+               if (!PageHighMem(page)) {
+                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+                       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+               }
+               ClearPagePinned(page);
+       }
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+       xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+       xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+       xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+       xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+       unsigned long top = HYPERVISOR_VIRT_START;
+       struct xen_platform_parameters pp;
+
+       if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+               top = pp.virt_start;
+
+       reserve_top_address(-top);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up).
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+       return (void *)(paddr + __START_KERNEL_map);
+#else
+       return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+       phys_addr_t paddr;
+
+       maddr &= PTE_PFN_MASK;
+       paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+       return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+       return __ka(m2p(maddr));
+}
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+       unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+       pte_t pte = pfn_pte(pfn, prot);
+
+       if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+               BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+       unsigned pmdidx, pteidx;
+       unsigned ident_pte;
+       unsigned long pfn;
+
+       ident_pte = 0;
+       pfn = 0;
+       for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+               pte_t *pte_page;
+
+               /* Reuse or allocate a page of ptes */
+               if (pmd_present(pmd[pmdidx]))
+                       pte_page = m2v(pmd[pmdidx].pmd);
+               else {
+                       /* Check for free pte pages */
+                       if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+                               break;
+
+                       pte_page = &level1_ident_pgt[ident_pte];
+                       ident_pte += PTRS_PER_PTE;
+
+                       pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+               }
+
+               /* Install mappings */
+               for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+                       pte_t pte;
+
+                       if (pfn > max_pfn_mapped)
+                               max_pfn_mapped = pfn;
+
+                       if (!pte_none(pte_page[pteidx]))
+                               continue;
+
+                       pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+                       pte_page[pteidx] = pte;
+               }
+       }
+
+       for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+               set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+       set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+       pte_t *pte = v;
+       int i;
+
+       /* All levels are converted the same way, so just treat them
+          as ptes. */
+       for (i = 0; i < PTRS_PER_PTE; i++)
+               pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+                                        unsigned long max_pfn)
+{
+       pud_t *l3;
+       pmd_t *l2;
+
+       /* Zap identity mapping */
+       init_level4_pgt[0] = __pgd(0);
+
+       /* Pre-constructed entries are in pfn, so convert to mfn */
+       convert_pfn_mfn(init_level4_pgt);
+       convert_pfn_mfn(level3_ident_pgt);
+       convert_pfn_mfn(level3_kernel_pgt);
+
+       l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+       l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+       memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+       memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+       l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+       memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       /* Set up identity map */
+       xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+       /* Make pagetable pieces RO */
+       set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+       /* Pin down new L4 */
+       pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+                         PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+       /* Unpin Xen-provided one */
+       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+       /* Switch over */
+       pgd = init_level4_pgt;
+
+       /*
+        * At this stage there can be no user pgd, and no page
+        * structure to attach it to, so make sure we just set kernel
+        * pgd.
+        */
+       xen_mc_batch();
+       __xen_write_cr3(true, __pa(pgd));
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+       reserve_early(__pa(xen_start_info->pt_base),
+                     __pa(xen_start_info->pt_base +
+                          xen_start_info->nr_pt_frames * PAGE_SIZE),
+                     "XEN PAGETABLES");
+
+       return pgd;
+}
+#else  /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+                                        unsigned long max_pfn)
+{
+       pmd_t *kernel_pmd;
+
+       init_pg_tables_start = __pa(pgd);
+       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+       max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+       memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+       xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+       memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+       set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+                       __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+       set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+       set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+       xen_write_cr3(__pa(swapper_pg_dir));
+
+       pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+       return swapper_pg_dir;
+}
+#endif /* CONFIG_X86_64 */
+
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+       pte_t pte;
+
+       phys >>= PAGE_SHIFT;
+
+       switch (idx) {
+       case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+       case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+       case FIX_WP_TEST:
+       case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+       case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+       case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+       case FIX_APIC_BASE:     /* maps dummy local APIC */
+#endif
+               pte = pfn_pte(phys, prot);
+               break;
+
+       default:
+               pte = mfn_pte(phys, prot);
+               break;
+       }
+
+       __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+       /* Replicate changes to map the vsyscall page into the user
+          pagetable vsyscall mapping. */
+       if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+               unsigned long vaddr = __fix_to_virt(idx);
+               set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+       }
+#endif
+}
+
+__init void xen_post_allocator_init(void)
+{
+       pv_mmu_ops.set_pte = xen_set_pte;
+       pv_mmu_ops.set_pmd = xen_set_pmd;
+       pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+       pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+       /* This will work as long as patching hasn't happened yet
+          (which it hasn't) */
+       pv_mmu_ops.alloc_pte = xen_alloc_pte;
+       pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+       pv_mmu_ops.release_pte = xen_release_pte;
+       pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+       pv_mmu_ops.alloc_pud = xen_alloc_pud;
+       pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+       SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+       xen_mark_init_mm_pinned();
+}
+
+
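The pv_mmu_ops table below deliberately installs the init-time variants (xen_alloc_pte_init, xen_release_pte_init, and on 32-bit xen_set_pte_init); xen_post_allocator_init() above then swaps in the final hooks (xen_alloc_pte, xen_release_pte, xen_set_pte and friends) once the kernel's allocator is usable.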
+const struct pv_mmu_ops xen_mmu_ops __initdata = {
+       .pagetable_setup_start = xen_pagetable_setup_start,
+       .pagetable_setup_done = xen_pagetable_setup_done,
+
+       .read_cr2 = xen_read_cr2,
+       .write_cr2 = xen_write_cr2,
+
+       .read_cr3 = xen_read_cr3,
+       .write_cr3 = xen_write_cr3,
+
+       .flush_tlb_user = xen_flush_tlb,
+       .flush_tlb_kernel = xen_flush_tlb,
+       .flush_tlb_single = xen_flush_tlb_single,
+       .flush_tlb_others = xen_flush_tlb_others,
+
+       .pte_update = paravirt_nop,
+       .pte_update_defer = paravirt_nop,
+
+       .pgd_alloc = xen_pgd_alloc,
+       .pgd_free = xen_pgd_free,
+
+       .alloc_pte = xen_alloc_pte_init,
+       .release_pte = xen_release_pte_init,
+       .alloc_pmd = xen_alloc_pte_init,
+       .alloc_pmd_clone = paravirt_nop,
+       .release_pmd = xen_release_pte_init,
+
+#ifdef CONFIG_HIGHPTE
+       .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+#ifdef CONFIG_X86_64
+       .set_pte = xen_set_pte,
+#else
+       .set_pte = xen_set_pte_init,
+#endif
+       .set_pte_at = xen_set_pte_at,
+       .set_pmd = xen_set_pmd_hyper,
+
+       .ptep_modify_prot_start = __ptep_modify_prot_start,
+       .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+       .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+       .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+       .make_pte = PV_CALLEE_SAVE(xen_make_pte),
+       .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+       .set_pte_atomic = xen_set_pte_atomic,
+       .set_pte_present = xen_set_pte_at,
+       .pte_clear = xen_pte_clear,
+       .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+       .set_pud = xen_set_pud_hyper,
+
+       .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+       .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#if PAGETABLE_LEVELS == 4
+       .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+       .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+       .set_pgd = xen_set_pgd_hyper,
+
+       .alloc_pud = xen_alloc_pte_init,
+       .release_pud = xen_release_pte_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
+       .activate_mm = xen_activate_mm,
+       .dup_mmap = xen_dup_mmap,
+       .exit_mmap = xen_exit_mmap,
+
+       .lazy_mode = {
+               .enter = paravirt_enter_lazy_mmu,
+               .leave = xen_leave_lazy,
+       },
+
+       .set_fixmap = xen_set_fixmap,
+};
+
+
 #ifdef CONFIG_XEN_DEBUG_FS
 
 static struct dentry *d_mmu_debug;
index 98d7165..24d1b44 100644 (file)
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
                                  pte_t *ptep, pte_t pte);
 
+unsigned long xen_read_cr2_direct(void);
+
+extern const struct pv_mmu_ops xen_mmu_ops;
 #endif /* _XEN_MMU_H */
index fa3e107..9e565da 100644 (file)
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
                xen_mc_flush();
 
        /* restore flags saved in xen_mc_batch */
-       local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+       local_irq_restore(percpu_read(xen_mc_irq_flags));
 }
 
 /* Set up a callback to be called when the current batch is flushed */
index c44e206..035582a 100644 (file)
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
  */
 static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 {
-#ifdef CONFIG_X86_32
-       __get_cpu_var(irq_stat).irq_resched_count++;
-#else
-       add_pda(irq_resched_count, 1);
-#endif
+       inc_irq_stat(irq_resched_count);
 
        return IRQ_HANDLED;
 }
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
        xen_setup_cpu_clockevents();
 
        cpu_set(cpu, cpu_online_map);
-       x86_write_percpu(cpu_state, CPU_ONLINE);
+       percpu_write(cpu_state, CPU_ONLINE);
        wmb();
 
        /* We can take interrupts now: we're officially "up". */
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
 
        /* We've switched to the "real" per-cpu gdt, so make sure the
           old memory can be recycled */
-       make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
+       make_lowmem_page_readwrite(xen_initial_gdt);
 
        xen_setup_vcpu_info_placement();
 }
@@ -239,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->user_regs.ss = __KERNEL_DS;
 #ifdef CONFIG_X86_32
        ctxt->user_regs.fs = __KERNEL_PERCPU;
+#else
+       ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
        ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
        ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -283,23 +281,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
        struct task_struct *idle = idle_task(cpu);
        int rc;
 
-#ifdef CONFIG_X86_64
-       /* Allocate node local memory for AP pdas */
-       WARN_ON(cpu == 0);
-       if (cpu > 0) {
-               rc = get_local_pda(cpu);
-               if (rc)
-                       return rc;
-       }
-#endif
-
-#ifdef CONFIG_X86_32
-       init_gdt(cpu);
        per_cpu(current_task, cpu) = idle;
+#ifdef CONFIG_X86_32
        irq_ctx_init(cpu);
 #else
-       cpu_pda(cpu)->pcurrent = idle;
        clear_tsk_thread_flag(idle, TIF_FORK);
+       per_cpu(kernel_stack, cpu) =
+               (unsigned long)task_stack_page(idle) -
+               KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
        xen_setup_timer(cpu);
        xen_init_lock_cpu(cpu);
@@ -445,11 +434,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
        irq_enter();
        generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
-       __get_cpu_var(irq_stat).irq_call_count++;
-#else
-       add_pda(irq_call_count, 1);
-#endif
+       inc_irq_stat(irq_call_count);
        irq_exit();
 
        return IRQ_HANDLED;
@@ -459,11 +444,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
        irq_enter();
        generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
-       __get_cpu_var(irq_stat).irq_call_count++;
-#else
-       add_pda(irq_call_count, 1);
-#endif
+       inc_irq_stat(irq_call_count);
        irq_exit();
 
        return IRQ_HANDLED;
index 212ffe0..95be7b4 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
+#include <asm/fixmap.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644 (file)
index 0000000..4c6f967
--- /dev/null
@@ -0,0 +1,140 @@
+/*
+       Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+       The inline versions are the same as the direct-use versions, with the
+       pre- and post-amble chopped off.
+
+       This code is encoded for size rather than absolute efficiency,
+       with a view to being able to inline as much as possible.
+
+       We only bother with direct forms (ie, vcpu in percpu data) of
+       the operations here; the indirect forms are better handled in
+       C, since they're generally too large to inline anyway.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+
+#include "xen-asm.h"
+
+/*
+       Enable events.  This clears the event mask and tests the pending
+       event status with a single "and" operation.  If there are pending
+       events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+       /* Unmask events */
+       movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* Test for pending */
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+       jz 1f
+
+2:     call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+       ret
+       ENDPROC(xen_irq_enable_direct)
+       RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+       Disabling events is simply a matter of making the event mask
+       non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+       movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+       ret
+       ENDPROC(xen_irq_disable_direct)
+       RELOC(xen_irq_disable_direct, 0)
+
+/*
+       (xen_)save_fl is used to get the current interrupt enable status.
+       Callers expect the status to be in X86_EFLAGS_IF, and other bits
+       may be set in the return value.  We take advantage of this by
+       making sure that X86_EFLAGS_IF has the right value (and other bits
+       in that byte are 0), but other bits in the return value are
+       undefined.  We need to toggle the state of the bit, because
+       Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+       testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+       setz %ah
+       addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+       ret
+       ENDPROC(xen_save_fl_direct)
+       RELOC(xen_save_fl_direct, 0)
+
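For readers not following the testb/setz/addb trick above, a C-level sketch of the same semantics (illustration only; the function name is made up, and this assumes the per-cpu xen_vcpu_info declared in xen-ops.h):

/* Illustration only: not part of the patch. */
static unsigned long xen_save_fl_sketch(void)
{
	u8 masked = percpu_read(xen_vcpu_info.evtchn_upcall_mask);

	/* Xen's event mask has the opposite sense of EFLAGS.IF; only the
	   IF bit of the result is meaningful to callers */
	return masked ? 0 : X86_EFLAGS_IF;
}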
+
+/*
+       In principle the caller should be passing us a value returned
+       from xen_save_fl_direct, but for robustness' sake we test only
+       the X86_EFLAGS_IF flag rather than the whole byte. After
+       setting the interrupt mask state, it checks for unmasked
+       pending events and enters the hypervisor to get them delivered
+       if so.
+ */
+ENTRY(xen_restore_fl_direct)
+#ifdef CONFIG_X86_64
+       testw $X86_EFLAGS_IF, %di
+#else
+       testb $X86_EFLAGS_IF>>8, %ah
+#endif
+       setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+       /* Preempt here doesn't matter because that will deal with
+          any pending interrupts.  The pending check may end up being
+          run on the wrong CPU, but that doesn't hurt. */
+
+       /* check for unmasked and pending */
+       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+       jz 1f
+2:     call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+       ret
+       ENDPROC(xen_restore_fl_direct)
+       RELOC(xen_restore_fl_direct, 2b+1)
+
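On the #ifdef in xen_restore_fl_direct above: with the 64-bit calling convention the flags argument arrives in %rdi, so the IF bit (bit 9) is tested as a word in %di; the 32-bit direct ops take their argument in %eax, so the byte containing IF is tested in %ah with the constant shifted down to match.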
+
+/*
+       Force an event check by making a hypercall,
+       but preserve regs before making the call.
+ */
+check_events:
+#ifdef CONFIG_X86_32
+       push %eax
+       push %ecx
+       push %edx
+       call xen_force_evtchn_callback
+       pop %edx
+       pop %ecx
+       pop %eax
+#else
+       push %rax
+       push %rcx
+       push %rdx
+       push %rsi
+       push %rdi
+       push %r8
+       push %r9
+       push %r10
+       push %r11
+       call xen_force_evtchn_callback
+       pop %r11
+       pop %r10
+       pop %r9
+       pop %r8
+       pop %rdi
+       pop %rsi
+       pop %rdx
+       pop %rcx
+       pop %rax
+#endif
+       ret
+
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644 (file)
index 0000000..4652764
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef _XEN_XEN_ASM_H
+#define _XEN_XEN_ASM_H
+
+#include <linux/linkage.h>
+
+#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)    .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#endif
index 42786f5..082d173 100644 (file)
        generally too large to inline anyway.
  */
 
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
+//#include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
 
 #include <xen/interface/xen.h>
 
-#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)    .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
-       Enable events.  This clears the event mask and tests the pending
-       event status with one and operation.  If there are pending
-       events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-       /* Unmask events */
-       movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* Test for pending */
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-       jz 1f
-
-2:     call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-       ret
-       ENDPROC(xen_irq_enable_direct)
-       RELOC(xen_irq_enable_direct, 2b+1)
-
+#include "xen-asm.h"
 
 /*
-       Disabling events is simply a matter of making the event mask
-       non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-       movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-       ret
-       ENDPROC(xen_irq_disable_direct)
-       RELOC(xen_irq_disable_direct, 0)
-
-/*
-       (xen_)save_fl is used to get the current interrupt enable status.
-       Callers expect the status to be in X86_EFLAGS_IF, and other bits
-       may be set in the return value.  We take advantage of this by
-       making sure that X86_EFLAGS_IF has the right value (and other bits
-       in that byte are 0), but other bits in the return value are
-       undefined.  We need to toggle the state of the bit, because
-       Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-       setz %ah
-       addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-       ret
-       ENDPROC(xen_save_fl_direct)
-       RELOC(xen_save_fl_direct, 0)
-
-
-/*
-       In principle the caller should be passing us a value return
-       from xen_save_fl_direct, but for robustness sake we test only
-       the X86_EFLAGS_IF flag rather than the whole byte. After
-       setting the interrupt mask state, it checks for unmasked
-       pending events and enters the hypervisor to get them delivered
-       if so.
+       Force an event check by making a hypercall,
+       but preserve regs before making the call.
  */
-ENTRY(xen_restore_fl_direct)
-       testb $X86_EFLAGS_IF>>8, %ah
-       setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* check for unmasked and pending */
-       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-       jz 1f
-2:     call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
+check_events:
+       push %eax
+       push %ecx
+       push %edx
+       call xen_force_evtchn_callback
+       pop %edx
+       pop %ecx
+       pop %eax
        ret
-       ENDPROC(xen_restore_fl_direct)
-       RELOC(xen_restore_fl_direct, 2b+1)
 
 /*
        We can't use sysexit directly, because we're not running in ring0.
@@ -289,17 +216,3 @@ ENTRY(xen_iret_crit_fixup)
        lea 4(%edi),%esp                /* point esp to new frame */
 2:     jmp xen_do_upcall
 
-
-/*
-       Force an event check by making a hypercall,
-       but preserve regs before making the call.
- */
-check_events:
-       push %eax
-       push %ecx
-       push %edx
-       call xen_force_evtchn_callback
-       pop %edx
-       pop %ecx
-       pop %eax
-       ret
index 05794c5..d205a28 100644 (file)
        generally too large to inline anyway.
  */
 
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/processor-flags.h>
 #include <asm/errno.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
 #include <asm/segment.h>
 
 #include <xen/interface/xen.h>
 
-#define RELOC(x, v)    .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)    .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-#if 1
-/*
-       x86-64 does not yet support direct access to percpu variables
-       via a segment override, so we just need to make sure this code
-       never gets used
- */
-#define BUG                    ud2a
-#define PER_CPU_VAR(var, off)  0xdeadbeef
-#endif
-
-/*
-       Enable events.  This clears the event mask and tests the pending
-       event status with one and operation.  If there are pending
-       events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-       BUG
-
-       /* Unmask events */
-       movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* Test for pending */
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
-       jz 1f
-
-2:     call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-       ret
-       ENDPROC(xen_irq_enable_direct)
-       RELOC(xen_irq_enable_direct, 2b+1)
-
-/*
-       Disabling events is simply a matter of making the event mask
-       non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-       BUG
-
-       movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-ENDPATCH(xen_irq_disable_direct)
-       ret
-       ENDPROC(xen_irq_disable_direct)
-       RELOC(xen_irq_disable_direct, 0)
-
-/*
-       (xen_)save_fl is used to get the current interrupt enable status.
-       Callers expect the status to be in X86_EFLAGS_IF, and other bits
-       may be set in the return value.  We take advantage of this by
-       making sure that X86_EFLAGS_IF has the right value (and other bits
-       in that byte are 0), but other bits in the return value are
-       undefined.  We need to toggle the state of the bit, because
-       Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-       BUG
-
-       testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-       setz %ah
-       addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-       ret
-       ENDPROC(xen_save_fl_direct)
-       RELOC(xen_save_fl_direct, 0)
-
-/*
-       In principle the caller should be passing us a value return
-       from xen_save_fl_direct, but for robustness sake we test only
-       the X86_EFLAGS_IF flag rather than the whole byte. After
-       setting the interrupt mask state, it checks for unmasked
-       pending events and enters the hypervisor to get them delivered
-       if so.
- */
-ENTRY(xen_restore_fl_direct)
-       BUG
-
-       testb $X86_EFLAGS_IF>>8, %ah
-       setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-       /* Preempt here doesn't matter because that will deal with
-          any pending interrupts.  The pending check may end up being
-          run on the wrong CPU, but that doesn't hurt. */
-
-       /* check for unmasked and pending */
-       cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
-       jz 1f
-2:     call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
-       ret
-       ENDPROC(xen_restore_fl_direct)
-       RELOC(xen_restore_fl_direct, 2b+1)
-
-
-/*
-       Force an event check by making a hypercall,
-       but preserve regs before making the call.
- */
-check_events:
-       push %rax
-       push %rcx
-       push %rdx
-       push %rsi
-       push %rdi
-       push %r8
-       push %r9
-       push %r10
-       push %r11
-       call xen_force_evtchn_callback
-       pop %r11
-       pop %r10
-       pop %r9
-       pop %r8
-       pop %rdi
-       pop %rsi
-       pop %rdx
-       pop %rcx
-       pop %rax
-       ret
+#include "xen-asm.h"
 
 ENTRY(xen_adjust_exception_frame)
        mov 8+0(%rsp),%rcx
@@ -195,11 +66,11 @@ RELOC(xen_sysexit, 1b+1)
 ENTRY(xen_sysret64)
        /* We're already on the usermode stack at this point, but still
           with the kernel gs, so we can easily switch back */
-       movq %rsp, %gs:pda_oldrsp
-       movq %gs:pda_kernelstack,%rsp
+       movq %rsp, PER_CPU_VAR(old_rsp)
+       movq PER_CPU_VAR(kernel_stack),%rsp
 
        pushq $__USER_DS
-       pushq %gs:pda_oldrsp
+       pushq PER_CPU_VAR(old_rsp)
        pushq %r11
        pushq $__USER_CS
        pushq %rcx
@@ -212,11 +83,11 @@ RELOC(xen_sysret64, 1b+1)
 ENTRY(xen_sysret32)
        /* We're already on the usermode stack at this point, but still
           with the kernel gs, so we can easily switch back */
-       movq %rsp, %gs:pda_oldrsp
-       movq %gs:pda_kernelstack, %rsp
+       movq %rsp, PER_CPU_VAR(old_rsp)
+       movq PER_CPU_VAR(kernel_stack), %rsp
 
        pushq $__USER32_DS
-       pushq %gs:pda_oldrsp
+       pushq PER_CPU_VAR(old_rsp)
        pushq %r11
        pushq $__USER32_CS
        pushq %rcx
index c1f8faf..2f5ef26 100644 (file)
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
+extern void *xen_initial_gdt;
+
 struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);
 
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
 
 void xen_setup_mfn_list_list(void);
 void xen_setup_shared_info(void);
+void xen_setup_machphys_mapping(void);
+pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_ident_map_ISA(void);
+void xen_reserve_top(void);
+
+void xen_leave_lazy(void);
+void xen_post_allocator_init(void);
 
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
index 719ee5c..5b257a5 100644 (file)
@@ -107,7 +107,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
 /*
  * Print cpu online, possible, present, and system maps
  */
-static ssize_t print_cpus_map(char *buf, cpumask_t *map)
+static ssize_t print_cpus_map(char *buf, const struct cpumask *map)
 {
        int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
 
index a778fb5..bf6b132 100644 (file)
 #include <linux/hardirq.h>
 #include <linux/topology.h>
 
-#define define_one_ro(_name)           \
+#define define_one_ro_named(_name, _func)                              \
+static SYSDEV_ATTR(_name, 0444, _func, NULL)
+
+#define define_one_ro(_name)                           \
 static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_id_show_func(name)                              \
@@ -42,8 +45,8 @@ static ssize_t show_##name(struct sys_device *dev,            \
        return sprintf(buf, "%d\n", topology_##name(cpu));      \
 }
 
-#if defined(topology_thread_siblings) || defined(topology_core_siblings)
-static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
 {
        ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
        int n = 0;
@@ -65,7 +68,7 @@ static ssize_t show_##name(struct sys_device *dev,                    \
                           struct sysdev_attribute *attr, char *buf)    \
 {                                                                      \
        unsigned int cpu = dev->id;                                     \
-       return show_cpumap(0, &(topology_##name(cpu)), buf);            \
+       return show_cpumap(0, topology_##name(cpu), buf);               \
 }
 
 #define define_siblings_show_list(name)                                        \
@@ -74,7 +77,7 @@ static ssize_t show_##name##_list(struct sys_device *dev,             \
                                  char *buf)                            \
 {                                                                      \
        unsigned int cpu = dev->id;                                     \
-       return show_cpumap(1, &(topology_##name(cpu)), buf);            \
+       return show_cpumap(1, topology_##name(cpu), buf);               \
 }
 
 #else
@@ -82,9 +85,7 @@ static ssize_t show_##name##_list(struct sys_device *dev,             \
 static ssize_t show_##name(struct sys_device *dev,                     \
                           struct sysdev_attribute *attr, char *buf)    \
 {                                                                      \
-       unsigned int cpu = dev->id;                                     \
-       cpumask_t mask = topology_##name(cpu);                          \
-       return show_cpumap(0, &mask, buf);                              \
+       return show_cpumap(0, topology_##name(dev->id), buf);           \
 }
 
 #define define_siblings_show_list(name)                                        \
@@ -92,9 +93,7 @@ static ssize_t show_##name##_list(struct sys_device *dev,             \
                                  struct sysdev_attribute *attr,        \
                                  char *buf)                            \
 {                                                                      \
-       unsigned int cpu = dev->id;                                     \
-       cpumask_t mask = topology_##name(cpu);                          \
-       return show_cpumap(1, &mask, buf);                              \
+       return show_cpumap(1, topology_##name(dev->id), buf);           \
 }
 #endif
 
@@ -107,13 +106,13 @@ define_one_ro(physical_package_id);
 define_id_show_func(core_id);
 define_one_ro(core_id);
 
-define_siblings_show_func(thread_siblings);
-define_one_ro(thread_siblings);
-define_one_ro(thread_siblings_list);
+define_siblings_show_func(thread_cpumask);
+define_one_ro_named(thread_siblings, show_thread_cpumask);
+define_one_ro_named(thread_siblings_list, show_thread_cpumask_list);
 
-define_siblings_show_func(core_siblings);
-define_one_ro(core_siblings);
-define_one_ro(core_siblings_list);
+define_siblings_show_func(core_cpumask);
+define_one_ro_named(core_siblings, show_core_cpumask);
+define_one_ro_named(core_siblings_list, show_core_cpumask_list);
 
 static struct attribute *default_attrs[] = {
        &attr_physical_package_id.attr,
index 777fba4..3009e01 100644 (file)
@@ -244,7 +244,7 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
  */
 int dcdbas_smi_request(struct smi_cmd *smi_cmd)
 {
-       cpumask_t old_mask;
+       cpumask_var_t old_mask;
        int ret = 0;
 
        if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@ -254,8 +254,11 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
        }
 
        /* SMI requires CPU 0 */
-       old_mask = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, &cpumask_of_cpu(0));
+       if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
+               return -ENOMEM;
+
+       cpumask_copy(old_mask, &current->cpus_allowed);
+       set_cpus_allowed_ptr(current, cpumask_of(0));
        if (smp_processor_id() != 0) {
                dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
                        __func__);
@@ -275,7 +278,8 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
        );
 
 out:
-       set_cpus_allowed_ptr(current, &old_mask);
+       set_cpus_allowed_ptr(current, old_mask);
+       free_cpumask_var(old_mask);
        return ret;
 }
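The dcdbas change above is one instance of the cpumask_var_t conversion running through this merge; a minimal sketch of the idiom (illustration only, kernel context assumed, the function name is made up):

/* Illustration only: not part of the patch. */
static int run_on_cpu0_sketch(void (*fn)(void *), void *arg)
{
	cpumask_var_t saved;

	/* a cpumask_t on the stack can be large when NR_CPUS is big,
	   so allocate one instead */
	if (!alloc_cpumask_var(&saved, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(saved, &current->cpus_allowed);
	set_cpus_allowed_ptr(current, cpumask_of(0));

	fn(arg);				/* work that must run on CPU 0 */

	set_cpus_allowed_ptr(current, saved);
	free_cpumask_var(saved);
	return 0;
}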
 
index c64e679..1c48408 100644 (file)
@@ -162,7 +162,7 @@ config ENCLOSURE_SERVICES
 config SGI_XP
        tristate "Support communication between SGI SSIs"
        depends on NET
-       depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
+       depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_UV) && SMP
        select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
        select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
        select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
@@ -189,7 +189,7 @@ config HP_ILO
 
 config SGI_GRU
        tristate "SGI GRU driver"
-       depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
+       depends on (X86_UV || IA64_SGI_UV || IA64_GENERIC) && SMP
        default n
        select MMU_NOTIFIER
        ---help---
index f93f03a..1b5f579 100644 (file)
@@ -19,6 +19,8 @@
 #ifndef __GRU_H__
 #define __GRU_H__
 
+#include <asm/uv/uv.h>
+
 /*
  * GRU architectural definitions
  */
index 7b4cbd5..069ad3a 100644 (file)
@@ -15,6 +15,8 @@
 
 #include <linux/mutex.h>
 
+#include <asm/uv/uv.h>
+
 #ifdef CONFIG_IA64
 #include <asm/system.h>
 #include <asm/sn/arch.h>       /* defines is_shub1() and is_shub2() */
index 89218f7..6576170 100644 (file)
@@ -318,7 +318,7 @@ xpc_hb_checker(void *ignore)
 
        /* this thread was marked active by xpc_hb_init() */
 
-       set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU));
+       set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
 
        /* set our heartbeating to other partitions into motion */
        xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
index ab0e09b..847e9bb 100644 (file)
@@ -854,20 +854,27 @@ static void efx_fini_io(struct efx_nic *efx)
  * interrupts across them. */
 static int efx_wanted_rx_queues(void)
 {
-       cpumask_t core_mask;
+       cpumask_var_t core_mask;
        int count;
        int cpu;
 
-       cpus_clear(core_mask);
+       if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) {
+               printk(KERN_WARNING
+                      "efx.c: allocation failure, irq balancing hobbled\n");
+               return 1;
+       }
+
+       cpumask_clear(core_mask);
        count = 0;
        for_each_online_cpu(cpu) {
-               if (!cpu_isset(cpu, core_mask)) {
+               if (!cpumask_test_cpu(cpu, core_mask)) {
                        ++count;
-                       cpus_or(core_mask, core_mask,
-                               topology_core_siblings(cpu));
+                       cpumask_or(core_mask, core_mask,
+                                  topology_core_cpumask(cpu));
                }
        }
 
+       free_cpumask_var(core_mask);
        return count;
 }
 
index 9da5a4b..c3ea5fa 100644 (file)
@@ -38,7 +38,7 @@
 
 static LIST_HEAD(dying_tasks);
 static LIST_HEAD(dead_tasks);
-static cpumask_t marked_cpus = CPU_MASK_NONE;
+static cpumask_var_t marked_cpus;
 static DEFINE_SPINLOCK(task_mortuary);
 static void process_task_mortuary(void);
 
@@ -456,10 +456,10 @@ static void mark_done(int cpu)
 {
        int i;
 
-       cpu_set(cpu, marked_cpus);
+       cpumask_set_cpu(cpu, marked_cpus);
 
        for_each_online_cpu(i) {
-               if (!cpu_isset(i, marked_cpus))
+               if (!cpumask_test_cpu(i, marked_cpus))
                        return;
        }
 
@@ -468,7 +468,7 @@ static void mark_done(int cpu)
         */
        process_task_mortuary();
 
-       cpus_clear(marked_cpus);
+       cpumask_clear(marked_cpus);
 }
 
 
@@ -565,6 +565,20 @@ void sync_buffer(int cpu)
        mutex_unlock(&buffer_mutex);
 }
 
+int __init buffer_sync_init(void)
+{
+       if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL))
+               return -ENOMEM;
+
+       cpumask_clear(marked_cpus);
+       return 0;
+}
+
+void __exit buffer_sync_cleanup(void)
+{
+       free_cpumask_var(marked_cpus);
+}
+
 /* The function can be used to add a buffer worth of data directly to
  * the kernel buffer. The buffer is assumed to be a circular buffer.
  * Take the entries from index start and end at index end, wrapping
index 3110732..0ebf5db 100644 (file)
@@ -19,4 +19,8 @@ void sync_stop(void);
 /* sync the given CPU's buffer */
 void sync_buffer(int cpu);
 
+/* initialize/destroy the buffer system. */
+int buffer_sync_init(void);
+void buffer_sync_cleanup(void);
+
 #endif /* OPROFILE_BUFFER_SYNC_H */
index 3cffce9..ced39f6 100644 (file)
@@ -183,6 +183,10 @@ static int __init oprofile_init(void)
 {
        int err;
 
+       err = buffer_sync_init();
+       if (err)
+               return err;
+
        err = oprofile_arch_init(&oprofile_ops);
 
        if (err < 0 || timer) {
@@ -191,8 +195,10 @@ static int __init oprofile_init(void)
        }
 
        err = oprofilefs_register();
-       if (err)
+       if (err) {
                oprofile_arch_exit();
+               buffer_sync_cleanup();
+       }
 
        return err;
 }
@@ -202,6 +208,7 @@ static void __exit oprofile_exit(void)
 {
        oprofilefs_unregister();
        oprofile_arch_exit();
+       buffer_sync_cleanup();
 }
 
 
index f78371b..5a57753 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/irq.h>
 #include <asm/io_apic.h>
 #include <asm/smp.h>
+#include <asm/cpu.h>
 #include <linux/intel-iommu.h>
 #include "intr_remapping.h"
 
index eb0dfde..3141e14 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/bootmem.h>
 
 #include <asm/ptrace.h>
 #include <asm/irq.h>
@@ -75,7 +76,14 @@ enum {
 static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
        [0 ... NR_EVENT_CHANNELS-1] = -1
 };
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+struct cpu_evtchn_s {
+       unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+};
+static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+static inline unsigned long *cpu_evtchn_mask(int cpu)
+{
+       return cpu_evtchn_mask_p[cpu].bits;
+}
 static u8 cpu_evtchn[NR_EVENT_CHANNELS];
 
 /* Reference counts for bindings to IRQs. */
@@ -115,7 +123,7 @@ static inline unsigned long active_evtchns(unsigned int cpu,
                                           unsigned int idx)
 {
        return (sh->evtchn_pending[idx] &
-               cpu_evtchn_mask[cpu][idx] &
+               cpu_evtchn_mask(cpu)[idx] &
                ~sh->evtchn_mask[idx]);
 }
 
@@ -125,11 +133,11 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
        BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-       irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
+       cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
 #endif
 
-       __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
-       __set_bit(chn, cpu_evtchn_mask[cpu]);
+       __clear_bit(chn, cpu_evtchn_mask(cpu_evtchn[chn]));
+       __set_bit(chn, cpu_evtchn_mask(cpu));
 
        cpu_evtchn[chn] = cpu;
 }
@@ -142,12 +150,12 @@ static void init_evtchn_cpu_bindings(void)
 
        /* By default all event channels notify CPU#0. */
        for_each_irq_desc(i, desc) {
-               desc->affinity = cpumask_of_cpu(0);
+               cpumask_copy(desc->affinity, cpumask_of(0));
        }
 #endif
 
        memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
-       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+       memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
 }
 
 static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
@@ -822,6 +830,10 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
 void __init xen_init_IRQ(void)
 {
        int i;
+       size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
+
+       cpu_evtchn_mask_p = alloc_bootmem(size);
+       BUG_ON(cpu_evtchn_mask_p == NULL);
 
        init_evtchn_cpu_bindings();
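The events.c change above replaces a static [NR_CPUS]-sized array of per-cpu event-channel bitmaps with a bootmem allocation sized by nr_cpu_ids, so kernels configured with a large NR_CPUS but booted on few CPUs stop paying for the whole table; the cpu_evtchn_mask() accessor keeps all the users unchanged.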
 
index 9b91617..e7e83b6 100644 (file)
@@ -100,7 +100,7 @@ static void do_suspend(void)
        /* XXX use normal device tree? */
        xenbus_suspend();
 
-       err = stop_machine(xen_suspend, &cancelled, &cpumask_of_cpu(0));
+       err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
        if (err) {
                printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
                goto out;
index b0e63c6..00f45ff 100644 (file)
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
 #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
                                        __typeof__(type) per_cpu_var(name)
 
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it, and its size should
+ * be that of a char, int or long.  percpu_read() evaluates to an
+ * lvalue and all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var().  Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var)                                              \
+  ({                                                                   \
+       typeof(per_cpu_var(var)) __tmp_var__;                           \
+       __tmp_var__ = get_cpu_var(var);                                 \
+       put_cpu_var(var);                                               \
+       __tmp_var__;                                                    \
+  })
+#endif
+
+#define __percpu_generic_to_op(var, val, op)                           \
+do {                                                                   \
+       get_cpu_var(var) op val;                                        \
+       put_cpu_var(var);                                               \
+} while (0)
+
+#ifndef percpu_write
+# define percpu_write(var, val)                __percpu_generic_to_op(var, (val), =)
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val)          __percpu_generic_to_op(var, (val), +=)
+#endif
+
+#ifndef percpu_sub
+# define percpu_sub(var, val)          __percpu_generic_to_op(var, (val), -=)
+#endif
+
+#ifndef percpu_and
+# define percpu_and(var, val)          __percpu_generic_to_op(var, (val), &=)
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val)           __percpu_generic_to_op(var, (val), |=)
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val)          __percpu_generic_to_op(var, (val), ^=)
+#endif
+
 #endif /* _ASM_GENERIC_PERCPU_H_ */
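A brief usage sketch of the accessors documented above (illustration only; the variable and function names are made up):

/* Illustration only: not part of the patch. */
DEFINE_PER_CPU(unsigned long, demo_count);

static void demo_bump(void)
{
	/* no get_cpu_var()/put_cpu_var() pair needed; each operation is
	   atomic with respect to preemption */
	percpu_add(demo_count, 1);

	if (percpu_read(demo_count) > 100)
		percpu_write(demo_count, 0);
}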
index 79a7ff9..4ce48e8 100644 (file)
@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[];
 extern char __init_begin[], __init_end[];
 extern char _sinittext[], _einittext[];
 extern char _end[];
-extern char __per_cpu_start[], __per_cpu_end[];
+extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
index c61fab1..5406e70 100644 (file)
        *(.initcall7.init)                                              \
        *(.initcall7s.init)
 
+/**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to the output section for the percpu area.  If
+ * @vaddr is not blank, it specifies an explicit base address and all
+ * percpu symbols will be offset from the given address.  If blank,
+ * @vaddr always equals @laddr + LOAD_OFFSET.
+ *
+ * @phdr defines the output PHDR to use if not blank.  Be warned that
+ * output PHDR is sticky.  If @phdr is specified, the next output
+ * section in the linker script will go there too.  @phdr should have
+ * a leading colon.
+ *
+ * Note that this macro defines __per_cpu_load as an absolute symbol.
+ * If there is no need to put the percpu section at a predetermined
+ * address, use PERCPU().
+ */
+#define PERCPU_VADDR(vaddr, phdr)                                      \
+       VMLINUX_SYMBOL(__per_cpu_load) = .;                             \
+       .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)          \
+                               - LOAD_OFFSET) {                        \
+               VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
+               *(.data.percpu.first)                                   \
+               *(.data.percpu.page_aligned)                            \
+               *(.data.percpu)                                         \
+               *(.data.percpu.shared_aligned)                          \
+               VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
+       } phdr                                                          \
+       . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
+
+/**
+ * PERCPU - define output section for percpu area, simple version
+ * @align: required alignment
+ *
+ * Aligns to @align and outputs the output section for the percpu area.
+ * This macro doesn't manipulate @vaddr or @phdr; __per_cpu_load and
+ * __per_cpu_start will be identical.
+ *
+ * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
+ * that __per_cpu_load is defined as a relative symbol against
+ * .data.percpu which is required for relocatable x86_32
+ * configuration.
+ */
 #define PERCPU(align)                                                  \
        . = ALIGN(align);                                               \
-       VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
-       .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {          \
+       .data.percpu    : AT(ADDR(.data.percpu) - LOAD_OFFSET) {        \
+               VMLINUX_SYMBOL(__per_cpu_load) = .;                     \
+               VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
+               *(.data.percpu.first)                                   \
                *(.data.percpu.page_aligned)                            \
                *(.data.percpu)                                         \
                *(.data.percpu.shared_aligned)                          \
-       }                                                               \
-       VMLINUX_SYMBOL(__per_cpu_end) = .;
+               VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
+       }
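A hedged sketch of how an arch linker script might invoke the new macro; the zero base address and the "percpu" PHDR name are illustrative, not taken from any particular architecture:

	PHDRS {
		percpu PT_LOAD FLAGS(6);	/* RW */
	}
	SECTIONS {
		. = ALIGN(PAGE_SIZE);
		PERCPU_VADDR(0, :percpu)	/* percpu symbols offset from address 0 */
	}

Per the sticky-PHDR warning in the comment above, the next output section after this one would also land in :percpu unless it names its own PHDR.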
index 9127f6b..472f117 100644 (file)
@@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v);
 struct irq_desc;
 
 extern int early_irq_init(void);
+extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
 extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
 
index f899b50..27a6753 100644 (file)
@@ -182,11 +182,11 @@ struct irq_desc {
        unsigned int            irqs_unhandled;
        spinlock_t              lock;
 #ifdef CONFIG_SMP
-       cpumask_t               affinity;
+       cpumask_var_t           affinity;
        unsigned int            cpu;
-#endif
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-       cpumask_t               pending_mask;
+       cpumask_var_t           pending_mask;
+#endif
 #endif
 #ifdef CONFIG_PROC_FS
        struct proc_dir_entry   *dir;
@@ -422,4 +422,84 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #endif /* !CONFIG_S390 */
 
+#ifdef CONFIG_SMP
+/**
+ * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * @desc:      pointer to irq_desc struct
+ * @cpu:       cpu which will be handling the cpumasks
+ * @boot:      true if bootmem allocation is needed
+ *
+ * Allocates affinity and pending_mask cpumask if required.
+ * Returns true if successful (or not required).
+ * Side effect: affinity has all bits set, pending_mask has all bits clear.
+ */
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+                                                               bool boot)
+{
+       int node;
+
+       if (boot) {
+               alloc_bootmem_cpumask_var(&desc->affinity);
+               cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+               alloc_bootmem_cpumask_var(&desc->pending_mask);
+               cpumask_clear(desc->pending_mask);
+#endif
+               return true;
+       }
+
+       node = cpu_to_node(cpu);
+
+       if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+               return false;
+       cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+               free_cpumask_var(desc->affinity);
+               return false;
+       }
+       cpumask_clear(desc->pending_mask);
+#endif
+       return true;
+}
+
+/**
+ * init_copy_desc_masks - copy cpumasks for irq_desc
+ * @old_desc:  pointer to old irq_desc struct
+ * @new_desc:  pointer to new irq_desc struct
+ *
+ * Ensures affinity and pending_mask are copied to the new irq_desc.
+ * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
+ * irq_desc struct so the copy is redundant.
+ */
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+                                       struct irq_desc *new_desc)
+{
+#ifdef CONFIG_CPUMASKS_OFFSTACK
+       cpumask_copy(new_desc->affinity, old_desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
+#endif
+#endif
+}
+
+#else /* !CONFIG_SMP */
+
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+                                                               bool boot)
+{
+       return true;
+}
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+                                       struct irq_desc *new_desc)
+{
+}
+
+#endif /* CONFIG_SMP */
+
 #endif /* _LINUX_IRQ_H */
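A hedged caller sketch of the allocate-then-check pattern these helpers expect; the function name and error path are illustrative, not part of the patch:

	static int demo_setup_desc(struct irq_desc *desc, int cpu)
	{
		/* false: not early boot, so a kmalloc-backed cpumask is fine */
		if (!init_alloc_desc_masks(desc, cpu, false))
			return -ENOMEM;

		/* affinity starts with all bits set; narrow it to online CPUs */
		cpumask_and(desc->affinity, desc->affinity, cpu_online_mask);
		return 0;
	}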
index 86af92e..887477b 100644 (file)
@@ -20,6 +20,7 @@
 
 # define for_each_irq_desc_reverse(irq, desc)                          \
        for (irq = nr_irqs - 1; irq >= 0; irq--)
+
 #else /* CONFIG_GENERIC_HARDIRQS */
 
 extern int nr_irqs;
index 0b4df7e..5b4e28b 100644 (file)
@@ -49,4 +49,5 @@
 #define FUTEXFS_SUPER_MAGIC    0xBAD1DEA
 #define INOTIFYFS_SUPER_MAGIC  0x2BAD1DEA
 
+#define STACK_END_MAGIC                0x57AC6E9D
 #endif /* __LINUX_MAGIC_H__ */
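STACK_END_MAGIC is written as a sentinel into the lowest word of each thread stack (see the fork.c hunk further down); a hedged sketch of the corresponding overflow test, with an illustrative message and function name:

	#include <linux/sched.h>
	#include <linux/magic.h>

	static void demo_check_stack_overflow(struct task_struct *tsk)
	{
		unsigned long *stackend = end_of_stack(tsk);

		/* sentinel overwritten => the stack grew past its end */
		if (*stackend != STACK_END_MAGIC)
			printk(KERN_ALERT "%s/%d: thread stack overrun\n",
			       tsk->comm, task_pid_nr(tsk));
	}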
index 9f2a375..0e24202 100644 (file)
@@ -9,34 +9,39 @@
 #include <asm/percpu.h>
 
 #ifdef CONFIG_SMP
-#define DEFINE_PER_CPU(type, name)                                     \
-       __attribute__((__section__(".data.percpu")))                    \
-       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+#define PER_CPU_BASE_SECTION ".data.percpu"
 
 #ifdef MODULE
-#define SHARED_ALIGNED_SECTION ".data.percpu"
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
 #else
-#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned"
+#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
 #endif
+#define PER_CPU_FIRST_SECTION ".first"
 
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                      \
-       __attribute__((__section__(SHARED_ALIGNED_SECTION)))            \
-       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name             \
-       ____cacheline_aligned_in_smp
+#else
+
+#define PER_CPU_BASE_SECTION ".data"
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_FIRST_SECTION ""
+
+#endif
 
-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                        \
-       __attribute__((__section__(".data.percpu.page_aligned")))       \
+#define DEFINE_PER_CPU_SECTION(type, name, section)                    \
+       __attribute__((__section__(PER_CPU_BASE_SECTION section)))      \
        PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
-#else
+
 #define DEFINE_PER_CPU(type, name)                                     \
-       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+       DEFINE_PER_CPU_SECTION(type, name, "")
 
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                    \
-       DEFINE_PER_CPU(type, name)
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                      \
+       DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
 
-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                      \
-       DEFINE_PER_CPU(type, name)
-#endif
+#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
+       DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
+
+#define DEFINE_PER_CPU_FIRST(type, name)                               \
+       DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION)
 
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
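All variants now funnel through DEFINE_PER_CPU_SECTION(), which simply pastes a suffix onto PER_CPU_BASE_SECTION. A hedged illustration with a hypothetical variable and its approximate expansion:

	DEFINE_PER_CPU_FIRST(unsigned long, demo_first_word);
	/* on SMP this roughly becomes:
	 *   __attribute__((__section__(".data.percpu" ".first")))
	 *   __typeof__(unsigned long) per_cpu__demo_first_word;
	 * on UP the base section falls back to ".data" and the suffix is empty.
	 */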
index 2127e95..28b3f50 100644 (file)
@@ -1161,10 +1161,9 @@ struct task_struct {
        pid_t pid;
        pid_t tgid;
 
-#ifdef CONFIG_CC_STACKPROTECTOR
        /* Canary value for the -fstack-protector gcc feature */
        unsigned long stack_canary;
-#endif
+
        /* 
         * pointers to (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with 
@@ -2070,6 +2069,19 @@ static inline int object_is_on_stack(void *obj)
 
 extern void thread_info_cache_init(void);
 
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+       unsigned long *n = end_of_stack(p);
+
+       do {    /* Skip over canary */
+               n++;
+       } while (!*n);
+
+       return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif
+
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
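stack_not_used() scans upward from end_of_stack(), skipping the sentinel word, until it finds data the task has actually written. A hedged usage sketch, only meaningful with CONFIG_DEBUG_STACK_USAGE=y; the reporting function is hypothetical:

	#ifdef CONFIG_DEBUG_STACK_USAGE
	static void demo_report_stack(struct task_struct *p)
	{
		pr_info("%s: %lu bytes of stack never written\n",
			p->comm, stack_not_used(p));
	}
	#endif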
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
new file mode 100644 (file)
index 0000000..6f3e54c
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef _LINUX_STACKPROTECTOR_H
+#define _LINUX_STACKPROTECTOR_H 1
+
+#include <linux/compiler.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+# include <asm/stackprotector.h>
+#else
+static inline void boot_init_stack_canary(void)
+{
+}
+#endif
+
+#endif
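The new header only dispatches: with CONFIG_CC_STACKPROTECTOR it pulls in the arch implementation, otherwise boot_init_stack_canary() is a no-op. A hedged, arch-neutral sketch of what such an implementation typically does; the entropy mixing and the final store location are assumptions, not lifted from any architecture in this series:

	static __always_inline void boot_init_stack_canary(void)
	{
		unsigned long canary;

		get_random_bytes(&canary, sizeof(canary));
		canary += sched_clock() + (canary << 13);	/* cheap extra mixing */

		current->stack_canary = canary;
		/* a real arch would also publish the value wherever its compiler
		 * looks for it, e.g. a per-cpu or TLS slot */
	}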
index e632d29..a16b9e0 100644 (file)
@@ -193,5 +193,11 @@ int arch_update_cpu_topology(void);
 #ifndef topology_core_siblings
 #define topology_core_siblings(cpu)            cpumask_of_cpu(cpu)
 #endif
+#ifndef topology_thread_cpumask
+#define topology_thread_cpumask(cpu)           cpumask_of(cpu)
+#endif
+#ifndef topology_core_cpumask
+#define topology_core_cpumask(cpu)             cpumask_of(cpu)
+#endif
 
 #endif /* _LINUX_TOPOLOGY_H */
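The new *_cpumask variants return a const struct cpumask * (falling back to cpumask_of(cpu) here), so callers can iterate without copying a full cpumask_t. A hedged usage sketch with a hypothetical helper:

	static void demo_show_siblings(int cpu)
	{
		int sibling;

		for_each_cpu(sibling, topology_thread_cpumask(cpu))
			pr_info("cpu%d: sibling thread cpu%d\n", cpu, sibling);
	}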
index 8442094..bfe4fb0 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
+#include <linux/stackprotector.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
@@ -539,6 +540,12 @@ asmlinkage void __init start_kernel(void)
         */
        lockdep_init();
        debug_objects_early_init();
+
+       /*
+        * Set up the initial canary ASAP:
+        */
+       boot_init_stack_canary();
+
        cgroup_init_early();
 
        local_irq_disable();
index f80dec3..70612c1 100644 (file)
@@ -977,12 +977,9 @@ static void check_stack_usage(void)
 {
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
-       unsigned long *n = end_of_stack(current);
        unsigned long free;
 
-       while (*n == 0)
-               n++;
-       free = (unsigned long)n - (unsigned long)end_of_stack(current);
+       free = stack_not_used(current);
 
        if (free >= lowest_to_date)
                return;
index 6d5dbb7..c078438 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <trace/sched.h>
+#include <linux/magic.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -212,6 +213,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
        struct task_struct *tsk;
        struct thread_info *ti;
+       unsigned long *stackend;
+
        int err;
 
        prepare_to_copy(orig);
@@ -237,6 +240,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
                goto out;
 
        setup_thread_stack(tsk, orig);
+       stackend = end_of_stack(tsk);
+       *stackend = STACK_END_MAGIC;    /* for overflow detection */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
        tsk->stack_canary = get_random_int();
index 7de11bd..122fef4 100644 (file)
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-       cpumask_setall(&desc->affinity);
+       cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_clear(desc->pending_mask);
+#endif
 #endif
        spin_unlock_irqrestore(&desc->lock, flags);
 }
index 3aba8d1..f51eaee 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
+#include <linux/bootmem.h>
 
 #include "internals.h"
 
@@ -69,6 +70,7 @@ int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
+
 static struct irq_desc irq_desc_init = {
        .irq        = -1,
        .status     = IRQ_DISABLED,
@@ -76,9 +78,6 @@ static struct irq_desc irq_desc_init = {
        .handle_irq = handle_bad_irq,
        .depth      = 1,
        .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-       .affinity   = CPU_MASK_ALL
-#endif
 };
 
 void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -113,6 +112,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
                printk(KERN_ERR "can not alloc kstat_irqs\n");
                BUG_ON(1);
        }
+       if (!init_alloc_desc_masks(desc, cpu, false)) {
+               printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+               BUG_ON(1);
+       }
        arch_init_chip_data(desc, cpu);
 }
 
@@ -121,7 +124,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
  */
 DEFINE_SPINLOCK(sparse_irq_lock);
 
-struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+struct irq_desc **irq_desc_ptrs __read_mostly;
 
 static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
        [0 ... NR_IRQS_LEGACY-1] = {
@@ -131,14 +134,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
                .handle_irq = handle_bad_irq,
                .depth      = 1,
                .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-               .affinity   = CPU_MASK_ALL
-#endif
        }
 };
 
-/* FIXME: use bootmem alloc ...*/
-static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+static unsigned int *kstat_irqs_legacy;
 
 int __init early_irq_init(void)
 {
@@ -148,18 +147,30 @@ int __init early_irq_init(void)
 
        init_irq_default_affinity();
 
+        /* initialize nr_irqs based on nr_cpu_ids */
+       arch_probe_nr_irqs();
+       printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+
        desc = irq_desc_legacy;
        legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
+       /* allocate irq_desc_ptrs array based on nr_irqs */
+       irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+
+       /* allocate based on nr_cpu_ids */
+       /* FIXME: invert kstat_irqs, and it'd be a per_cpu_alloc'd thing */
+       kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
+                                         sizeof(int));
+
        for (i = 0; i < legacy_count; i++) {
                desc[i].irq = i;
-               desc[i].kstat_irqs = kstat_irqs_legacy[i];
+               desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+               init_alloc_desc_masks(&desc[i], 0, true);
                irq_desc_ptrs[i] = desc + i;
        }
 
-       for (i = legacy_count; i < NR_IRQS; i++)
+       for (i = legacy_count; i < nr_irqs; i++)
                irq_desc_ptrs[i] = NULL;
 
        return arch_early_irq_init();
@@ -167,7 +178,10 @@ int __init early_irq_init(void)
 
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
-       return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+       if (irq_desc_ptrs && irq < nr_irqs)
+               return irq_desc_ptrs[irq];
+
+       return NULL;
 }
 
 struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -176,10 +190,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
        unsigned long flags;
        int node;
 
-       if (irq >= NR_IRQS) {
-               printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
-                               irq, NR_IRQS);
-               WARN_ON(1);
+       if (irq >= nr_irqs) {
+               WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+                       irq, nr_irqs);
                return NULL;
        }
 
@@ -221,9 +234,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
                .handle_irq = handle_bad_irq,
                .depth = 1,
                .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
-#ifdef CONFIG_SMP
-               .affinity = CPU_MASK_ALL
-#endif
        }
 };
 
@@ -235,12 +245,15 @@ int __init early_irq_init(void)
 
        init_irq_default_affinity();
 
+       printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
        desc = irq_desc;
        count = ARRAY_SIZE(irq_desc);
 
-       for (i = 0; i < count; i++)
+       for (i = 0; i < count; i++) {
                desc[i].irq = i;
-
+               init_alloc_desc_masks(&desc[i], 0, true);
+       }
        return arch_early_irq_init();
 }
 
index e6d0a43..40416a8 100644 (file)
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
 extern spinlock_t sparse_irq_lock;
+
+#ifdef CONFIG_SPARSE_IRQ
+/* irq_desc_ptrs allocated at boot time */
+extern struct irq_desc **irq_desc_ptrs;
+#else
+/* irq_desc_ptrs is a fixed size array */
 extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+#endif
 
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
index 291f036..a3a5dc9 100644 (file)
@@ -90,14 +90,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-               cpumask_copy(&desc->affinity, cpumask);
+               cpumask_copy(desc->affinity, cpumask);
                desc->chip->set_affinity(irq, cpumask);
        } else {
                desc->status |= IRQ_MOVE_PENDING;
-               cpumask_copy(&desc->pending_mask, cpumask);
+               cpumask_copy(desc->pending_mask, cpumask);
        }
 #else
-       cpumask_copy(&desc->affinity, cpumask);
+       cpumask_copy(desc->affinity, cpumask);
        desc->chip->set_affinity(irq, cpumask);
 #endif
        desc->status |= IRQ_AFFINITY_SET;
@@ -119,16 +119,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
         * one of the targets is online.
         */
        if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-               if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+               if (cpumask_any_and(desc->affinity, cpu_online_mask)
                    < nr_cpu_ids)
                        goto set_affinity;
                else
                        desc->status &= ~IRQ_AFFINITY_SET;
        }
 
-       cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+       cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
-       desc->chip->set_affinity(irq, &desc->affinity);
+       desc->chip->set_affinity(irq, desc->affinity);
 
        return 0;
 }
index bd72329..e05ad9b 100644 (file)
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
 
        desc->status &= ~IRQ_MOVE_PENDING;
 
-       if (unlikely(cpumask_empty(&desc->pending_mask)))
+       if (unlikely(cpumask_empty(desc->pending_mask)))
                return;
 
        if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
         * For correct operation this depends on the caller
         * masking the irqs.
         */
-       if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+       if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
                   < nr_cpu_ids)) {
-               cpumask_and(&desc->affinity,
-                           &desc->pending_mask, cpu_online_mask);
-               desc->chip->set_affinity(irq, &desc->affinity);
+               cpumask_and(desc->affinity,
+                           desc->pending_mask, cpu_online_mask);
+               desc->chip->set_affinity(irq, desc->affinity);
        }
-       cpumask_clear(&desc->pending_mask);
+       cpumask_clear(desc->pending_mask);
 }
 
 void move_native_irq(int irq)
index acd8835..7f9b804 100644 (file)
@@ -38,15 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
        old_desc->kstat_irqs = NULL;
 }
 
-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
                 struct irq_desc *desc, int cpu)
 {
        memcpy(desc, old_desc, sizeof(struct irq_desc));
+       if (!init_alloc_desc_masks(desc, cpu, false)) {
+               printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+                               "for migration.\n", irq);
+               return false;
+       }
        spin_lock_init(&desc->lock);
        desc->cpu = cpu;
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
        init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+       init_copy_desc_masks(old_desc, desc);
        arch_init_copy_chip_data(old_desc, desc, cpu);
+       return true;
 }
 
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -76,12 +83,18 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
        node = cpu_to_node(cpu);
        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
        if (!desc) {
-               printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+               printk(KERN_ERR "irq %d: can not get new irq_desc "
+                               "for migration.\n", irq);
+               /* still use old one */
+               desc = old_desc;
+               goto out_unlock;
+       }
+       if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
                /* still use old one */
+               kfree(desc);
                desc = old_desc;
                goto out_unlock;
        }
-       init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
        irq_desc_ptrs[irq] = desc;
        spin_unlock_irqrestore(&sparse_irq_lock, flags);
index aae3f74..692363d 100644 (file)
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
        struct irq_desc *desc = irq_to_desc((long)m->private);
-       const struct cpumask *mask = &desc->affinity;
+       const struct cpumask *mask = desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PENDING)
-               mask = &desc->pending_mask;
+               mask = desc->pending_mask;
 #endif
        seq_cpumask(m, mask);
        seq_putc(m, '\n');
index 2a2ff36..33cab3d 100644 (file)
@@ -74,6 +74,9 @@ NORET_TYPE void panic(const char * fmt, ...)
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+       dump_stack();
+#endif
        bust_spinlocks(0);
 
        /*
@@ -355,15 +358,22 @@ EXPORT_SYMBOL(warn_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
+
+#ifndef GCC_HAS_SP
+#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
+#endif
+
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
  */
 void __stack_chk_fail(void)
 {
-       panic("stack-protector: Kernel stack is corrupted");
+       panic("stack-protector: Kernel stack is corrupted in: %p\n",
+               __builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
+
 #endif
 
 core_param(panic, panic_timeout, int, 0644);
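For context, -fstack-protector makes gcc place a canary between a function's buffers and its return address and verify it before returning; __stack_chk_fail() is the out-of-line handler it calls on mismatch. A hedged, purely conceptual C rendering of the generated check; __stack_chk_guard is the name the generic (non-TLS) ABI uses for the reference value and is shown only for illustration:

	extern void __stack_chk_fail(void);

	unsigned long __stack_chk_guard;		/* seeded at boot */

	void demo_protected(void)
	{
		unsigned long canary = __stack_chk_guard;	/* prologue copy */
		char buf[64];

		buf[0] = '\0';					/* function body */

		if (canary != __stack_chk_guard)		/* epilogue check */
			__stack_chk_fail();
	}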
index 8ee437a..fc17fd9 100644 (file)
@@ -5949,12 +5949,7 @@ void sched_show_task(struct task_struct *p)
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-       {
-               unsigned long *n = end_of_stack(p);
-               while (!*n)
-                       n++;
-               free = (unsigned long)n - (unsigned long)end_of_stack(p);
-       }
+       free = stack_not_used(p);
 #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
index bac1061..da932f4 100644 (file)
@@ -960,12 +960,13 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+static inline int pick_optimal_cpu(int this_cpu,
+                                  const struct cpumask *mask)
 {
        int first;
 
        /* "this_cpu" is cheaper to preempt than a remote processor */
-       if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+       if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
                return this_cpu;
 
        first = cpumask_first(mask);
@@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task)
        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
+       cpumask_var_t domain_mask;
 
        if (task->rt.nr_cpus_allowed == 1)
                return -1; /* No other targets possible */
@@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task)
        if (this_cpu == cpu)
                this_cpu = -1; /* Skip this_cpu opt if the same */
 
-       for_each_domain(cpu, sd) {
-               if (sd->flags & SD_WAKE_AFFINE) {
-                       cpumask_t domain_mask;
-                       int       best_cpu;
+       if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+               for_each_domain(cpu, sd) {
+                       if (sd->flags & SD_WAKE_AFFINE) {
+                               int best_cpu;
 
-                       cpumask_and(&domain_mask, sched_domain_span(sd),
-                                   lowest_mask);
+                               cpumask_and(domain_mask,
+                                           sched_domain_span(sd),
+                                           lowest_mask);
 
-                       best_cpu = pick_optimal_cpu(this_cpu,
-                                                   &domain_mask);
-                       if (best_cpu != -1)
-                               return best_cpu;
+                               best_cpu = pick_optimal_cpu(this_cpu,
+                                                           domain_mask);
+
+                               if (best_cpu != -1) {
+                                       free_cpumask_var(domain_mask);
+                                       return best_cpu;
+                               }
+                       }
                }
+               free_cpumask_var(domain_mask);
        }
 
        /*
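The rework above exists to keep a full cpumask_t (NR_CPUS bits) off the kernel stack; with CONFIG_CPUMASK_OFFSTACK=y a cpumask_var_t is just a pointer to a separately allocated mask. A hedged sketch of the allocate/use/free pattern in isolation, with hypothetical names:

	static int demo_first_shared_cpu(const struct cpumask *a,
					 const struct cpumask *b)
	{
		cpumask_var_t tmp;
		int cpu;

		if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
			return -ENOMEM;

		cpumask_and(tmp, a, b);
		cpu = cpumask_first(tmp);	/* >= nr_cpu_ids if disjoint */
		free_cpumask_var(tmp);
		return cpu;
	}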
index bdbe9de..0365b48 100644 (file)
@@ -795,6 +795,11 @@ int __init __weak early_irq_init(void)
        return 0;
 }
 
+int __init __weak arch_probe_nr_irqs(void)
+{
+       return 0;
+}
+
 int __init __weak arch_early_irq_init(void)
 {
        return 0;