Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux...

author Mike Travis <travis@sgi.com>

Thu, 1 Jan 2009 01:34:16 +0000 (17:34 -0800)

committer Ingo Molnar <mingo@elte.hu>

Sat, 3 Jan 2009 17:53:31 +0000 (18:53 +0100)
author Mike Travis <travis@sgi.com>
Thu, 1 Jan 2009 01:34:16 +0000 (17:34 -0800)
committer Ingo Molnar <mingo@elte.hu>
Sat, 3 Jan 2009 17:53:31 +0000 (18:53 +0100)
diff --combined arch/ia64/include/asm/topology.h

index a3cc9f6,97ae7f5..76a33a9
--- 1/arch/ia64/include/asm/topology.h
--- 2/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@@ -34,6 -34,7 +34,7 @@@
    * Returns a bitmask of CPUs on Node 'node'.
    */
   #define node_to_cpumask(node) (node_to_cpu_mask[node])
+ #define cpumask_of_node(node) (&node_to_cpu_mask[node])
   
   /*
    * Returns the number of the node containing Node 'nid'.
@@@ -45,7 -46,7 +46,7 @@@
   /*
    * Returns the number of the first CPU on Node 'node'.
    */
- #define node_to_first_cpu(node) (first_cpu(node_to_cpumask(node)))
+ #define node_to_first_cpu(node) (cpumask_first(cpumask_of_node(node)))
   
   /*
    * Determines the node for a given pci bus
@@@ -55,6 -56,7 +56,6 @@@
   void build_cpu_to_node_map(void);
   
   #define SD_CPU_INIT (struct sched_domain) {           \
- -      .span                   = CPU_MASK_NONE,        \
         .parent                 = NULL,                 \
         .child                  = NULL,                 \
         .groups                 = NULL,                 \
@@@ -79,6 -81,7 +80,6 @@@
   
   /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
   #define SD_NODE_INIT (struct sched_domain) {          \
- -      .span                   = CPU_MASK_NONE,        \
         .parent                 = NULL,                 \
         .child                  = NULL,                 \
         .groups                 = NULL,                 \
@@@ -109,6 -112,8 +110,8 @@@
   #define topology_core_id(cpu)                 (cpu_data(cpu)->core_id)
   #define topology_core_siblings(cpu)           (cpu_core_map[cpu])
   #define topology_thread_siblings(cpu)         (per_cpu(cpu_sibling_map, cpu))
+ #define topology_core_cpumask(cpu)            (&cpu_core_map[cpu])
+ #define topology_thread_cpumask(cpu)          (&per_cpu(cpu_sibling_map, cpu))
   #define smt_capable()                                 (smp_num_siblings > 1)
   #endif
   
@@@ -119,6 -124,10 +122,10 @@@ extern void arch_fix_phys_package_id(in
                                         node_to_cpumask(pcibus_to_node(bus)) \
                                 )
   
+ #define cpumask_of_pcibus(bus)        (pcibus_to_node(bus) == -1 ?            \
+                                cpu_all_mask :                         \
+                                cpumask_of_node(pcibus_to_node(bus)))
+ 
   #include <asm-generic/topology.h>
   
   #endif /* _ASM_IA64_TOPOLOGY_H */
diff --combined arch/mips/include/asm/mach-ip27/topology.h

index 1fb959f,c1c3f5b..55d4815
--- 1/arch/mips/include/asm/mach-ip27/topology.h
--- 2/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@@ -25,11 -25,13 +25,13 @@@ extern struct cpuinfo_ip27 sn_cpu_info[
   #define cpu_to_node(cpu)      (sn_cpu_info[(cpu)].p_nodeid)
   #define parent_node(node)     (node)
   #define node_to_cpumask(node) (hub_data(node)->h_cpus)
- #define node_to_first_cpu(node)       (first_cpu(node_to_cpumask(node)))
+ #define cpumask_of_node(node) (&hub_data(node)->h_cpus)
+ #define node_to_first_cpu(node)       (cpumask_first(cpumask_of_node(node)))
   struct pci_bus;
   extern int pcibus_to_node(struct pci_bus *);
   
   #define pcibus_to_cpumask(bus)        (cpu_online_map)
+ #define cpumask_of_pcibus(bus)        (cpu_online_mask)
   
   extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
   
@@@ -37,6 -39,7 +39,6 @@@
   
   /* sched_domains SD_NODE_INIT for SGI IP27 machines */
   #define SD_NODE_INIT (struct sched_domain) {          \
- -      .span                   = CPU_MASK_NONE,        \
         .parent                 = NULL,                 \
         .child                  = NULL,                 \
         .groups                 = NULL,                 \
diff --combined arch/powerpc/include/asm/topology.h

index 373fca3,236dae1..3752585
--- 1/arch/powerpc/include/asm/topology.h
--- 2/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@@ -22,11 -22,11 +22,11 @@@ static inline cpumask_t node_to_cpumask
         return numa_cpumask_lookup_table[node];
   }
   
+ #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
+ 
   static inline int node_to_first_cpu(int node)
   {
-       cpumask_t tmp;
-       tmp = node_to_cpumask(node);
-       return first_cpu(tmp);
+       return cpumask_first(cpumask_of_node(node));
   }
   
   int of_node_to_nid(struct device_node *device);
@@@ -46,8 -46,13 +46,12 @@@ static inline int pcibus_to_node(struc
                                         node_to_cpumask(pcibus_to_node(bus)) \
                                 )
   
+ #define cpumask_of_pcibus(bus)        (pcibus_to_node(bus) == -1 ?            \
+                                cpu_all_mask :                         \
+                                cpumask_of_node(pcibus_to_node(bus)))
+ 
   /* sched_domains SD_NODE_INIT for PPC64 machines */
   #define SD_NODE_INIT (struct sched_domain) {          \
- -      .span                   = CPU_MASK_NONE,        \
         .parent                 = NULL,                 \
         .child                  = NULL,                 \
         .groups                 = NULL,                 \
@@@ -108,6 -113,8 +112,8 @@@ static inline void sysfs_remove_device_
   
   #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
   #define topology_core_siblings(cpu)   (per_cpu(cpu_core_map, cpu))
+ #define topology_thread_cpumask(cpu)  (&per_cpu(cpu_sibling_map, cpu))
+ #define topology_core_cpumask(cpu)    (&per_cpu(cpu_core_map, cpu))
   #define topology_core_id(cpu)         (cpu_to_core_id(cpu))
   #endif
   #endif
diff --combined arch/sh/include/asm/topology.h

index 279d9cc,9aa160d..066f0fb
--- 1/arch/sh/include/asm/topology.h
--- 2/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@@ -5,6 -5,7 +5,6 @@@
   
   /* sched_domains SD_NODE_INIT for sh machines */
   #define SD_NODE_INIT (struct sched_domain) {          \
- -      .span                   = CPU_MASK_NONE,        \
         .parent                 = NULL,                 \
         .child                  = NULL,                 \
         .groups                 = NULL,                 \
@@@ -32,6 -33,7 +32,7 @@@
   #define parent_node(node)     ((void)(node),0)
   
   #define node_to_cpumask(node) ((void)node, cpu_online_map)
+ #define cpumask_of_node(node) ((void)node, cpu_online_mask)
   #define node_to_first_cpu(node)       ((void)(node),0)
   
   #define pcibus_to_node(bus)   ((void)(bus), -1)
diff --combined arch/x86/Kconfig

index 0ca2eb7,0f44add..249d1e0
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -19,6 -19,8 +19,8 @@@ config X86_6
   config X86
         def_bool y
         select HAVE_AOUT if X86_32
+       select HAVE_READQ
+       select HAVE_WRITEQ
         select HAVE_UNSTABLE_SCHED_CLOCK
         select HAVE_IDE
         select HAVE_OPROFILE
@@@ -90,6 -92,10 +92,10 @@@ config GENERIC_IOMA
   config GENERIC_BUG
         def_bool y
         depends on BUG
+       select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+ 
+ config GENERIC_BUG_RELATIVE_POINTERS
+       bool
   
   config GENERIC_HWEIGHT
         def_bool y
@@@ -244,16 -250,19 +250,19 @@@ config X86_HAS_BOOT_CPU_I
   config SPARSE_IRQ
         bool "Support sparse irq numbering"
         depends on PCI_MSI || HT_IRQ
-       default y
         help
-         This enables support for sparse irq, esp for msi/msi-x. You may need
-         if you have lots of cards supports msi-x installed.
+         This enables support for sparse irqs. This is useful for distro
+         kernels that want to define a high CONFIG_NR_CPUS value but still
+         want to have low kernel memory footprint on smaller machines.
   
-         If you don't know what to do here, say Y.
+         ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+           out the irq_desc[] array in a more NUMA-friendly way. )
+ 
+         If you don't know what to do here, say N.
   
   config NUMA_MIGRATE_IRQ_DESC
         bool "Move irq desc when changing irq smp_affinity"
-       depends on SPARSE_IRQ && SMP
+       depends on SPARSE_IRQ && NUMA
         default n
         help
           This enables moving irq_desc to the cpu/node that will handle the irq.
@@@ -264,21 -273,13 +273,13 @@@ config X86_FIND_SMP_CONFI
         def_bool y
         depends on X86_MPPARSE || X86_VOYAGER
   
- if ACPI
   config X86_MPPARSE
-       def_bool y
-       bool "Enable MPS table"
+       bool "Enable MPS table" if ACPI
+       default y
         depends on X86_LOCAL_APIC
         help
           For old SMP systems that do not have proper ACPI support. On newer systems
           (esp. with 64-bit CPUs) with ACPI support, the MADT and DSDT will override it.
- endif
- 
- if !ACPI
- config X86_MPPARSE
-       def_bool y
-       depends on X86_LOCAL_APIC
- endif
   
   choice
         prompt "Subarchitecture Type"
@@@ -500,7 -501,7 +501,7 @@@ config HPET_TIME
            The HPET provides a stable time base on SMP
            systems, unlike the TSC, but it is more expensive to access,
            as it is off-chip.  You can find the HPET spec at
-          <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+          <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
   
            You can safely choose Y here.  However, HPET will only be
            activated if the platform and the BIOS support this feature.
@@@ -587,7 -588,7 +588,7 @@@ config AMD_IOMM
   
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
-       bool
+       def_bool y if X86_64
         help
           Support for software bounce buffers used on x86-64 systems
           which don't have a hardware IOMMU (e.g. the current generation
@@@ -600,20 -601,19 +601,20 @@@ config IOMMU_HELPE
   
   config MAXSMP
         bool "Configure Maximum number of SMP Processors and NUMA Nodes"
- -      depends on X86_64 && SMP && BROKEN
+ +      depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+ +      select CPUMASK_OFFSTACK
         default n
         help
           Configure maximum number of CPUS and NUMA Nodes for this architecture.
           If unsure, say N.
   
   config NR_CPUS
- -      int "Maximum number of CPUs (2-512)" if !MAXSMP
- -      range 2 512
- -      depends on SMP
+ +      int "Maximum number of CPUs" if SMP && !MAXSMP
+ +      range 2 512 if SMP && !MAXSMP
+ +      default "1" if !SMP
         default "4096" if MAXSMP
- -      default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
- -      default "8"
+ +      default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+ +      default "8" if SMP
         help
           This allows you to specify the maximum number of CPUs which this
           kernel will support.  The maximum supported value is 512 and the
@@@ -679,6 -679,30 +680,30 @@@ config X86_VISWS_API
         def_bool y
         depends on X86_32 && X86_VISWS
   
+ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+       bool "Reroute for broken boot IRQs"
+       default n
+       depends on X86_IO_APIC
+       help
+         This option enables a workaround that fixes a source of
+         spurious interrupts. This is recommended when threaded
+         interrupt handling is used on systems where the generation of
+         superfluous "boot interrupts" cannot be disabled.
+ 
+         Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+         entry in the chipset's IO-APIC is masked (as, e.g. the RT
+         kernel does during interrupt handling). On chipsets where this
+         boot IRQ generation cannot be disabled, this workaround keeps
+         the original IRQ line masked so that only the equivalent "boot
+         IRQ" is delivered to the CPUs. The workaround also tells the
+         kernel to set up the IRQ handler on the boot IRQ line. In this
+         way only one interrupt is delivered to the kernel. Otherwise
+         the spurious second interrupt may cause the kernel to bring
+         down (vital) interrupt lines.
+ 
+         Only affects "broken" chipsets. Interrupt sharing may be
+         increased on these systems.
+ 
   config X86_MCE
         bool "Machine Check Exception"
         depends on !X86_VOYAGER
@@@ -975,24 -999,37 +1000,37 @@@ config X86_PA
   config ARCH_PHYS_ADDR_T_64BIT
          def_bool X86_64 || X86_PAE
   
+ config DIRECT_GBPAGES
+       bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+       default y
+       depends on X86_64
+       help
+         Allow the kernel linear mapping to use 1GB pages on CPUs that
+         support it. This can improve the kernel's performance a tiny bit by
+         reducing TLB pressure. If in doubt, say "Y".
+ 
   # Common NUMA Features
   config NUMA
-       bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
+       bool "Numa Memory Allocation and Scheduler Support"
         depends on SMP
         depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
         default n if X86_PC
         default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
         help
           Enable NUMA (Non Uniform Memory Access) support.
+ 
           The kernel will try to allocate memory used by a CPU on the
           local memory controller of the CPU and add some more
           NUMA awareness to the kernel.
   
-         For 32-bit this is currently highly experimental and should be only
-         used for kernel development. It might also cause boot failures.
-         For 64-bit this is recommended on all multiprocessor Opteron systems.
-         If the system is EM64T, you should say N unless your system is
-         EM64T NUMA.
+         For 64-bit this is recommended if the system is Intel Core i7
+         (or later), AMD Opteron, or EM64T NUMA.
+ 
+         For 32-bit this is only needed on (rare) 32-bit-only platforms
+         that support NUMA topologies, such as NUMAQ / Summit, or if you
+         boot a 32-bit kernel on a 64-bit NUMA platform.
+ 
+         Otherwise, you should say N.
   
   comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
         depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@@ -1512,6 -1549,10 +1550,10 @@@ config ARCH_ENABLE_MEMORY_HOTPLU
         def_bool y
         depends on X86_64 || (X86_32 && HIGHMEM)
   
+ config ARCH_ENABLE_MEMORY_HOTREMOVE
+       def_bool y
+       depends on MEMORY_HOTPLUG
+ 
   config HAVE_ARCH_EARLY_PFN_TO_NID
         def_bool X86_64
         depends on NUMA
diff --combined arch/x86/include/asm/irq.h

index 4bb732e,28e409f..592688e
--- 1/arch/x86/include/asm/irq.h
--- 2/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@@ -31,13 -31,9 +31,9 @@@ static inline int irq_canonicalize(int 
   # endif
   #endif
   
- #ifdef CONFIG_IRQBALANCE
- extern int irqbalance_disable(char *str);
- #endif
- 
   #ifdef CONFIG_HOTPLUG_CPU
   #include <linux/cpumask.h>
- -extern void fixup_irqs(cpumask_t map);
+ +extern void fixup_irqs(void);
   #endif
   
   extern unsigned int do_IRQ(struct pt_regs *regs);
@@@ -46,6 -42,5 +42,6 @@@ extern void native_init_IRQ(void)
   
   /* Interrupt vector management */
   extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+ +extern int vector_used_by_percpu_irq(unsigned int vector);
   
   #endif /* _ASM_X86_IRQ_H */
diff --combined arch/x86/include/asm/topology.h

index 79e31e9,168203c..4e2f2e0
--- 1/arch/x86/include/asm/topology.h
--- 2/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@@ -61,13 -61,19 +61,19 @@@ static inline int cpu_to_node(int cpu
    *
    * Side note: this function creates the returned cpumask on the stack
    * so with a high NR_CPUS count, excessive stack space is used.  The
-  * node_to_cpumask_ptr function should be used whenever possible.
+  * cpumask_of_node function should be used whenever possible.
    */
   static inline cpumask_t node_to_cpumask(int node)
   {
         return node_to_cpumask_map[node];
   }
   
+ /* Returns a bitmask of CPUs on Node 'node'. */
+ static inline const struct cpumask *cpumask_of_node(int node)
+ {
+       return &node_to_cpumask_map[node];
+ }
+ 
   #else /* CONFIG_X86_64 */
   
   /* Mappings between node number and cpus on that node. */
@@@ -82,7 -88,7 +88,7 @@@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_n
   #ifdef CONFIG_DEBUG_PER_CPU_MAPS
   extern int cpu_to_node(int cpu);
   extern int early_cpu_to_node(int cpu);
- extern const cpumask_t *_node_to_cpumask_ptr(int node);
+ extern const cpumask_t *cpumask_of_node(int node);
   extern cpumask_t node_to_cpumask(int node);
   
   #else /* !CONFIG_DEBUG_PER_CPU_MAPS */
@@@ -103,7 -109,7 +109,7 @@@ static inline int early_cpu_to_node(in
   }
   
   /* Returns a pointer to the cpumask of CPUs on Node 'node'. */
- static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+ static inline const cpumask_t *cpumask_of_node(int node)
   {
         return &node_to_cpumask_map[node];
   }
@@@ -116,12 -122,15 +122,15 @@@ static inline cpumask_t node_to_cpumask
   
   #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
   
- /* Replace default node_to_cpumask_ptr with optimized version */
+ /*
+  * Replace default node_to_cpumask_ptr with optimized version
+  * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+  */
   #define node_to_cpumask_ptr(v, node)          \
-               const cpumask_t *v = _node_to_cpumask_ptr(node)
+               const cpumask_t *v = cpumask_of_node(node)
   
   #define node_to_cpumask_ptr_next(v, node)     \
-                          v = _node_to_cpumask_ptr(node)
+                          v = cpumask_of_node(node)
   
   #endif /* CONFIG_X86_64 */
   
@@@ -187,7 -196,7 +196,7 @@@ extern int __node_distance(int, int)
   #define       cpu_to_node(cpu)        0
   #define       early_cpu_to_node(cpu)  0
   
- static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+ static inline const cpumask_t *cpumask_of_node(int node)
   {
         return &cpu_online_map;
   }
@@@ -200,12 -209,15 +209,15 @@@ static inline int node_to_first_cpu(in
         return first_cpu(cpu_online_map);
   }
   
- /* Replace default node_to_cpumask_ptr with optimized version */
+ /*
+  * Replace default node_to_cpumask_ptr with optimized version
+  * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+  */
   #define node_to_cpumask_ptr(v, node)          \
-               const cpumask_t *v = _node_to_cpumask_ptr(node)
+               const cpumask_t *v = cpumask_of_node(node)
   
   #define node_to_cpumask_ptr_next(v, node)     \
-                          v = _node_to_cpumask_ptr(node)
+                          v = cpumask_of_node(node)
   #endif
   
   #include <asm-generic/topology.h>
@@@ -214,20 -226,18 +226,20 @@@
   /* Returns the number of the first CPU on Node 'node'. */
   static inline int node_to_first_cpu(int node)
   {
-       node_to_cpumask_ptr(mask, node);
-       return first_cpu(*mask);
+       return cpumask_first(cpumask_of_node(node));
   }
   #endif
   
   extern cpumask_t cpu_coregroup_map(int cpu);
+ extern const struct cpumask *cpu_coregroup_mask(int cpu);
   
   #ifdef ENABLE_TOPO_DEFINES
   #define topology_physical_package_id(cpu)     (cpu_data(cpu).phys_proc_id)
   #define topology_core_id(cpu)                 (cpu_data(cpu).cpu_core_id)
   #define topology_core_siblings(cpu)           (per_cpu(cpu_core_map, cpu))
   #define topology_thread_siblings(cpu)         (per_cpu(cpu_sibling_map, cpu))
+ +#define topology_core_cpumask(cpu)            (&per_cpu(cpu_core_map, cpu))
+ +#define topology_thread_cpumask(cpu)          (&per_cpu(cpu_sibling_map, cpu))
   
   /* indicates that pointers to the topology cpumask_t maps are valid */
   #define arch_provides_topology_pointers               yes
diff --combined arch/x86/kernel/apic.c

index b901927,6107b41..6b7f824
--- 1/arch/x86/kernel/apic.c
--- 2/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@@ -30,6 -30,7 +30,7 @@@
   #include <linux/module.h>
   #include <linux/dmi.h>
   #include <linux/dmar.h>
+ #include <linux/ftrace.h>
   
   #include <asm/atomic.h>
   #include <asm/smp.h>
@@@ -118,6 -119,8 +119,6 @@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o
   
   int first_system_vector = 0xfe;
   
- -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
- -
   /*
    * Debug level, exported for io_apic.c
    */
@@@ -139,7 -142,7 +140,7 @@@ static int lapic_next_event(unsigned lo
                             struct clock_event_device *evt);
   static void lapic_timer_setup(enum clock_event_mode mode,
                               struct clock_event_device *evt);
- -static void lapic_timer_broadcast(const struct cpumask *mask);
+ +static void lapic_timer_broadcast(const cpumask_t *mask);
   static void apic_pm_activate(void);
   
   /*
@@@ -452,10 -455,10 +453,10 @@@ static void lapic_timer_setup(enum cloc
   /*
    * Local APIC timer broadcast function
    */
- -static void lapic_timer_broadcast(const struct cpumask *mask)
+ +static void lapic_timer_broadcast(const cpumask_t *mask)
   {
   #ifdef CONFIG_SMP
- -      send_IPI_mask(*mask, LOCAL_TIMER_VECTOR);
+ +      send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
   #endif
   }
   
@@@ -775,11 -778,7 +776,7 @@@ static void local_apic_timer_interrupt(
         /*
          * the NMI deadlock-detector uses this.
          */
- #ifdef CONFIG_X86_64
-       add_pda(apic_timer_irqs, 1);
- #else
-       per_cpu(irq_stat, cpu).apic_timer_irqs++;
- #endif
+       inc_irq_stat(apic_timer_irqs);
   
         evt->event_handler(evt);
   }
@@@ -792,7 -791,7 +789,7 @@@
    * [ if a single-CPU system runs an SMP kernel then we call the local
    *   interrupt as well. Thus we cannot inline the local irq ... ]
    */
- void smp_apic_timer_interrupt(struct pt_regs *regs)
+ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
   {
         struct pt_regs *old_regs = set_irq_regs(regs);
   
@@@ -806,9 -805,7 +803,7 @@@
          * Besides, if we don't, timer interrupts ignore the global
          * interrupt lock, which is the WrongThing (tm) to do.
          */
- #ifdef CONFIG_X86_64
         exit_idle();
- #endif
         irq_enter();
         local_apic_timer_interrupt();
         irq_exit();
@@@ -1666,9 -1663,7 +1661,7 @@@ void smp_spurious_interrupt(struct pt_r
   {
         u32 v;
   
- #ifdef CONFIG_X86_64
         exit_idle();
- #endif
         irq_enter();
         /*
          * Check if this really is a spurious interrupt and ACK it
@@@ -1679,14 -1674,11 +1672,11 @@@
         if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
                 ack_APIC_irq();
   
- #ifdef CONFIG_X86_64
-       add_pda(irq_spurious_count, 1);
- #else
+       inc_irq_stat(irq_spurious_count);
+ 
         /* see sw-dev-man vol 3, chapter 7.4.13.5 */
         pr_info("spurious APIC interrupt on CPU#%d, "
                 "should never happen.\n", smp_processor_id());
-       __get_cpu_var(irq_stat).irq_spurious_count++;
- #endif
         irq_exit();
   }
   
@@@ -1697,9 -1689,7 +1687,7 @@@ void smp_error_interrupt(struct pt_reg
   {
         u32 v, v1;
   
- #ifdef CONFIG_X86_64
         exit_idle();
- #endif
         irq_enter();
         /* First tickle the hardware, only then report what went on. -- REW */
         v = apic_read(APIC_ESR);
@@@ -1817,32 -1807,28 +1805,32 @@@ void disconnect_bsp_APIC(int virt_wire_
   void __cpuinit generic_processor_info(int apicid, int version)
   {
         int cpu;
- -      cpumask_t tmp_map;
   
         /*
          * Validate version
          */
         if (version == 0x0) {
                 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
- -                      "fixing up to 0x10. (tell your hw vendor)\n",
- -                      version);
+ +                         "fixing up to 0x10. (tell your hw vendor)\n",
+ +                              version);
                 version = 0x10;
         }
         apic_version[apicid] = version;
   
- -      if (num_processors >= NR_CPUS) {
- -              pr_warning("WARNING: NR_CPUS limit of %i reached."
- -                      "  Processor ignored.\n", NR_CPUS);
+ +      if (num_processors >= nr_cpu_ids) {
+ +              int max = nr_cpu_ids;
+ +              int thiscpu = max + disabled_cpus;
+ +
+ +              pr_warning(
+ +                      "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+ +                      "  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+ +
+ +              disabled_cpus++;
                 return;
         }
   
         num_processors++;
- -      cpus_complement(tmp_map, cpu_present_map);
- -      cpu = first_cpu(tmp_map);
+ +      cpu = cpumask_next_zero(-1, cpu_present_mask);
   
         physid_set(apicid, phys_cpu_present_map);
         if (apicid == boot_cpu_physical_apicid) {
@@@ -1892,8 -1878,8 +1880,8 @@@
         }
   #endif
   
- -      cpu_set(cpu, cpu_possible_map);
- -      cpu_set(cpu, cpu_present_map);
+ +      set_cpu_possible(cpu, true);
+ +      set_cpu_present(cpu, true);
   }
   
   #ifdef CONFIG_X86_64
@@@ -2095,7 -2081,7 +2083,7 @@@ __cpuinit int apic_is_clustered_box(voi
         bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
         bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
   
- -      for (i = 0; i < NR_CPUS; i++) {
+ +      for (i = 0; i < nr_cpu_ids; i++) {
                 /* are we being called early in kernel startup? */
                 if (bios_cpu_apicid) {
                         id = bios_cpu_apicid[i];
diff --combined arch/x86/kernel/cpu/intel_cacheinfo.c

index 7bd00a5,15cf14e..48533d7
--- 1/arch/x86/kernel/cpu/intel_cacheinfo.c
--- 2/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@@ -534,16 -534,31 +534,16 @@@ static void __cpuinit free_cache_attrib
         per_cpu(cpuid4_info, cpu) = NULL;
   }
   
- -static int __cpuinit detect_cache_attributes(unsigned int cpu)
+ +static void __cpuinit get_cpu_leaves(void *_retval)
   {
- -      struct _cpuid4_info     *this_leaf;
- -      unsigned long           j;
- -      int                     retval;
- -      cpumask_t               oldmask;
- -
- -      if (num_cache_leaves == 0)
- -              return -ENOENT;
- -
- -      per_cpu(cpuid4_info, cpu) = kzalloc(
- -          sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- -      if (per_cpu(cpuid4_info, cpu) == NULL)
- -              return -ENOMEM;
- -
- -      oldmask = current->cpus_allowed;
- -      retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- -      if (retval)
- -              goto out;
+ +      int j, *retval = _retval, cpu = smp_processor_id();
   
         /* Do cpuid and store the results */
         for (j = 0; j < num_cache_leaves; j++) {
+ +              struct _cpuid4_info *this_leaf;
                 this_leaf = CPUID4_INFO_IDX(cpu, j);
- -              retval = cpuid4_cache_lookup(j, this_leaf);
- -              if (unlikely(retval < 0)) {
+ +              *retval = cpuid4_cache_lookup(j, this_leaf);
+ +              if (unlikely(*retval < 0)) {
                         int i;
   
                         for (i = 0; i < j; i++)
@@@ -552,21 -567,9 +552,21 @@@
                 }
                 cache_shared_cpu_map_setup(cpu, j);
         }
- -      set_cpus_allowed_ptr(current, &oldmask);
+ +}
+ +
+ +static int __cpuinit detect_cache_attributes(unsigned int cpu)
+ +{
+ +      int                     retval;
+ +
+ +      if (num_cache_leaves == 0)
+ +              return -ENOENT;
+ +
+ +      per_cpu(cpuid4_info, cpu) = kzalloc(
+ +          sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+ +      if (per_cpu(cpuid4_info, cpu) == NULL)
+ +              return -ENOMEM;
   
- -out:
+ +      smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
         if (retval) {
                 kfree(per_cpu(cpuid4_info, cpu));
                 per_cpu(cpuid4_info, cpu) = NULL;
@@@ -641,20 -644,17 +641,17 @@@ static inline ssize_t show_shared_cpu_l
         return show_shared_cpu_map_func(leaf, 1, buf);
   }
   
- static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
-       switch(this_leaf->eax.split.type) {
-           case CACHE_TYPE_DATA:
+ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+ {
+       switch (this_leaf->eax.split.type) {
+       case CACHE_TYPE_DATA:
                 return sprintf(buf, "Data\n");
-               break;
-           case CACHE_TYPE_INST:
+       case CACHE_TYPE_INST:
                 return sprintf(buf, "Instruction\n");
-               break;
-           case CACHE_TYPE_UNIFIED:
+       case CACHE_TYPE_UNIFIED:
                 return sprintf(buf, "Unified\n");
-               break;
-           default:
+       default:
                 return sprintf(buf, "Unknown\n");
-               break;
         }
   }
   
diff --combined arch/x86/kernel/cpu/mcheck/mce_amd_64.c

index a1de80f,748c8f9..a5a5e05
--- 1/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
--- 2/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@@ -83,41 -83,34 +83,41 @@@ static DEFINE_PER_CPU(unsigned char, ba
    * CPU Initialization
    */
   
+ +struct thresh_restart {
+ +      struct threshold_block *b;
+ +      int reset;
+ +      u16 old_limit;
+ +};
+ +
   /* must be called with correct cpu affinity */
- -static void threshold_restart_bank(struct threshold_block *b,
- -                                 int reset, u16 old_limit)
+ +static long threshold_restart_bank(void *_tr)
   {
+ +      struct thresh_restart *tr = _tr;
         u32 mci_misc_hi, mci_misc_lo;
   
- -      rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+ +      rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
   
- -      if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
- -              reset = 1;      /* limit cannot be lower than err count */
+ +      if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ +              tr->reset = 1;  /* limit cannot be lower than err count */
   
- -      if (reset) {            /* reset err count and overflow bit */
+ +      if (tr->reset) {                /* reset err count and overflow bit */
                 mci_misc_hi =
                     (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- -                  (THRESHOLD_MAX - b->threshold_limit);
- -      } else if (old_limit) { /* change limit w/o reset */
+ +                  (THRESHOLD_MAX - tr->b->threshold_limit);
+ +      } else if (tr->old_limit) {     /* change limit w/o reset */
                 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
- -                  (old_limit - b->threshold_limit);
+ +                  (tr->old_limit - tr->b->threshold_limit);
                 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
                     (new_count & THRESHOLD_MAX);
         }
   
- -      b->interrupt_enable ?
+ +      tr->b->interrupt_enable ?
             (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
             (mci_misc_hi &= ~MASK_INT_TYPE_HI);
   
         mci_misc_hi |= MASK_COUNT_EN_HI;
- -      wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+ +      wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ +      return 0;
   }
   
   /* cpu init entry point, called from mce.c with preempt off */
@@@ -127,7 -120,6 +127,7 @@@ void __cpuinit mce_amd_feature_init(str
         unsigned int cpu = smp_processor_id();
         u8 lvt_off;
         u32 low = 0, high = 0, address = 0;
+ +      struct thresh_restart tr;
   
         for (bank = 0; bank < NR_BANKS; ++bank) {
                 for (block = 0; block < NR_BLOCKS; ++block) {
@@@ -170,10 -162,7 +170,10 @@@
                         wrmsr(address, low, high);
   
                         threshold_defaults.address = address;
- -                      threshold_restart_bank(&threshold_defaults, 0, 0);
+ +                      tr.b = &threshold_defaults;
+ +                      tr.reset = 0;
+ +                      tr.old_limit = 0;
+ +                      threshold_restart_bank(&tr);
                 }
         }
   }
@@@ -248,7 -237,7 +248,7 @@@ asmlinkage void mce_threshold_interrupt
                 }
         }
   out:
-       add_pda(irq_threshold_count, 1);
+       inc_irq_stat(irq_threshold_count);
         irq_exit();
   }
   
@@@ -262,6 -251,20 +262,6 @@@ struct threshold_attr 
         ssize_t(*store) (struct threshold_block *, const char *, size_t count);
   };
   
- -static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
- -                                         cpumask_t *newmask)
- -{
- -      *oldmask = current->cpus_allowed;
- -      cpus_clear(*newmask);
- -      cpu_set(cpu, *newmask);
- -      set_cpus_allowed_ptr(current, newmask);
- -}
- -
- -static void affinity_restore(const cpumask_t *oldmask)
- -{
- -      set_cpus_allowed_ptr(current, oldmask);
- -}
- -
   #define SHOW_FIELDS(name)                                           \
   static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
   {                                                                   \
@@@ -274,16 -277,15 +274,16 @@@ static ssize_t store_interrupt_enable(s
                                       const char *buf, size_t count)
   {
         char *end;
- -      cpumask_t oldmask, newmask;
+ +      struct thresh_restart tr;
         unsigned long new = simple_strtoul(buf, &end, 0);
         if (end == buf)
                 return -EINVAL;
         b->interrupt_enable = !!new;
   
- -      affinity_set(b->cpu, &oldmask, &newmask);
- -      threshold_restart_bank(b, 0, 0);
- -      affinity_restore(&oldmask);
+ +      tr.b = b;
+ +      tr.reset = 0;
+ +      tr.old_limit = 0;
+ +      work_on_cpu(b->cpu, threshold_restart_bank, &tr);
   
         return end - buf;
   }
@@@ -292,7 -294,8 +292,7 @@@ static ssize_t store_threshold_limit(st
                                      const char *buf, size_t count)
   {
         char *end;
- -      cpumask_t oldmask, newmask;
- -      u16 old;
+ +      struct thresh_restart tr;
         unsigned long new = simple_strtoul(buf, &end, 0);
         if (end == buf)
                 return -EINVAL;
@@@ -300,36 -303,34 +300,36 @@@
                 new = THRESHOLD_MAX;
         if (new < 1)
                 new = 1;
- -      old = b->threshold_limit;
+ +      tr.old_limit = b->threshold_limit;
         b->threshold_limit = new;
+ +      tr.b = b;
+ +      tr.reset = 0;
   
- -      affinity_set(b->cpu, &oldmask, &newmask);
- -      threshold_restart_bank(b, 0, old);
- -      affinity_restore(&oldmask);
+ +      work_on_cpu(b->cpu, threshold_restart_bank, &tr);
   
         return end - buf;
   }
   
- -static ssize_t show_error_count(struct threshold_block *b, char *buf)
+ +static long local_error_count(void *_b)
   {
- -      u32 high, low;
- -      cpumask_t oldmask, newmask;
- -      affinity_set(b->cpu, &oldmask, &newmask);
+ +      struct threshold_block *b = _b;
+ +      u32 low, high;
+ +
         rdmsr(b->address, low, high);
- -      affinity_restore(&oldmask);
- -      return sprintf(buf, "%x\n",
- -                     (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+ +      return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+ +}
+ +
+ +static ssize_t show_error_count(struct threshold_block *b, char *buf)
+ +{
+ +      return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
   }
   
   static ssize_t store_error_count(struct threshold_block *b,
                                  const char *buf, size_t count)
   {
- -      cpumask_t oldmask, newmask;
- -      affinity_set(b->cpu, &oldmask, &newmask);
- -      threshold_restart_bank(b, 1, 0);
- -      affinity_restore(&oldmask);
+ +      struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+ +
+ +      work_on_cpu(b->cpu, threshold_restart_bank, &tr);
         return 1;
   }
   
@@@ -462,19 -463,12 +462,19 @@@ out_free
         return err;
   }
   
+ +static long local_allocate_threshold_blocks(void *_bank)
+ +{
+ +      unsigned int *bank = _bank;
+ +
+ +      return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+ +                                       MSR_IA32_MC0_MISC + *bank * 4);
+ +}
+ +
   /* symlinks sibling shared banks to first core.  first core owns dir/files. */
   static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
   {
         int i, err = 0;
         struct threshold_bank *b = NULL;
- -      cpumask_t oldmask, newmask;
         char name[32];
   
         sprintf(name, "threshold_bank%i", bank);
@@@ -525,7 -519,11 +525,7 @@@
   
         per_cpu(threshold_banks, cpu)[bank] = b;
   
- -      affinity_set(cpu, &oldmask, &newmask);
- -      err = allocate_threshold_blocks(cpu, bank, 0,
- -                                      MSR_IA32_MC0_MISC + bank * 4);
- -      affinity_restore(&oldmask);
- -
+ +      err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
         if (err)
                 goto out_free;
   
diff --combined arch/x86/kernel/genx2apic_uv_x.c

index 0e88be1,dece172..b193e08
--- 1/arch/x86/kernel/genx2apic_uv_x.c
--- 2/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@@ -10,6 -10,7 +10,7 @@@
   
   #include <linux/kernel.h>
   #include <linux/threads.h>
+ #include <linux/cpu.h>
   #include <linux/cpumask.h>
   #include <linux/string.h>
   #include <linux/ctype.h>
@@@ -17,6 -18,9 +18,9 @@@
   #include <linux/sched.h>
   #include <linux/module.h>
   #include <linux/hardirq.h>
+ #include <linux/timer.h>
+ #include <linux/proc_fs.h>
+ #include <asm/current.h>
   #include <asm/smp.h>
   #include <asm/ipi.h>
   #include <asm/genapic.h>
@@@ -75,15 -79,16 +79,15 @@@ EXPORT_SYMBOL(sn_rtc_cycles_per_second)
   
   /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
   
- -static cpumask_t uv_target_cpus(void)
+ +static const struct cpumask *uv_target_cpus(void)
   {
- -      return cpumask_of_cpu(0);
+ +      return cpumask_of(0);
   }
   
- -static cpumask_t uv_vector_allocation_domain(int cpu)
+ +static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
   {
- -      cpumask_t domain = CPU_MASK_NONE;
- -      cpu_set(cpu, domain);
- -      return domain;
+ +      cpumask_clear(retmask);
+ +      cpumask_set_cpu(cpu, retmask);
   }
   
   int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@@ -122,37 -127,28 +126,37 @@@ static void uv_send_IPI_one(int cpu, in
         uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
   }
   
- -static void uv_send_IPI_mask(cpumask_t mask, int vector)
+ +static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
   {
         unsigned int cpu;
   
- -      for_each_possible_cpu(cpu)
- -              if (cpu_isset(cpu, mask))
+ +      for_each_cpu(cpu, mask)
+ +              uv_send_IPI_one(cpu, vector);
+ +}
+ +
+ +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+ +{
+ +      unsigned int cpu;
+ +      unsigned int this_cpu = smp_processor_id();
+ +
+ +      for_each_cpu(cpu, mask)
+ +              if (cpu != this_cpu)
                         uv_send_IPI_one(cpu, vector);
   }
   
   static void uv_send_IPI_allbutself(int vector)
   {
- -      cpumask_t mask = cpu_online_map;
- -
- -      cpu_clear(smp_processor_id(), mask);
+ +      unsigned int cpu;
+ +      unsigned int this_cpu = smp_processor_id();
   
- -      if (!cpus_empty(mask))
- -              uv_send_IPI_mask(mask, vector);
+ +      for_each_online_cpu(cpu)
+ +              if (cpu != this_cpu)
+ +                      uv_send_IPI_one(cpu, vector);
   }
   
   static void uv_send_IPI_all(int vector)
   {
- -      uv_send_IPI_mask(cpu_online_map, vector);
+ +      uv_send_IPI_mask(cpu_online_mask, vector);
   }
   
   static int uv_apic_id_registered(void)
@@@ -164,7 -160,7 +168,7 @@@ static void uv_init_apic_ldr(void
   {
   }
   
- -static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+ +static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
   {
         int cpu;
   
@@@ -172,30 -168,13 +176,30 @@@
          * We're using fixed IRQ delivery, can only return one phys APIC ID.
          * May as well be the first.
          */
- -      cpu = first_cpu(cpumask);
+ +      cpu = cpumask_first(cpumask);
         if ((unsigned)cpu < nr_cpu_ids)
                 return per_cpu(x86_cpu_to_apicid, cpu);
         else
                 return BAD_APICID;
   }
   
+ +static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ +                                            const struct cpumask *andmask)
+ +{
+ +      int cpu;
+ +
+ +      /*
+ +       * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ +       * May as well be the first.
+ +       */
+ +      for_each_cpu_and(cpu, cpumask, andmask)
+ +              if (cpumask_test_cpu(cpu, cpu_online_mask))
+ +                      break;
+ +      if (cpu < nr_cpu_ids)
+ +              return per_cpu(x86_cpu_to_apicid, cpu);
+ +      return BAD_APICID;
+ +}
+ +
   static unsigned int get_apic_id(unsigned long x)
   {
         unsigned int id;
@@@ -243,10 -222,8 +247,10 @@@ struct genapic apic_x2apic_uv_x = 
         .send_IPI_all = uv_send_IPI_all,
         .send_IPI_allbutself = uv_send_IPI_allbutself,
         .send_IPI_mask = uv_send_IPI_mask,
+ +      .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
         .send_IPI_self = uv_send_IPI_self,
         .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ +      .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
         .phys_pkg_id = phys_pkg_id,
         .get_apic_id = get_apic_id,
         .set_apic_id = set_apic_id,
@@@ -382,6 -359,103 +386,103 @@@ static __init void uv_rtc_init(void
                 sn_rtc_cycles_per_second = ticks_per_sec;
   }
   
+ /*
+  * percpu heartbeat timer
+  */
+ static void uv_heartbeat(unsigned long ignored)
+ {
+       struct timer_list *timer = &uv_hub_info->scir.timer;
+       unsigned char bits = uv_hub_info->scir.state;
+ 
+       /* flip heartbeat bit */
+       bits ^= SCIR_CPU_HEARTBEAT;
+ 
+       /* is this cpu idle? */
+       if (idle_cpu(raw_smp_processor_id()))
+               bits &= ~SCIR_CPU_ACTIVITY;
+       else
+               bits |= SCIR_CPU_ACTIVITY;
+ 
+       /* update system controller interface reg */
+       uv_set_scir_bits(bits);
+ 
+       /* enable next timer period */
+       mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ }
+ 
+ static void __cpuinit uv_heartbeat_enable(int cpu)
+ {
+       if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+               struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+ 
+               uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+               setup_timer(timer, uv_heartbeat, cpu);
+               timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+               add_timer_on(timer, cpu);
+               uv_cpu_hub_info(cpu)->scir.enabled = 1;
+       }
+ 
+       /* check boot cpu */
+       if (!uv_cpu_hub_info(0)->scir.enabled)
+               uv_heartbeat_enable(0);
+ }
+ 
+ #ifdef CONFIG_HOTPLUG_CPU
+ static void __cpuinit uv_heartbeat_disable(int cpu)
+ {
+       if (uv_cpu_hub_info(cpu)->scir.enabled) {
+               uv_cpu_hub_info(cpu)->scir.enabled = 0;
+               del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+       }
+       uv_set_cpu_scir_bits(cpu, 0xff);
+ }
+ 
+ /*
+  * cpu hotplug notifier
+  */
+ static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
+                                      unsigned long action, void *hcpu)
+ {
+       long cpu = (long)hcpu;
+ 
+       switch (action) {
+       case CPU_ONLINE:
+               uv_heartbeat_enable(cpu);
+               break;
+       case CPU_DOWN_PREPARE:
+               uv_heartbeat_disable(cpu);
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+ }
+ 
+ static __init void uv_scir_register_cpu_notifier(void)
+ {
+       hotcpu_notifier(uv_scir_cpu_notify, 0);
+ }
+ 
+ #else /* !CONFIG_HOTPLUG_CPU */
+ 
+ static __init void uv_scir_register_cpu_notifier(void)
+ {
+ }
+ 
+ static __init int uv_init_heartbeat(void)
+ {
+       int cpu;
+ 
+       if (is_uv_system())
+               for_each_online_cpu(cpu)
+                       uv_heartbeat_enable(cpu);
+       return 0;
+ }
+ 
+ late_initcall(uv_init_heartbeat);
+ 
+ #endif /* !CONFIG_HOTPLUG_CPU */
+ 
   /*
    * Called on each cpu to initialize the per_cpu UV data area.
    *    ZZZ hotplug not supported yet
@@@ -455,7 -529,7 +556,7 @@@ void __init uv_system_init(void
   
         uv_bios_init();
         uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
-                           &uv_coherency_id, &uv_region_size);
+                           &sn_coherency_id, &sn_region_size);
         uv_rtc_init();
   
         for_each_present_cpu(cpu) {
@@@ -466,8 -540,7 +567,7 @@@
                 uv_blade_info[blade].nr_possible_cpus++;
   
                 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
-               uv_cpu_hub_info(cpu)->lowmem_remap_top =
-                                       lowmem_redir_base + lowmem_redir_size;
+               uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
                 uv_cpu_hub_info(cpu)->m_val = m_val;
                 uv_cpu_hub_info(cpu)->n_val = m_val;
                 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@@ -477,7 -550,8 +577,8 @@@
                 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
                 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
                 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
-               uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
+               uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
+               uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
                 uv_node_to_blade[nid] = blade;
                 uv_cpu_to_blade[cpu] = blade;
                 max_pnode = max(pnode, max_pnode);
@@@ -494,4 -568,6 +595,6 @@@
         map_mmioh_high(max_pnode);
   
         uv_cpu_init();
+       uv_scir_register_cpu_notifier();
+       proc_mkdir("sgi_uv", NULL);
   }
diff --combined arch/x86/kernel/io_apic.c

index 1cbf7c8,e774596..3e070bb
--- 1/arch/x86/kernel/io_apic.c
--- 2/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@@ -136,8 -136,8 +136,8 @@@ static struct irq_pin_list *get_one_fre
   
   struct irq_cfg {
         struct irq_pin_list *irq_2_pin;
- -      cpumask_t domain;
- -      cpumask_t old_domain;
+ +      cpumask_var_t domain;
+ +      cpumask_var_t old_domain;
         unsigned move_cleanup_count;
         u8 vector;
         u8 move_in_progress : 1;
@@@ -152,22 -152,22 +152,22 @@@ static struct irq_cfg irq_cfgx[] = 
   #else
   static struct irq_cfg irq_cfgx[NR_IRQS] = {
   #endif
- -      [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
- -      [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
- -      [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
- -      [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
- -      [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
- -      [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
- -      [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
- -      [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
- -      [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
- -      [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
- -      [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
- -      [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
- -      [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
- -      [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
- -      [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
- -      [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+ +      [0]  = { .vector = IRQ0_VECTOR,  },
+ +      [1]  = { .vector = IRQ1_VECTOR,  },
+ +      [2]  = { .vector = IRQ2_VECTOR,  },
+ +      [3]  = { .vector = IRQ3_VECTOR,  },
+ +      [4]  = { .vector = IRQ4_VECTOR,  },
+ +      [5]  = { .vector = IRQ5_VECTOR,  },
+ +      [6]  = { .vector = IRQ6_VECTOR,  },
+ +      [7]  = { .vector = IRQ7_VECTOR,  },
+ +      [8]  = { .vector = IRQ8_VECTOR,  },
+ +      [9]  = { .vector = IRQ9_VECTOR,  },
+ +      [10] = { .vector = IRQ10_VECTOR, },
+ +      [11] = { .vector = IRQ11_VECTOR, },
+ +      [12] = { .vector = IRQ12_VECTOR, },
+ +      [13] = { .vector = IRQ13_VECTOR, },
+ +      [14] = { .vector = IRQ14_VECTOR, },
+ +      [15] = { .vector = IRQ15_VECTOR, },
   };
   
   void __init arch_early_irq_init(void)
@@@ -183,10 -183,6 +183,10 @@@
         for (i = 0; i < count; i++) {
                 desc = irq_to_desc(i);
                 desc->chip_data = &cfg[i];
+ +              alloc_bootmem_cpumask_var(&cfg[i].domain);
+ +              alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ +              if (i < NR_IRQS_LEGACY)
+ +                      cpumask_setall(cfg[i].domain);
         }
   }
   
@@@ -211,20 -207,6 +211,20 @@@ static struct irq_cfg *get_one_free_irq
         node = cpu_to_node(cpu);
   
         cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+ +      if (cfg) {
+ +              /* FIXME: needs alloc_cpumask_var_node() */
+ +              if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+ +                      kfree(cfg);
+ +                      cfg = NULL;
+ +              } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+ +                      free_cpumask_var(cfg->domain);
+ +                      kfree(cfg);
+ +                      cfg = NULL;
+ +              } else {
+ +                      cpumask_clear(cfg->domain);
+ +                      cpumask_clear(cfg->old_domain);
+ +              }
+ +      }
         printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
   
         return cfg;
@@@ -347,14 -329,13 +347,14 @@@ void arch_free_chip_data(struct irq_des
         }
   }
   
- -static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+ +static void
+ +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
   {
         struct irq_cfg *cfg = desc->chip_data;
   
         if (!cfg->move_in_progress) {
                 /* it means that domain is not changed */
- -              if (!cpus_intersects(desc->affinity, mask))
+ +              if (!cpumask_intersects(&desc->affinity, mask))
                         cfg->move_desc_pending = 1;
         }
   }
@@@ -369,8 -350,7 +369,8 @@@ static struct irq_cfg *irq_cfg(unsigne
   #endif
   
   #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
- -static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+ +static inline void
+ +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
   {
   }
   #endif
@@@ -501,26 -481,6 +501,26 @@@ static void ioapic_mask_entry(int apic
   }
   
   #ifdef CONFIG_SMP
+ +static void send_cleanup_vector(struct irq_cfg *cfg)
+ +{
+ +      cpumask_var_t cleanup_mask;
+ +
+ +      if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ +              unsigned int i;
+ +              cfg->move_cleanup_count = 0;
+ +              for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ +                      cfg->move_cleanup_count++;
+ +              for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ +                      send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ +      } else {
+ +              cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ +              cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ +              send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ +              free_cpumask_var(cleanup_mask);
+ +      }
+ +      cfg->move_in_progress = 0;
+ +}
+ +
   static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
   {
         int apic, pin;
@@@ -556,61 -516,48 +556,61 @@@
         }
   }
   
- -static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
+ +static int
+ +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
   
- -static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
+ +/*
+ + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
+ + * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ + */
+ +static unsigned int
+ +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
   {
         struct irq_cfg *cfg;
- -      unsigned long flags;
- -      unsigned int dest;
- -      cpumask_t tmp;
         unsigned int irq;
   
- -      cpus_and(tmp, mask, cpu_online_map);
- -      if (cpus_empty(tmp))
- -              return;
+ +      if (!cpumask_intersects(mask, cpu_online_mask))
+ +              return BAD_APICID;
   
         irq = desc->irq;
         cfg = desc->chip_data;
         if (assign_irq_vector(irq, cfg, mask))
- -              return;
+ +              return BAD_APICID;
   
+ +      cpumask_and(&desc->affinity, cfg->domain, mask);
         set_extra_move_desc(desc, mask);
+ +      return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+ +}
   
- -      cpus_and(tmp, cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
- -      /*
- -       * Only the high 8 bits are valid.
- -       */
- -      dest = SET_APIC_LOGICAL_ID(dest);
+ +static void
+ +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ +{
+ +      struct irq_cfg *cfg;
+ +      unsigned long flags;
+ +      unsigned int dest;
+ +      unsigned int irq;
+ +
+ +      irq = desc->irq;
+ +      cfg = desc->chip_data;
   
         spin_lock_irqsave(&ioapic_lock, flags);
- -      __target_IO_APIC_irq(irq, dest, cfg);
- -      desc->affinity = mask;
+ +      dest = set_desc_affinity(desc, mask);
+ +      if (dest != BAD_APICID) {
+ +              /* Only the high 8 bits are valid. */
+ +              dest = SET_APIC_LOGICAL_ID(dest);
+ +              __target_IO_APIC_irq(irq, dest, cfg);
+ +      }
         spin_unlock_irqrestore(&ioapic_lock, flags);
   }
   
- -static void set_ioapic_affinity_irq(unsigned int irq,
- -                                  const struct cpumask *mask)
+ +static void
+ +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
   {
         struct irq_desc *desc;
   
         desc = irq_to_desc(irq);
   
- -      set_ioapic_affinity_irq_desc(desc, *mask);
+ +      set_ioapic_affinity_irq_desc(desc, mask);
   }
   #endif /* CONFIG_SMP */
   
@@@ -1272,8 -1219,7 +1272,8 @@@ void unlock_vector_lock(void
         spin_unlock(&vector_lock);
   }
   
- -static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
+ +static int
+ +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
   {
         /*
          * NOTE! The local APIC isn't very good at handling
@@@ -1288,49 -1234,49 +1288,49 @@@
          */
         static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
         unsigned int old_vector;
- -      int cpu;
+ +      int cpu, err;
+ +      cpumask_var_t tmp_mask;
   
         if ((cfg->move_in_progress) || cfg->move_cleanup_count)
                 return -EBUSY;
   
- -      /* Only try and allocate irqs on cpus that are present */
- -      cpus_and(mask, mask, cpu_online_map);
+ +      if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ +              return -ENOMEM;
   
         old_vector = cfg->vector;
         if (old_vector) {
- -              cpumask_t tmp;
- -              cpus_and(tmp, cfg->domain, mask);
- -              if (!cpus_empty(tmp))
+ +              cpumask_and(tmp_mask, mask, cpu_online_mask);
+ +              cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ +              if (!cpumask_empty(tmp_mask)) {
+ +                      free_cpumask_var(tmp_mask);
                         return 0;
+ +              }
         }
   
- -      for_each_cpu_mask_nr(cpu, mask) {
- -              cpumask_t domain, new_mask;
+ +      /* Only try and allocate irqs on cpus that are present */
+ +      err = -ENOSPC;
+ +      for_each_cpu_and(cpu, mask, cpu_online_mask) {
                 int new_cpu;
                 int vector, offset;
   
- -              domain = vector_allocation_domain(cpu);
- -              cpus_and(new_mask, domain, cpu_online_map);
+ +              vector_allocation_domain(cpu, tmp_mask);
   
                 vector = current_vector;
                 offset = current_offset;
   next:
                 vector += 8;
                 if (vector >= first_system_vector) {
- -                      /* If we run out of vectors on large boxen, must share them. */
+ +                      /* If out of vectors on large boxen, must share them. */
                         offset = (offset + 1) % 8;
                         vector = FIRST_DEVICE_VECTOR + offset;
                 }
                 if (unlikely(current_vector == vector))
                         continue;
- -#ifdef CONFIG_X86_64
- -              if (vector == IA32_SYSCALL_VECTOR)
- -                      goto next;
- -#else
- -              if (vector == SYSCALL_VECTOR)
+ +
+ +              if (test_bit(vector, used_vectors))
                         goto next;
- -#endif
- -              for_each_cpu_mask_nr(new_cpu, new_mask)
+ +
+ +              for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                         if (per_cpu(vector_irq, new_cpu)[vector] != -1)
                                 goto next;
                 /* Found one! */
@@@ -1338,21 -1284,18 +1338,21 @@@
                 current_offset = offset;
                 if (old_vector) {
                         cfg->move_in_progress = 1;
- -                      cfg->old_domain = cfg->domain;
+ +                      cpumask_copy(cfg->old_domain, cfg->domain);
                 }
- -              for_each_cpu_mask_nr(new_cpu, new_mask)
+ +              for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                         per_cpu(vector_irq, new_cpu)[vector] = irq;
                 cfg->vector = vector;
- -              cfg->domain = domain;
- -              return 0;
+ +              cpumask_copy(cfg->domain, tmp_mask);
+ +              err = 0;
+ +              break;
         }
- -      return -ENOSPC;
+ +      free_cpumask_var(tmp_mask);
+ +      return err;
   }
   
- -static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
+ +static int
+ +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
   {
         int err;
         unsigned long flags;
@@@ -1365,20 -1308,23 +1365,20 @@@
   
   static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
   {
- -      cpumask_t mask;
         int cpu, vector;
   
         BUG_ON(!cfg->vector);
   
         vector = cfg->vector;
- -      cpus_and(mask, cfg->domain, cpu_online_map);
- -      for_each_cpu_mask_nr(cpu, mask)
+ +      for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
                 per_cpu(vector_irq, cpu)[vector] = -1;
   
         cfg->vector = 0;
- -      cpus_clear(cfg->domain);
+ +      cpumask_clear(cfg->domain);
   
         if (likely(!cfg->move_in_progress))
                 return;
- -      cpus_and(mask, cfg->old_domain, cpu_online_map);
- -      for_each_cpu_mask_nr(cpu, mask) {
+ +      for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
                 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
                                                                 vector++) {
                         if (per_cpu(vector_irq, cpu)[vector] != irq)
@@@ -1403,7 -1349,7 +1403,7 @@@ void __setup_vector_irq(int cpu
                 if (!desc)
                         continue;
                 cfg = desc->chip_data;
- -              if (!cpu_isset(cpu, cfg->domain))
+ +              if (!cpumask_test_cpu(cpu, cfg->domain))
                         continue;
                 vector = cfg->vector;
                 per_cpu(vector_irq, cpu)[vector] = irq;
@@@ -1415,7 -1361,7 +1415,7 @@@
                         continue;
   
                 cfg = irq_cfg(irq);
- -              if (!cpu_isset(cpu, cfg->domain))
+ +              if (!cpumask_test_cpu(cpu, cfg->domain))
                         per_cpu(vector_irq, cpu)[vector] = -1;
         }
   }
@@@ -1551,17 -1497,18 +1551,17 @@@ static void setup_IO_APIC_irq(int apic
   {
         struct irq_cfg *cfg;
         struct IO_APIC_route_entry entry;
- -      cpumask_t mask;
+ +      unsigned int dest;
   
         if (!IO_APIC_IRQ(irq))
                 return;
   
         cfg = desc->chip_data;
   
- -      mask = TARGET_CPUS;
- -      if (assign_irq_vector(irq, cfg, mask))
+ +      if (assign_irq_vector(irq, cfg, TARGET_CPUS))
                 return;
   
- -      cpus_and(mask, cfg->domain, mask);
+ +      dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
   
         apic_printk(APIC_VERBOSE,KERN_DEBUG
                     "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@@ -1571,7 -1518,8 +1571,7 @@@
   
   
         if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- -                             cpu_mask_to_apicid(mask), trigger, polarity,
- -                             cfg->vector)) {
+ +                             dest, trigger, polarity, cfg->vector)) {
                 printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                        mp_ioapics[apic].mp_apicid, pin);
                 __clear_irq_vector(irq, cfg);
@@@ -2293,7 -2241,7 +2293,7 @@@ static int ioapic_retrigger_irq(unsigne
         unsigned long flags;
   
         spin_lock_irqsave(&vector_lock, flags);
- -      send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+ +      send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
         spin_unlock_irqrestore(&vector_lock, flags);
   
         return 1;
@@@ -2342,17 -2290,18 +2342,17 @@@ static DECLARE_DELAYED_WORK(ir_migratio
    * as simple as edge triggered migration and we can do the irq migration
    * with a simple atomic update to IO-APIC RTE.
    */
- -static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
+ +static void
+ +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
   {
         struct irq_cfg *cfg;
- -      cpumask_t tmp, cleanup_mask;
         struct irte irte;
         int modify_ioapic_rte;
         unsigned int dest;
         unsigned long flags;
         unsigned int irq;
   
- -      cpus_and(tmp, mask, cpu_online_map);
- -      if (cpus_empty(tmp))
+ +      if (!cpumask_intersects(mask, cpu_online_mask))
                 return;
   
         irq = desc->irq;
@@@ -2365,7 -2314,8 +2365,7 @@@
   
         set_extra_move_desc(desc, mask);
   
- -      cpus_and(tmp, cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
+ +      dest = cpu_mask_to_apicid_and(cfg->domain, mask);
   
         modify_ioapic_rte = desc->status & IRQ_LEVEL;
         if (modify_ioapic_rte) {
@@@ -2382,10 -2332,14 +2382,10 @@@
          */
         modify_irte(irq, &irte);
   
- -      if (cfg->move_in_progress) {
- -              cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- -              cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- -              send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- -              cfg->move_in_progress = 0;
- -      }
+ +      if (cfg->move_in_progress)
+ +              send_cleanup_vector(cfg);
   
- -      desc->affinity = mask;
+ +      cpumask_copy(&desc->affinity, mask);
   }
   
   static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@@ -2407,11 -2361,11 +2407,11 @@@
         }
   
         /* everthing is clear. we have right of way */
- -      migrate_ioapic_irq_desc(desc, desc->pending_mask);
+ +      migrate_ioapic_irq_desc(desc, &desc->pending_mask);
   
         ret = 0;
         desc->status &= ~IRQ_MOVE_PENDING;
- -      cpus_clear(desc->pending_mask);
+ +      cpumask_clear(&desc->pending_mask);
   
   unmask:
         unmask_IO_APIC_irq_desc(desc);
@@@ -2448,12 -2402,11 +2448,12 @@@ static void ir_irq_migration(struct wor
   /*
    * Migrates the IRQ destination in the process context.
    */
- -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
+ +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ +                                          const struct cpumask *mask)
   {
         if (desc->status & IRQ_LEVEL) {
                 desc->status |= IRQ_MOVE_PENDING;
- -              desc->pending_mask = mask;
+ +              cpumask_copy(&desc->pending_mask, mask);
                 migrate_irq_remapped_level_desc(desc);
                 return;
         }
@@@ -2465,17 -2418,16 +2465,16 @@@ static void set_ir_ioapic_affinity_irq(
   {
         struct irq_desc *desc = irq_to_desc(irq);
   
- -      set_ir_ioapic_affinity_irq_desc(desc, *mask);
+ +      set_ir_ioapic_affinity_irq_desc(desc, mask);
   }
   #endif
   
   asmlinkage void smp_irq_move_cleanup_interrupt(void)
   {
         unsigned vector, me;
+ 
         ack_APIC_irq();
- #ifdef CONFIG_X86_64
         exit_idle();
- #endif
         irq_enter();
   
         me = smp_processor_id();
@@@ -2497,7 -2449,7 +2496,7 @@@
                 if (!cfg->move_cleanup_count)
                         goto unlock;
   
- -              if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ +              if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                         goto unlock;
   
                 __get_cpu_var(vector_irq)[vector] = -1;
@@@ -2520,7 -2472,7 +2519,7 @@@ static void irq_complete_move(struct ir
                 if (likely(!cfg->move_desc_pending))
                         return;
   
-               /* domain is not change, but affinity is changed */
+               /* domain has not changed, but affinity did */
                 me = smp_processor_id();
                 if (cpu_isset(me, desc->affinity)) {
                         *descp = desc = move_irq_desc(desc, me);
@@@ -2534,14 -2486,20 +2533,14 @@@
   
         vector = ~get_irq_regs()->orig_ax;
         me = smp_processor_id();
- -      if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
- -              cpumask_t cleanup_mask;
- -
   #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
                 *descp = desc = move_irq_desc(desc, me);
                 /* get the new one */
                 cfg = desc->chip_data;
   #endif
   
- -              cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- -              cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- -              send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- -              cfg->move_in_progress = 0;
- -      }
+ +      if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ +              send_cleanup_vector(cfg);
   }
   #else
   static inline void irq_complete_move(struct irq_desc **descp) {}
@@@ -3266,13 -3224,16 +3265,13 @@@ static int msi_compose_msg(struct pci_d
         struct irq_cfg *cfg;
         int err;
         unsigned dest;
- -      cpumask_t tmp;
   
         cfg = irq_cfg(irq);
- -      tmp = TARGET_CPUS;
- -      err = assign_irq_vector(irq, cfg, tmp);
+ +      err = assign_irq_vector(irq, cfg, TARGET_CPUS);
         if (err)
                 return err;
   
- -      cpus_and(tmp, cfg->domain, tmp);
- -      dest = cpu_mask_to_apicid(tmp);
+ +      dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
   
   #ifdef CONFIG_INTR_REMAP
         if (irq_remapped(irq)) {
@@@ -3332,12 -3293,19 +3331,12 @@@ static void set_msi_irq_affinity(unsign
         struct irq_cfg *cfg;
         struct msi_msg msg;
         unsigned int dest;
- -      cpumask_t tmp;
   
- -      if (!cpumask_intersects(mask, cpu_online_mask))
+ +      dest = set_desc_affinity(desc, mask);
+ +      if (dest == BAD_APICID)
                 return;
   
         cfg = desc->chip_data;
- -      if (assign_irq_vector(irq, cfg, *mask))
- -              return;
- -
- -      set_extra_move_desc(desc, *mask);
- -
- -      cpumask_and(&tmp, &cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
   
         read_msi_msg_desc(desc, &msg);
   
@@@ -3347,27 -3315,37 +3346,27 @@@
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
   
         write_msi_msg_desc(desc, &msg);
- -      cpumask_copy(&desc->affinity, mask);
   }
   #ifdef CONFIG_INTR_REMAP
   /*
    * Migrate the MSI irq to another cpumask. This migration is
    * done in the process context using interrupt-remapping hardware.
    */
- -static void ir_set_msi_irq_affinity(unsigned int irq,
- -                                  const struct cpumask *mask)
+ +static void
+ +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
   {
         struct irq_desc *desc = irq_to_desc(irq);
- -      struct irq_cfg *cfg;
+ +      struct irq_cfg *cfg = desc->chip_data;
         unsigned int dest;
- -      cpumask_t tmp, cleanup_mask;
         struct irte irte;
   
- -      if (!cpumask_intersects(mask, cpu_online_mask))
- -              return;
- -
         if (get_irte(irq, &irte))
                 return;
   
- -      cfg = desc->chip_data;
- -      if (assign_irq_vector(irq, cfg, *mask))
+ +      dest = set_desc_affinity(desc, mask);
+ +      if (dest == BAD_APICID)
                 return;
   
- -      set_extra_move_desc(desc, *mask);
- -
- -      cpumask_and(&tmp, &cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
- -
         irte.vector = cfg->vector;
         irte.dest_id = IRTE_DEST(dest);
   
@@@ -3381,8 -3359,14 +3380,8 @@@
          * at the new destination. So, time to cleanup the previous
          * vector allocation.
          */
- -      if (cfg->move_in_progress) {
- -              cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- -              cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- -              send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- -              cfg->move_in_progress = 0;
- -      }
- -
- -      cpumask_copy(&desc->affinity, mask);
+ +      if (cfg->move_in_progress)
+ +              send_cleanup_vector(cfg);
   }
   
   #endif
@@@ -3579,12 -3563,19 +3578,12 @@@ static void dmar_msi_set_affinity(unsig
         struct irq_cfg *cfg;
         struct msi_msg msg;
         unsigned int dest;
- -      cpumask_t tmp;
   
- -      if (!cpumask_intersects(mask, cpu_online_mask))
+ +      dest = set_desc_affinity(desc, mask);
+ +      if (dest == BAD_APICID)
                 return;
   
         cfg = desc->chip_data;
- -      if (assign_irq_vector(irq, cfg, *mask))
- -              return;
- -
- -      set_extra_move_desc(desc, *mask);
- -
- -      cpumask_and(&tmp, &cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
   
         dmar_msi_read(irq, &msg);
   
@@@ -3594,6 -3585,7 +3593,6 @@@
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
   
         dmar_msi_write(irq, &msg);
- -      cpumask_copy(&desc->affinity, mask);
   }
   
   #endif /* CONFIG_SMP */
@@@ -3633,12 -3625,19 +3632,12 @@@ static void hpet_msi_set_affinity(unsig
         struct irq_cfg *cfg;
         struct msi_msg msg;
         unsigned int dest;
- -      cpumask_t tmp;
   
- -      if (!cpumask_intersects(mask, cpu_online_mask))
+ +      dest = set_desc_affinity(desc, mask);
+ +      if (dest == BAD_APICID)
                 return;
   
         cfg = desc->chip_data;
- -      if (assign_irq_vector(irq, cfg, *mask))
- -              return;
- -
- -      set_extra_move_desc(desc, *mask);
- -
- -      cpumask_and(&tmp, &cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
   
         hpet_msi_read(irq, &msg);
   
@@@ -3648,6 -3647,7 +3647,6 @@@
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
   
         hpet_msi_write(irq, &msg);
- -      cpumask_copy(&desc->affinity, mask);
   }
   
   #endif /* CONFIG_SMP */
@@@ -3707,14 -3707,22 +3706,14 @@@ static void set_ht_irq_affinity(unsigne
         struct irq_desc *desc = irq_to_desc(irq);
         struct irq_cfg *cfg;
         unsigned int dest;
- -      cpumask_t tmp;
   
- -      if (!cpumask_intersects(mask, cpu_online_mask))
+ +      dest = set_desc_affinity(desc, mask);
+ +      if (dest == BAD_APICID)
                 return;
   
         cfg = desc->chip_data;
- -      if (assign_irq_vector(irq, cfg, *mask))
- -              return;
- -
- -      set_extra_move_desc(desc, *mask);
- -
- -      cpumask_and(&tmp, &cfg->domain, mask);
- -      dest = cpu_mask_to_apicid(tmp);
   
         target_ht_irq(irq, dest, cfg->vector);
- -      cpumask_copy(&desc->affinity, mask);
   }
   
   #endif
@@@ -3734,14 -3742,17 +3733,14 @@@ int arch_setup_ht_irq(unsigned int irq
   {
         struct irq_cfg *cfg;
         int err;
- -      cpumask_t tmp;
   
         cfg = irq_cfg(irq);
- -      tmp = TARGET_CPUS;
- -      err = assign_irq_vector(irq, cfg, tmp);
+ +      err = assign_irq_vector(irq, cfg, TARGET_CPUS);
         if (!err) {
                 struct ht_irq_msg msg;
                 unsigned dest;
   
- -              cpus_and(tmp, cfg->domain, tmp);
- -              dest = cpu_mask_to_apicid(tmp);
+ +              dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
   
                 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
   
@@@ -3777,7 -3788,7 +3776,7 @@@
   int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                        unsigned long mmr_offset)
   {
- -      const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ +      const struct cpumask *eligible_cpu = cpumask_of(cpu);
         struct irq_cfg *cfg;
         int mmr_pnode;
         unsigned long mmr_value;
@@@ -3787,7 -3798,7 +3786,7 @@@
   
         cfg = irq_cfg(irq);
   
- -      err = assign_irq_vector(irq, cfg, *eligible_cpu);
+ +      err = assign_irq_vector(irq, cfg, eligible_cpu);
         if (err != 0)
                 return err;
   
@@@ -3806,7 -3817,7 +3805,7 @@@
         entry->polarity = 0;
         entry->trigger = 0;
         entry->mask = 0;
- -      entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+ +      entry->dest = cpu_mask_to_apicid(eligible_cpu);
   
         mmr_pnode = uv_blade_to_pnode(mmr_blade);
         uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@@ -4017,7 -4028,7 +4016,7 @@@ void __init setup_ioapic_dest(void
         int pin, ioapic, irq, irq_entry;
         struct irq_desc *desc;
         struct irq_cfg *cfg;
- -      cpumask_t mask;
+ +      const struct cpumask *mask;
   
         if (skip_ioapic_setup == 1)
                 return;
@@@ -4048,7 -4059,7 +4047,7 @@@
                          */
                         if (desc->status &
                             (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- -                              mask = desc->affinity;
+ +                              mask = &desc->affinity;
                         else
                                 mask = TARGET_CPUS;
   
diff --combined arch/x86/kernel/irq_64.c

index fca2991,54c69d4..6383d50
--- 1/arch/x86/kernel/irq_64.c
--- 2/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@@ -13,12 -13,12 +13,12 @@@
   #include <linux/seq_file.h>
   #include <linux/module.h>
   #include <linux/delay.h>
+ #include <linux/ftrace.h>
   #include <asm/uaccess.h>
   #include <asm/io_apic.h>
   #include <asm/idle.h>
   #include <asm/smp.h>
   
- #ifdef CONFIG_DEBUG_STACKOVERFLOW
   /*
    * Probabilistic stack overflow check:
    *
@@@ -28,26 -28,25 +28,25 @@@
    */
   static inline void stack_overflow_check(struct pt_regs *regs)
   {
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
         u64 curbase = (u64)task_stack_page(current);
-       static unsigned long warned = -60*HZ;
- 
-       if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
-           regs->sp <  curbase + sizeof(struct thread_info) + 128 &&
-           time_after(jiffies, warned + 60*HZ)) {
-               printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
-                      current->comm, curbase, regs->sp);
-               show_stack(NULL,NULL);
-               warned = jiffies;
-       }
- }
+ 
+       WARN_ONCE(regs->sp >= curbase &&
+                 regs->sp <= curbase + THREAD_SIZE &&
+                 regs->sp <  curbase + sizeof(struct thread_info) +
+                                       sizeof(struct pt_regs) + 128,
+ 
+                 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+                       current->comm, curbase, regs->sp);
   #endif
+ }
   
   /*
    * do_IRQ handles all normal device IRQ's (the special
    * SMP cross-CPU interrupts have their own specific
    * handlers).
    */
- asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
   {
         struct pt_regs *old_regs = set_irq_regs(regs);
         struct irq_desc *desc;
@@@ -60,9 -59,7 +59,7 @@@
         irq_enter();
         irq = __get_cpu_var(vector_irq)[vector];
   
- #ifdef CONFIG_DEBUG_STACKOVERFLOW
         stack_overflow_check(regs);
- #endif
   
         desc = irq_to_desc(irq);
         if (likely(desc))
@@@ -83,17 -80,16 +80,17 @@@
   }
   
   #ifdef CONFIG_HOTPLUG_CPU
- -void fixup_irqs(cpumask_t map)
+ +/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+ +void fixup_irqs(void)
   {
         unsigned int irq;
         static int warned;
         struct irq_desc *desc;
   
         for_each_irq_desc(irq, desc) {
- -              cpumask_t mask;
                 int break_affinity = 0;
                 int set_affinity = 1;
+ +              const struct cpumask *affinity;
   
                 if (!desc)
                         continue;
@@@ -103,23 -99,23 +100,23 @@@
                 /* interrupt's are disabled at this point */
                 spin_lock(&desc->lock);
   
+ +              affinity = &desc->affinity;
                 if (!irq_has_action(irq) ||
- -                  cpus_equal(desc->affinity, map)) {
+ +                  cpumask_equal(affinity, cpu_online_mask)) {
                         spin_unlock(&desc->lock);
                         continue;
                 }
   
- -              cpus_and(mask, desc->affinity, map);
- -              if (cpus_empty(mask)) {
+ +              if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                         break_affinity = 1;
- -                      mask = map;
+ +                      affinity = cpu_all_mask;
                 }
   
                 if (desc->chip->mask)
                         desc->chip->mask(irq);
   
                 if (desc->chip->set_affinity)
- -                      desc->chip->set_affinity(irq, &mask);
+ +                      desc->chip->set_affinity(irq, affinity);
                 else if (!(warned++))
                         set_affinity = 0;
   
diff --combined arch/x86/kernel/irqinit_32.c

index 61aa2a1,203384e..8472329
--- 1/arch/x86/kernel/irqinit_32.c
--- 2/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@@ -110,18 -110,6 +110,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq
         [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
   };
   
+ +int vector_used_by_percpu_irq(unsigned int vector)
+ +{
+ +      int cpu;
+ +
+ +      for_each_online_cpu(cpu) {
+ +              if (per_cpu(vector_irq, cpu)[vector] != -1)
+ +                      return 1;
+ +      }
+ +
+ +      return 0;
+ +}
+ +
   /* Overridden in paravirt.c */
   void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
   
@@@ -140,7 -128,7 +140,7 @@@ void __init native_init_IRQ(void
         for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
                 /* SYSCALL_VECTOR was reserved in trap_init. */
                 if (i != SYSCALL_VECTOR)
-                       set_intr_gate(i, interrupt[i]);
+                       set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
         }
   
   
@@@ -158,12 -146,10 +158,12 @@@
         alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
   
         /* IPI for single call function */
- -      set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+ +      alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+ +                               call_function_single_interrupt);
   
         /* Low priority IPI to cleanup after moving an irq */
         set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ +      set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
   #endif
   
   #ifdef CONFIG_X86_LOCAL_APIC
diff --combined arch/x86/kernel/irqinit_64.c

index 1020919,6190e6e..31ebfe3
--- 1/arch/x86/kernel/irqinit_64.c
--- 2/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@@ -23,41 -23,6 +23,6 @@@
   #include <asm/apic.h>
   #include <asm/i8259.h>
   
- /*
-  * Common place to define all x86 IRQ vectors
-  *
-  * This builds up the IRQ handler stubs using some ugly macros in irq.h
-  *
-  * These macros create the low-level assembly IRQ routines that save
-  * register context and call do_IRQ(). do_IRQ() then does all the
-  * operations that are needed to keep the AT (or SMP IOAPIC)
-  * interrupt-controller happy.
-  */
- 
- #define IRQ_NAME2(nr) nr##_interrupt(void)
- #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
- 
- /*
-  *    SMP has a few special interrupts for IPI messages
-  */
- 
- #define BUILD_IRQ(nr)                         \
-       asmlinkage void IRQ_NAME(nr);           \
-       asm("\n.text\n.p2align\n"               \
-           "IRQ" #nr "_interrupt:\n\t"         \
-           "push $~(" #nr ") ; "               \
-           "jmp common_interrupt\n"            \
-           ".previous");
- 
- #define BI(x,y) \
-       BUILD_IRQ(x##y)
- 
- #define BUILD_16_IRQS(x) \
-       BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
-       BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
-       BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
-       BI(x,c) BI(x,d) BI(x,e) BI(x,f)
- 
   /*
    * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
    * (these are usually mapped to vectors 0x30-0x3f)
@@@ -73,37 -38,6 +38,6 @@@
    *
    * (these are usually mapped into the 0x30-0xff vector range)
    */
-                                     BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
- BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
- BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
- BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
- 
- #undef BUILD_16_IRQS
- #undef BI
- 
- 
- #define IRQ(x,y) \
-       IRQ##x##y##_interrupt
- 
- #define IRQLIST_16(x) \
-       IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
-       IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
-       IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
-       IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
- 
- /* for the irq vectors */
- static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
-                                         IRQLIST_16(0x2), IRQLIST_16(0x3),
-       IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
-       IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-       IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
- };
- 
- #undef IRQ
- #undef IRQLIST_16
- 
- 
- 
   
   /*
    * IRQ2 is cascade interrupt to second interrupt controller
@@@ -135,18 -69,6 +69,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq
         [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
   };
   
+ +int vector_used_by_percpu_irq(unsigned int vector)
+ +{
+ +      int cpu;
+ +
+ +      for_each_online_cpu(cpu) {
+ +              if (per_cpu(vector_irq, cpu)[vector] != -1)
+ +                      return 1;
+ +      }
+ +
+ +      return 0;
+ +}
+ +
   void __init init_ISA_irqs(void)
   {
         int i;
@@@ -199,7 -121,6 +133,7 @@@ static void __init smp_intr_init(void
   
         /* Low priority IPI to cleanup after moving an irq */
         set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ +      set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
   #endif
   }
   
diff --combined arch/x86/kernel/setup_percpu.c

index 0b63b08,8e8b119..49f3f70
--- 1/arch/x86/kernel/setup_percpu.c
--- 2/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@@ -152,11 -152,6 +152,11 @@@ void __init setup_per_cpu_areas(void
         old_size = PERCPU_ENOUGH_ROOM;
         align = max_t(unsigned long, PAGE_SIZE, align);
         size = roundup(old_size, align);
+ +
+ +      printk(KERN_INFO
+ +              "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ +              NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+ +
         printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
                           size);
   
@@@ -173,24 -168,24 +173,24 @@@
                                "cpu %d has no node %d or node-local memory\n",
                                 cpu, node);
                         if (ptr)
- -                              printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+ +                              printk(KERN_DEBUG
+ +                                      "per cpu data for cpu%d at %016lx\n",
                                          cpu, __pa(ptr));
                 }
                 else {
                         ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
                                                         __pa(MAX_DMA_ADDRESS));
                         if (ptr)
- -                              printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
- -                                       cpu, node, __pa(ptr));
+ +                              printk(KERN_DEBUG
+ +                                      "per cpu data for cpu%d on node%d "
+ +                                      "at %016lx\n",
+ +                                      cpu, node, __pa(ptr));
                 }
   #endif
                 per_cpu_offset(cpu) = ptr - __per_cpu_start;
                 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
         }
   
- -      printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
- -              NR_CPUS, nr_cpu_ids, nr_node_ids);
- -
         /* Setup percpu data maps */
         setup_per_cpu_maps();
   
@@@ -339,25 -334,25 +339,25 @@@ static const cpumask_t cpu_mask_none
   /*
    * Returns a pointer to the bitmask of CPUs on Node 'node'.
    */
- const cpumask_t *_node_to_cpumask_ptr(int node)
+ const cpumask_t *cpumask_of_node(int node)
   {
         if (node_to_cpumask_map == NULL) {
                 printk(KERN_WARNING
-                       "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+                       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
                         node);
                 dump_stack();
                 return (const cpumask_t *)&cpu_online_map;
         }
         if (node >= nr_node_ids) {
                 printk(KERN_WARNING
-                       "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
+                       "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
                         node, nr_node_ids);
                 dump_stack();
                 return &cpu_mask_none;
         }
         return &node_to_cpumask_map[node];
   }
- EXPORT_SYMBOL(_node_to_cpumask_ptr);
+ EXPORT_SYMBOL(cpumask_of_node);
   
   /*
    * Returns a bitmask of CPUs on Node 'node'.
diff --combined arch/x86/kernel/smp.c

index 49ed667,7e558db..beea264
--- 1/arch/x86/kernel/smp.c
--- 2/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@@ -118,22 -118,22 +118,22 @@@ static void native_smp_send_reschedule(
                 WARN_ON(1);
                 return;
         }
- -      send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+ +      send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
   }
   
   void native_send_call_func_single_ipi(int cpu)
   {
- -      send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ +      send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
   }
   
- -void native_send_call_func_ipi(cpumask_t mask)
+ +void native_send_call_func_ipi(const struct cpumask *mask)
   {
         cpumask_t allbutself;
   
         allbutself = cpu_online_map;
         cpu_clear(smp_processor_id(), allbutself);
   
- -      if (cpus_equal(mask, allbutself) &&
+ +      if (cpus_equal(*mask, allbutself) &&
             cpus_equal(cpu_online_map, cpu_callout_map))
                 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
         else
@@@ -165,11 -165,7 +165,7 @@@ static void native_smp_send_stop(void
   void smp_reschedule_interrupt(struct pt_regs *regs)
   {
         ack_APIC_irq();
- #ifdef CONFIG_X86_32
-       __get_cpu_var(irq_stat).irq_resched_count++;
- #else
-       add_pda(irq_resched_count, 1);
- #endif
+       inc_irq_stat(irq_resched_count);
   }
   
   void smp_call_function_interrupt(struct pt_regs *regs)
@@@ -177,11 -173,7 +173,7 @@@
         ack_APIC_irq();
         irq_enter();
         generic_smp_call_function_interrupt();
- #ifdef CONFIG_X86_32
-       __get_cpu_var(irq_stat).irq_call_count++;
- #else
-       add_pda(irq_call_count, 1);
- #endif
+       inc_irq_stat(irq_call_count);
         irq_exit();
   }
   
@@@ -190,11 -182,7 +182,7 @@@ void smp_call_function_single_interrupt
         ack_APIC_irq();
         irq_enter();
         generic_smp_call_function_single_interrupt();
- #ifdef CONFIG_X86_32
-       __get_cpu_var(irq_stat).irq_call_count++;
- #else
-       add_pda(irq_call_count, 1);
- #endif
+       inc_irq_stat(irq_call_count);
         irq_exit();
   }
   
diff --combined arch/x86/kernel/smpboot.c

index 1a9941b,c539205..9e177a4
--- 1/arch/x86/kernel/smpboot.c
--- 2/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@@ -282,7 -282,7 +282,7 @@@ static int __cpuinitdata unsafe_smp
   /*
    * Activate a secondary processor.
    */
- static void __cpuinit start_secondary(void *unused)
+ notrace static void __cpuinit start_secondary(void *unused)
   {
         /*
          * Don't put *anything* before cpu_init(), SMP booting is too
@@@ -496,7 -496,7 +496,7 @@@ void __cpuinit set_cpu_sibling_map(int 
   }
   
   /* maps the cpu to the sched domain representing multi-core */
- cpumask_t cpu_coregroup_map(int cpu)
+ const struct cpumask *cpu_coregroup_mask(int cpu)
   {
         struct cpuinfo_x86 *c = &cpu_data(cpu);
         /*
@@@ -504,9 -504,14 +504,14 @@@
          * And for power savings, we return cpu_core_map
          */
         if (sched_mc_power_savings || sched_smt_power_savings)
-               return per_cpu(cpu_core_map, cpu);
+               return &per_cpu(cpu_core_map, cpu);
         else
-               return c->llc_shared_map;
+               return &c->llc_shared_map;
+ }
+ 
+ cpumask_t cpu_coregroup_map(int cpu)
+ {
+       return *cpu_coregroup_mask(cpu);
   }
   
   static void impress_friends(void)
@@@ -1075,8 -1080,10 +1080,10 @@@ static int __init smp_sanity_check(unsi
   #endif
   
         if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
-               printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
-                                   "by the BIOS.\n", hard_smp_processor_id());
+               printk(KERN_WARNING
+                       "weird, boot CPU (#%d) not listed by the BIOS.\n",
+                       hard_smp_processor_id());
+ 
                 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
         }
   
@@@ -1252,15 -1259,6 +1259,15 @@@ void __init native_smp_cpus_done(unsign
         check_nmi_watchdog();
   }
   
+ +static int __initdata setup_possible_cpus = -1;
+ +static int __init _setup_possible_cpus(char *str)
+ +{
+ +      get_option(&str, &setup_possible_cpus);
+ +      return 0;
+ +}
+ +early_param("possible_cpus", _setup_possible_cpus);
+ +
+ +
   /*
    * cpu_possible_map should be static, it cannot change as cpu's
    * are onlined, or offlined. The reason is per-cpu data-structures
@@@ -1273,7 -1271,7 +1280,7 @@@
    *
    * Three ways to find out the number of additional hotplug CPUs:
    * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- - * - The user can overwrite it with additional_cpus=NUM
+ + * - The user can overwrite it with possible_cpus=NUM
    * - Otherwise don't reserve additional CPUs.
    * We do this because additional CPUs waste a lot of memory.
    * -AK
@@@ -1286,17 -1284,9 +1293,17 @@@ __init void prefill_possible_map(void
         if (!num_processors)
                 num_processors = 1;
   
- -      possible = num_processors + disabled_cpus;
- -      if (possible > NR_CPUS)
- -              possible = NR_CPUS;
+ +      if (setup_possible_cpus == -1)
+ +              possible = num_processors + disabled_cpus;
+ +      else
+ +              possible = setup_possible_cpus;
+ +
+ +      if (possible > CONFIG_NR_CPUS) {
+ +              printk(KERN_WARNING
+ +                      "%d Processors exceeds NR_CPUS limit of %d\n",
+ +                      possible, CONFIG_NR_CPUS);
+ +              possible = CONFIG_NR_CPUS;
+ +      }
   
         printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
                 possible, max_t(int, possible - num_processors, 0));
@@@ -1361,7 -1351,7 +1368,7 @@@ void cpu_disable_common(void
         lock_vector_lock();
         remove_cpu_from_maps(cpu);
         unlock_vector_lock();
- -      fixup_irqs(cpu_online_map);
+ +      fixup_irqs();
   }
   
   int native_cpu_disable(void)
diff --combined arch/x86/kernel/tlb_32.c

index 174ea90,8da059f..ce50546
--- 1/arch/x86/kernel/tlb_32.c
--- 2/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@@ -34,9 -34,8 +34,8 @@@ static DEFINE_SPINLOCK(tlbstate_lock)
    */
   void leave_mm(int cpu)
   {
-       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
-               BUG();
-       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+       BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
+       cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
         load_cr3(swapper_pg_dir);
   }
   EXPORT_SYMBOL_GPL(leave_mm);
@@@ -104,8 -103,8 +103,8 @@@ void smp_invalidate_interrupt(struct pt
                  * BUG();
                  */
   
-       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
-               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+       if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
+               if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
                         if (flush_va == TLB_FLUSH_ALL)
                                 local_flush_tlb();
                         else
@@@ -119,7 -118,7 +118,7 @@@
         smp_mb__after_clear_bit();
   out:
         put_cpu_no_resched();
-       __get_cpu_var(irq_stat).irq_tlb_count++;
+       inc_irq_stat(irq_tlb_count);
   }
   
   void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@@ -164,7 -163,7 +163,7 @@@
          * We have to send the IPI only to
          * CPUs affected.
          */
- -      send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+ +      send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
   
         while (!cpus_empty(flush_cpumask))
                 /* nothing. lockup detection does not belong here */
@@@ -238,7 -237,7 +237,7 @@@ static void do_flush_tlb_all(void *info
         unsigned long cpu = smp_processor_id();
   
         __flush_tlb_all();
-       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+       if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
                 leave_mm(cpu);
   }
   
diff --combined arch/x86/kernel/tlb_64.c

index de6f1bd,29887d7..f8be6f1
--- 1/arch/x86/kernel/tlb_64.c
--- 2/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@@ -154,7 -154,7 +154,7 @@@ asmlinkage void smp_invalidate_interrup
   out:
         ack_APIC_irq();
         cpu_clear(cpu, f->flush_cpumask);
-       add_pda(irq_tlb_count, 1);
+       inc_irq_stat(irq_tlb_count);
   }
   
   void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@@ -191,7 -191,7 +191,7 @@@
          * We have to send the IPI only to
          * CPUs affected.
          */
- -      send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+ +      send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
   
         while (!cpus_empty(f->flush_cpumask))
                 cpu_relax();
diff --combined arch/x86/kernel/traps.c

index 4a6dff3,141907a..2d1f4c7
--- 1/arch/x86/kernel/traps.c
--- 2/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -72,6 -72,9 +72,6 @@@
   
   #include "cpu/mcheck/mce.h"
   
- -DECLARE_BITMAP(used_vectors, NR_VECTORS);
- -EXPORT_SYMBOL_GPL(used_vectors);
- -
   asmlinkage int system_call(void);
   
   /* Do we ignore FPU interrupts ? */
@@@ -86,9 -89,6 +86,9 @@@ gate_desc idt_table[256
         __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
   #endif
   
+ +DECLARE_BITMAP(used_vectors, NR_VECTORS);
+ +EXPORT_SYMBOL_GPL(used_vectors);
+ +
   static int ignore_nmis;
   
   static inline void conditional_sti(struct pt_regs *regs)
@@@ -481,11 -481,7 +481,7 @@@ do_nmi(struct pt_regs *regs, long error
   {
         nmi_enter();
   
- #ifdef CONFIG_X86_32
-       { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
- #else
-       add_pda(__nmi_count, 1);
- #endif
+       inc_irq_stat(__nmi_count);
   
         if (!ignore_nmis)
                 default_do_nmi(regs);
@@@ -664,7 -660,7 +660,7 @@@ void math_error(void __user *ip
   {
         struct task_struct *task;
         siginfo_t info;
-       unsigned short cwd, swd;
+       unsigned short cwd, swd, err;
   
         /*
          * Save the info for the exception handler and clear the error.
@@@ -675,7 -671,6 +671,6 @@@
         task->thread.error_code = 0;
         info.si_signo = SIGFPE;
         info.si_errno = 0;
-       info.si_code = __SI_FAULT;
         info.si_addr = ip;
         /*
          * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@@ -689,34 -684,31 +684,31 @@@
          */
         cwd = get_fpu_cwd(task);
         swd = get_fpu_swd(task);
-       switch (swd & ~cwd & 0x3f) {
-       case 0x000: /* No unmasked exception */
+ 
+       err = swd & ~cwd & 0x3f;
+ 
   #ifdef CONFIG_X86_32
+       if (!err)
                 return;
   #endif
-       default: /* Multiple exceptions */
-               break;
-       case 0x001: /* Invalid Op */
+ 
+       if (err & 0x001) {      /* Invalid op */
                 /*
                  * swd & 0x240 == 0x040: Stack Underflow
                  * swd & 0x240 == 0x240: Stack Overflow
                  * User must clear the SF bit (0x40) if set
                  */
                 info.si_code = FPE_FLTINV;
-               break;
-       case 0x002: /* Denormalize */
-       case 0x010: /* Underflow */
-               info.si_code = FPE_FLTUND;
-               break;
-       case 0x004: /* Zero Divide */
+       } else if (err & 0x004) { /* Divide by Zero */
                 info.si_code = FPE_FLTDIV;
-               break;
-       case 0x008: /* Overflow */
+       } else if (err & 0x008) { /* Overflow */
                 info.si_code = FPE_FLTOVF;
-               break;
-       case 0x020: /* Precision */
+       } else if (err & 0x012) { /* Denormal, Underflow */
+               info.si_code = FPE_FLTUND;
+       } else if (err & 0x020) { /* Precision */
                 info.si_code = FPE_FLTRES;
-               break;
+       } else {
+               info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
         }
         force_sig_info(SIGFPE, &info, task);
   }
@@@ -949,7 -941,9 +941,7 @@@ dotraplinkage void do_iret_error(struc
   
   void __init trap_init(void)
   {
- -#ifdef CONFIG_X86_32
         int i;
- -#endif
   
   #ifdef CONFIG_EISA
         void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@@ -1006,15 -1000,11 +998,15 @@@
         }
   
         set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+ +#endif
   
         /* Reserve all the builtin and the syscall vector: */
         for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
                 set_bit(i, used_vectors);
   
+ +#ifdef CONFIG_X86_64
+ +      set_bit(IA32_SYSCALL_VECTOR, used_vectors);
+ +#else
         set_bit(SYSCALL_VECTOR, used_vectors);
   #endif
         /*
diff --combined arch/x86/xen/mmu.c

index e59e53b,773d68d..503c240
--- 1/arch/x86/xen/mmu.c
--- 2/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@@ -154,13 -154,13 +154,13 @@@ void xen_setup_mfn_list_list(void
   {
         unsigned pfn, idx;
   
-       for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+       for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
                 unsigned topidx = p2m_top_index(pfn);
   
                 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
         }
   
-       for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+       for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
                 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
                 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
         }
@@@ -179,7 -179,7 +179,7 @@@ void __init xen_build_dynamic_phys_to_m
         unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
         unsigned pfn;
   
-       for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+       for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
                 unsigned topidx = p2m_top_index(pfn);
   
                 p2m_top[topidx] = &mfn_list[pfn];
@@@ -207,7 -207,7 +207,7 @@@ static void alloc_p2m(unsigned long **p
         p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
         BUG_ON(p == NULL);
   
-       for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+       for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
                 p[i] = INVALID_P2M_ENTRY;
   
         if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
@@@ -407,7 -407,8 +407,8 @@@ out
                 preempt_enable();
   }
   
- pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+                                unsigned long addr, pte_t *ptep)
   {
         /* Just return the pte as-is.  We preserve the bits on commit */
         return *ptep;
@@@ -878,7 -879,8 +879,8 @@@ static void __xen_pgd_pin(struct mm_str
   
                 if (user_pgd) {
                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
-                       xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+                       xen_do_pin(MMUEXT_PIN_L4_TABLE,
+                                  PFN_DOWN(__pa(user_pgd)));
                 }
         }
   #else /* CONFIG_X86_32 */
@@@ -993,7 -995,8 +995,8 @@@ static void __xen_pgd_unpin(struct mm_s
                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
   
                 if (user_pgd) {
-                       xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+                       xen_do_pin(MMUEXT_UNPIN_TABLE,
+                                  PFN_DOWN(__pa(user_pgd)));
                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
                 }
         }
@@@ -1079,7 -1082,7 +1082,7 @@@ static void drop_other_mm_ref(void *inf
   
   static void xen_drop_mm_ref(struct mm_struct *mm)
   {
- -      cpumask_t mask;
+ +      cpumask_var_t mask;
         unsigned cpu;
   
         if (current->active_mm == mm) {
@@@ -1091,16 -1094,7 +1094,16 @@@
         }
   
         /* Get the "official" set of cpus referring to our pagetable. */
- -      mask = mm->cpu_vm_mask;
+ +      if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+ +              for_each_online_cpu(cpu) {
+ +                      if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+ +                          && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ +                              continue;
+ +                      smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ +              }
+ +              return;
+ +      }
+ +      cpumask_copy(mask, &mm->cpu_vm_mask);
   
         /* It's possible that a vcpu may have a stale reference to our
            cr3, because its in lazy mode, and it hasn't yet flushed
@@@ -1109,12 -1103,11 +1112,12 @@@
            if needed. */
         for_each_online_cpu(cpu) {
                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
- -                      cpu_set(cpu, mask);
+ +                      cpumask_set_cpu(cpu, mask);
         }
   
- -      if (!cpus_empty(mask))
- -              smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+ +      if (!cpumask_empty(mask))
+ +              smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ +      free_cpumask_var(mask);
   }
   #else
   static void xen_drop_mm_ref(struct mm_struct *mm)
diff --combined include/linux/sched.h

index e5f928a,8395e71..158d53d
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc
   extern int runqueue_is_locked(void);
   extern void task_rq_unlock_wait(struct task_struct *p);
   
- -extern cpumask_t nohz_cpu_mask;
+ +extern cpumask_var_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
   extern int select_nohz_load_balancer(int cpu);
   #else
@@@ -571,12 -571,6 +571,6 @@@ struct signal_struct 
          */
         struct rlimit rlim[RLIM_NLIMITS];
   
-       /* keep the process-shared keyrings here so that they do the right
-        * thing in threads created with CLONE_THREAD */
- #ifdef CONFIG_KEYS
-       struct key *session_keyring;    /* keyring inherited over fork */
-       struct key *process_keyring;    /* keyring private to this process */
- #endif
   #ifdef CONFIG_BSD_PROCESS_ACCT
         struct pacct_struct pacct;      /* per-process accounting information */
   #endif
@@@ -647,6 -641,7 +641,7 @@@ struct user_struct 
         /* Hash table maintenance information */
         struct hlist_node uidhash_node;
         uid_t uid;
+       struct user_namespace *user_ns;
   
   #ifdef CONFIG_USER_SCHED
         struct task_group *tg;
@@@ -664,6 -659,7 +659,7 @@@ extern struct user_struct *find_user(ui
   extern struct user_struct root_user;
   #define INIT_USER (&root_user)
   
+ 
   struct backing_dev_info;
   struct reclaim_state;
   
@@@ -671,8 -667,7 +667,7 @@@
   struct sched_info {
         /* cumulative counters */
         unsigned long pcount;         /* # of times run on this cpu */
-       unsigned long long cpu_time,  /* time spent on the cpu */
-                          run_delay; /* time spent waiting on a runqueue */
+       unsigned long long run_delay; /* time spent waiting on a runqueue */
   
         /* timestamps */
         unsigned long long last_arrival,/* when we last ran on a cpu */
@@@ -763,51 -758,20 +758,51 @@@ enum cpu_idle_type 
   #define SD_SERIALIZE          1024    /* Only a single load balancing instance */
   #define SD_WAKE_IDLE_FAR      2048    /* Gain latency sacrificing cache hit */
   
- -#define BALANCE_FOR_MC_POWER  \
- -      (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+ +enum powersavings_balance_level {
+ +      POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+ +      POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
+ +                                       * first for long running threads
+ +                                       */
+ +      POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
+ +                                       * cpu package for power savings
+ +                                       */
+ +      MAX_POWERSAVINGS_BALANCE_LEVELS
+ +};
   
- -#define BALANCE_FOR_PKG_POWER \
- -      ((sched_mc_power_savings || sched_smt_power_savings) ?  \
- -       SD_POWERSAVINGS_BALANCE : 0)
+ +extern int sched_mc_power_savings, sched_smt_power_savings;
   
- -#define test_sd_parent(sd, flag)      ((sd->parent &&         \
- -                                       (sd->parent->flags & flag)) ? 1 : 0)
+ +static inline int sd_balance_for_mc_power(void)
+ +{
+ +      if (sched_smt_power_savings)
+ +              return SD_POWERSAVINGS_BALANCE;
   
+ +      return 0;
+ +}
+ +
+ +static inline int sd_balance_for_package_power(void)
+ +{
+ +      if (sched_mc_power_savings | sched_smt_power_savings)
+ +              return SD_POWERSAVINGS_BALANCE;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Optimise SD flags for power savings:
+ + * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
+ + * Keep default SD flags if sched_{smt,mc}_power_saving=0
+ + */
+ +
+ +static inline int sd_power_saving_flags(void)
+ +{
+ +      if (sched_mc_power_savings | sched_smt_power_savings)
+ +              return SD_BALANCE_NEWIDLE;
+ +
+ +      return 0;
+ +}
   
   struct sched_group {
         struct sched_group *next;       /* Must be a circular list */
- -      cpumask_t cpumask;
   
         /*
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@@ -820,15 -784,8 +815,15 @@@
          * (see include/linux/reciprocal_div.h)
          */
         u32 reciprocal_cpu_power;
+ +
+ +      unsigned long cpumask[];
   };
   
+ +static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+ +{
+ +      return to_cpumask(sg->cpumask);
+ +}
+ +
   enum sched_domain_level {
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
@@@ -852,6 -809,7 +847,6 @@@ struct sched_domain 
         struct sched_domain *parent;    /* top domain must be null terminated */
         struct sched_domain *child;     /* bottom domain must be null terminated */
         struct sched_group *groups;     /* the balancing groups of the domain */
- -      cpumask_t span;                 /* span of all CPUs in this domain */
         unsigned long min_interval;     /* Minimum balance interval ms */
         unsigned long max_interval;     /* Maximum balance interval ms */
         unsigned int busy_factor;       /* less balancing by factor if busy */
@@@ -906,73 -864,25 +901,42 @@@
   #ifdef CONFIG_SCHED_DEBUG
         char *name;
   #endif
+ +
+ +      /* span of all CPUs in this domain */
+ +      unsigned long span[];
   };
   
- -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ +static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+ +{
+ +      return to_cpumask(sd->span);
+ +}
+ +
+ +extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                     struct sched_domain_attr *dattr_new);
   extern int arch_reinit_sched_domains(void);
   
+ +/* Test a flag in parent sched domain */
+ +static inline int test_sd_parent(struct sched_domain *sd, int flag)
+ +{
+ +      if (sd->parent && (sd->parent->flags & flag))
+ +              return 1;
+ +
+ +      return 0;
+ +}
+ +
   #else /* CONFIG_SMP */
   
   struct sched_domain_attr;
   
   static inline void
- -partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ +partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                         struct sched_domain_attr *dattr_new)
   {
   }
   #endif        /* !CONFIG_SMP */
   
   struct io_context;                    /* See blkdev.h */
- #define NGROUPS_SMALL         32
- #define NGROUPS_PER_BLOCK     ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
- struct group_info {
-       int ngroups;
-       atomic_t usage;
-       gid_t small_block[NGROUPS_SMALL];
-       int nblocks;
-       gid_t *blocks[0];
- };
   
- /*
-  * get_group_info() must be called with the owning task locked (via task_lock())
-  * when task != current.  The reason being that the vast majority of callers are
-  * looking at current->group_info, which can not be changed except by the
-  * current task.  Changing current->group_info requires the task lock, too.
-  */
- #define get_group_info(group_info) do { \
-       atomic_inc(&(group_info)->usage); \
- } while (0)
- 
- #define put_group_info(group_info) do { \
-       if (atomic_dec_and_test(&(group_info)->usage)) \
-               groups_free(group_info); \
- } while (0)
- 
- extern struct group_info *groups_alloc(int gidsetsize);
- extern void groups_free(struct group_info *group_info);
- extern int set_current_groups(struct group_info *group_info);
- extern int groups_search(struct group_info *group_info, gid_t grp);
- /* access the groups "array" with this macro */
- #define GROUP_AT(gi, i) \
-     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
   
   #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
   extern void prefetch_stack(struct task_struct *t);
@@@ -1016,7 -926,7 +980,7 @@@ struct sched_class 
         void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
   
         void (*set_cpus_allowed)(struct task_struct *p,
- -                               const cpumask_t *newmask);
+ +                               const struct cpumask *newmask);
   
         void (*rq_online)(struct rq *rq);
         void (*rq_offline)(struct rq *rq);
@@@ -1228,6 -1138,7 +1192,7 @@@ struct task_struct 
          * The buffer to hold the BTS data.
          */
         void *bts_buffer;
+       size_t bts_size;
   #endif /* CONFIG_X86_PTRACE_BTS */
   
         /* PID/PID hash table linkage. */
@@@ -1251,17 -1162,12 +1216,12 @@@
         struct list_head cpu_timers[3];
   
   /* process credentials */
-       uid_t uid,euid,suid,fsuid;
-       gid_t gid,egid,sgid,fsgid;
-       struct group_info *group_info;
-       kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
-       struct user_struct *user;
-       unsigned securebits;
- #ifdef CONFIG_KEYS
-       unsigned char jit_keyring;      /* default keyring to attach requested keys to */
-       struct key *request_key_auth;   /* assumed request_key authority */
-       struct key *thread_keyring;     /* keyring private to this thread */
- #endif
+       const struct cred *real_cred;   /* objective and real subjective task
+                                        * credentials (COW) */
+       const struct cred *cred;        /* effective (overridable) subjective task
+                                        * credentials (COW) */
+       struct mutex cred_exec_mutex;   /* execve vs ptrace cred calculation mutex */
+ 
         char comm[TASK_COMM_LEN]; /* executable name excluding path
                                      - access with [gs]et_task_comm (which lock
                                        it with task_lock())
@@@ -1298,9 -1204,6 +1258,6 @@@
         int (*notifier)(void *priv);
         void *notifier_data;
         sigset_t *notifier_mask;
- #ifdef CONFIG_SECURITY
-       void *security;
- #endif
         struct audit_context *audit_context;
   #ifdef CONFIG_AUDITSYSCALL
         uid_t loginuid;
@@@ -1676,12 -1579,12 +1633,12 @@@ extern cputime_t task_gtime(struct task
   
   #ifdef CONFIG_SMP
   extern int set_cpus_allowed_ptr(struct task_struct *p,
- -                              const cpumask_t *new_mask);
+ +                              const struct cpumask *new_mask);
   #else
   static inline int set_cpus_allowed_ptr(struct task_struct *p,
- -                                     const cpumask_t *new_mask)
+ +                                     const struct cpumask *new_mask)
   {
- -      if (!cpu_isset(0, *new_mask))
+ +      if (!cpumask_test_cpu(0, new_mask))
                 return -EINVAL;
         return 0;
   }
@@@ -1857,7 -1760,6 +1814,6 @@@ static inline struct user_struct *get_u
         return u;
   }
   extern void free_uid(struct user_struct *);
- extern void switch_uid(struct user_struct *);
   extern void release_uids(struct user_namespace *ns);
   
   #include <asm/current.h>
@@@ -1876,9 -1778,6 +1832,6 @@@ extern void wake_up_new_task(struct tas
   extern void sched_fork(struct task_struct *p, int clone_flags);
   extern void sched_dead(struct task_struct *p);
   
- extern int in_group_p(gid_t);
- extern int in_egroup_p(gid_t);
- 
   extern void proc_caches_init(void);
   extern void flush_signals(struct task_struct *);
   extern void ignore_signals(struct task_struct *);
@@@ -2010,6 -1909,8 +1963,8 @@@ static inline unsigned long wait_task_i
   #define for_each_process(p) \
         for (p = &init_task ; (p = next_task(p)) != &init_task ; )
   
+ extern bool is_single_threaded(struct task_struct *);
+ 
   /*
    * Careful: do_each_thread/while_each_thread is a double loop so
    *          'break' will not work as expected - use goto instead.
@@@ -2294,8 -2195,10 +2249,8 @@@ __trace_special(void *__tr, void *__dat
   }
   #endif
   
- -extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
- -extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
- -
- -extern int sched_mc_power_savings, sched_smt_power_savings;
+ +extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+ +extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
   
   extern void normalize_rt_tasks(void);
   
diff --combined kernel/rcuclassic.c

index c03ca3e,0ff9b05..6ec495f
--- 1/kernel/rcuclassic.c
--- 2/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@@ -63,14 -63,14 +63,14 @@@ static struct rcu_ctrlblk rcu_ctrlblk 
         .completed = -300,
         .pending = -300,
         .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-       .cpumask = CPU_MASK_NONE,
+       .cpumask = CPU_BITS_NONE,
   };
   static struct rcu_ctrlblk rcu_bh_ctrlblk = {
         .cur = -300,
         .completed = -300,
         .pending = -300,
         .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-       .cpumask = CPU_MASK_NONE,
+       .cpumask = CPU_BITS_NONE,
   };
   
   DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@@ -85,7 -85,6 +85,6 @@@ static void force_quiescent_state(struc
                         struct rcu_ctrlblk *rcp)
   {
         int cpu;
-       cpumask_t cpumask;
         unsigned long flags;
   
         set_need_resched();
@@@ -96,10 -95,10 +95,10 @@@
                  * Don't send IPI to itself. With irqs disabled,
                  * rdp->cpu is the current cpu.
                  *
-                * cpu_online_map is updated by the _cpu_down()
+                * cpu_online_mask is updated by the _cpu_down()
                  * using __stop_machine(). Since we're in irqs disabled
                  * section, __stop_machine() is not exectuting, hence
-                * the cpu_online_map is stable.
+                * the cpu_online_mask is stable.
                  *
                  * However,  a cpu might have been offlined _just_ before
                  * we disabled irqs while entering here.
@@@ -107,13 -106,14 +106,14 @@@
                  * notification, leading to the offlined cpu's bit
                  * being set in the rcp->cpumask.
                  *
-                * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+                * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
                  * sending smp_reschedule() to an offlined CPU.
                  */
-               cpus_and(cpumask, rcp->cpumask, cpu_online_map);
-               cpu_clear(rdp->cpu, cpumask);
-               for_each_cpu_mask_nr(cpu, cpumask)
-                       smp_send_reschedule(cpu);
+               for_each_cpu_and(cpu,
+                                 to_cpumask(rcp->cpumask), cpu_online_mask) {
+                       if (cpu != rdp->cpu)
+                               smp_send_reschedule(cpu);
+               }
         }
         spin_unlock_irqrestore(&rcp->lock, flags);
   }
@@@ -193,7 -193,7 +193,7 @@@ static void print_other_cpu_stall(struc
   
         printk(KERN_ERR "INFO: RCU detected CPU stalls:");
         for_each_possible_cpu(cpu) {
-               if (cpu_isset(cpu, rcp->cpumask))
+               if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
                         printk(" %d", cpu);
         }
         printk(" (detected by %d, t=%ld jiffies)\n",
@@@ -221,7 -221,8 +221,8 @@@ static void check_cpu_stall(struct rcu_
         long delta;
   
         delta = jiffies - rcp->jiffies_stall;
-       if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+       if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
+               delta >= 0) {
   
                 /* We haven't checked in, so go dump stack. */
                 print_cpu_stall(rcp);
@@@ -393,7 -394,8 +394,7 @@@ static void rcu_start_batch(struct rcu_
                  * unnecessarily.
                  */
                 smp_mb();
- -              cpumask_andnot(to_cpumask(rcp->cpumask),
- -                             cpu_online_mask, &nohz_cpu_mask);
+ +              cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
   
                 rcp->signaled = 0;
         }
@@@ -406,8 -408,8 +407,8 @@@
    */
   static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
   {
-       cpu_clear(cpu, rcp->cpumask);
-       if (cpus_empty(rcp->cpumask)) {
+       cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
+       if (cpumask_empty(to_cpumask(rcp->cpumask))) {
                 /* batch completed ! */
                 rcp->completed = rcp->cur;
                 rcu_start_batch(rcp);
diff --combined kernel/sched.c

index 756d981,f209566..27ba1d6
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -209,7 -209,6 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
   }
   
   static inline int rt_bandwidth_enabled(void)
@@@ -361,7 -360,9 +360,9 @@@ static inline struct task_group *task_g
         struct task_group *tg;
   
   #ifdef CONFIG_USER_SCHED
-       tg = p->user->tg;
+       rcu_read_lock();
+       tg = __task_cred(p)->user->tg;
+       rcu_read_unlock();
   #elif defined(CONFIG_CGROUP_SCHED)
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                 struct task_group, css);
@@@ -497,26 -498,18 +498,26 @@@ struct rt_rq 
    */
   struct root_domain {
         atomic_t refcount;
- -      cpumask_t span;
- -      cpumask_t online;
+ +      cpumask_var_t span;
+ +      cpumask_var_t online;
   
         /*
          * The "RT overload" flag: it gets set if a CPU has more than
          * one runnable RT task.
          */
- -      cpumask_t rto_mask;
+ +      cpumask_var_t rto_mask;
         atomic_t rto_count;
   #ifdef CONFIG_SMP
         struct cpupri cpupri;
   #endif
+ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ +      /*
+ +       * Preferred wake up cpu nominated by sched_mc balance that will be
+ +       * used when most cpus are idle in the system indicating overall very
+ +       * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+ +       */
+ +      unsigned int sched_mc_preferred_wakeup_cpu;
+ +#endif
   };
   
   /*
@@@ -610,6 -603,8 +611,8 @@@ struct rq 
   #ifdef CONFIG_SCHEDSTATS
         /* latency stats */
         struct sched_info rq_sched_info;
+       unsigned long long rq_cpu_time;
+       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
   
         /* sys_sched_yield() stats */
         unsigned int yld_exp_empty;
@@@ -1143,7 -1138,6 +1146,6 @@@ static void init_rq_hrtick(struct rq *r
   
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
   }
   #else /* CONFIG_SCHED_HRTICK */
   static inline void hrtick_clear(struct rq *rq)
@@@ -1520,7 -1514,7 +1522,7 @@@ static int tg_shares_up(struct task_gro
         struct sched_domain *sd = data;
         int i;
   
- -      for_each_cpu_mask(i, sd->span) {
+ +      for_each_cpu(i, sched_domain_span(sd)) {
                 /*
                  * If there are currently no tasks on the cpu pretend there
                  * is one of average load so that when a new task gets to
@@@ -1541,7 -1535,7 +1543,7 @@@
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
   
- -      for_each_cpu_mask(i, sd->span)
+ +      for_each_cpu(i, sched_domain_span(sd))
                 update_group_shares_cpu(tg, i, shares, rq_weight);
   
         return 0;
@@@ -1871,6 -1865,8 +1873,8 @@@ void set_task_cpu(struct task_struct *p
   
         clock_offset = old_rq->clock - new_rq->clock;
   
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ 
   #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
                 p->se.wait_start -= clock_offset;
@@@ -2105,17 -2101,15 +2109,17 @@@ find_idlest_group(struct sched_domain *
                 int i;
   
                 /* Skip over this group if it has no CPUs allowed */
- -              if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ +              if (!cpumask_intersects(sched_group_cpus(group),
+ +                                      &p->cpus_allowed))
                         continue;
   
- -              local_group = cpu_isset(this_cpu, group->cpumask);
+ +              local_group = cpumask_test_cpu(this_cpu,
+ +                                             sched_group_cpus(group));
   
                 /* Tally up the load of all CPUs in the group */
                 avg_load = 0;
   
- -              for_each_cpu_mask_nr(i, group->cpumask) {
+ +              for_each_cpu(i, sched_group_cpus(group)) {
                         /* Bias balancing toward cpus of our domain */
                         if (local_group)
                                 load = source_load(i, load_idx);
@@@ -2147,14 -2141,17 +2151,14 @@@
    * find_idlest_cpu - find the idlest cpu among the cpus in group.
    */
   static int
- -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
- -              cpumask_t *tmp)
+ +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
   {
         unsigned long load, min_load = ULONG_MAX;
         int idlest = -1;
         int i;
   
         /* Traverse only the allowed CPUs */
- -      cpus_and(*tmp, group->cpumask, p->cpus_allowed);
- -
- -      for_each_cpu_mask_nr(i, *tmp) {
+ +      for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                 load = weighted_cpuload(i);
   
                 if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2196,6 -2193,7 +2200,6 @@@ static int sched_balance_self(int cpu, 
                 update_shares(sd);
   
         while (sd) {
- -              cpumask_t span, tmpmask;
                 struct sched_group *group;
                 int new_cpu, weight;
   
@@@ -2204,13 -2202,14 +2208,13 @@@
                         continue;
                 }
   
- -              span = sd->span;
                 group = find_idlest_group(sd, t, cpu);
                 if (!group) {
                         sd = sd->child;
                         continue;
                 }
   
- -              new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+ +              new_cpu = find_idlest_cpu(group, t, cpu);
                 if (new_cpu == -1 || new_cpu == cpu) {
                         /* Now try balancing at a lower domain level of cpu */
                         sd = sd->child;
@@@ -2219,10 -2218,10 +2223,10 @@@
   
                 /* Now try balancing at a lower domain level of new_cpu */
                 cpu = new_cpu;
+ +              weight = cpumask_weight(sched_domain_span(sd));
                 sd = NULL;
- -              weight = cpus_weight(span);
                 for_each_domain(cpu, tmp) {
- -                      if (weight <= cpus_weight(tmp->span))
+ +                      if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                 break;
                         if (tmp->flags & flag)
                                 sd = tmp;
@@@ -2267,7 -2266,7 +2271,7 @@@ static int try_to_wake_up(struct task_s
                 cpu = task_cpu(p);
   
                 for_each_domain(this_cpu, sd) {
- -                      if (cpu_isset(cpu, sd->span)) {
+ +                      if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                 update_shares(sd);
                                 break;
                         }
@@@ -2277,6 -2276,7 +2281,7 @@@
   
         smp_wmb();
         rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
         old_state = p->state;
         if (!(old_state & state))
                 goto out;
@@@ -2315,7 -2315,7 +2320,7 @@@
         else {
                 struct sched_domain *sd;
                 for_each_domain(this_cpu, sd) {
- -                      if (cpu_isset(cpu, sd->span)) {
+ +                      if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                 schedstat_inc(sd, ttwu_wake_remote);
                                 break;
                         }
@@@ -2334,12 -2334,11 +2339,11 @@@ out_activate
                 schedstat_inc(p, se.nr_wakeups_local);
         else
                 schedstat_inc(p, se.nr_wakeups_remote);
-       update_rq_clock(rq);
         activate_task(rq, p, 1);
         success = 1;
   
   out_running:
-       trace_sched_wakeup(rq, p);
+       trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
   
         p->state = TASK_RUNNING;
@@@ -2472,7 -2471,7 +2476,7 @@@ void wake_up_new_task(struct task_struc
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-       trace_sched_wakeup_new(rq, p);
+       trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, 0);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@@ -2847,11 -2846,10 +2851,10 @@@ static void sched_migrate_task(struct t
         struct rq *rq;
   
         rq = task_rq_lock(p, &flags);
- -      if (!cpu_isset(dest_cpu, p->cpus_allowed)
+ +      if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
   
-       trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@@ -2913,7 -2911,7 +2916,7 @@@ int can_migrate_task(struct task_struc
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
          * 3) are cache-hot on their current CPU.
          */
- -      if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+ +      if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                 schedstat_inc(p, se.nr_failed_migrations_affine);
                 return 0;
         }
@@@ -3088,7 -3086,7 +3091,7 @@@ static int move_one_task(struct rq *thi
   static struct sched_group *
   find_busiest_group(struct sched_domain *sd, int this_cpu,
                    unsigned long *imbalance, enum cpu_idle_type idle,
- -                 int *sd_idle, const cpumask_t *cpus, int *balance)
+ +                 int *sd_idle, const struct cpumask *cpus, int *balance)
   {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@@ -3124,11 -3122,10 +3127,11 @@@
                 unsigned long sum_avg_load_per_task;
                 unsigned long avg_load_per_task;
   
- -              local_group = cpu_isset(this_cpu, group->cpumask);
+ +              local_group = cpumask_test_cpu(this_cpu,
+ +                                             sched_group_cpus(group));
   
                 if (local_group)
- -                      balance_cpu = first_cpu(group->cpumask);
+ +                      balance_cpu = cpumask_first(sched_group_cpus(group));
   
                 /* Tally up the load of all CPUs in the group */
                 sum_weighted_load = sum_nr_running = avg_load = 0;
@@@ -3137,8 -3134,13 +3140,8 @@@
                 max_cpu_load = 0;
                 min_cpu_load = ~0UL;
   
- -              for_each_cpu_mask_nr(i, group->cpumask) {
- -                      struct rq *rq;
- -
- -                      if (!cpu_isset(i, *cpus))
- -                              continue;
- -
- -                      rq = cpu_rq(i);
+ +              for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ +                      struct rq *rq = cpu_rq(i);
   
                         if (*sd_idle && rq->nr_running)
                                 *sd_idle = 0;
@@@ -3249,8 -3251,8 +3252,8 @@@
                  */
                 if ((sum_nr_running < min_nr_running) ||
                     (sum_nr_running == min_nr_running &&
- -                   first_cpu(group->cpumask) <
- -                   first_cpu(group_min->cpumask))) {
+ +                   cpumask_first(sched_group_cpus(group)) >
+ +                   cpumask_first(sched_group_cpus(group_min)))) {
                         group_min = group;
                         min_nr_running = sum_nr_running;
                         min_load_per_task = sum_weighted_load /
@@@ -3265,8 -3267,8 +3268,8 @@@
                 if (sum_nr_running <= group_capacity - 1) {
                         if (sum_nr_running > leader_nr_running ||
                             (sum_nr_running == leader_nr_running &&
- -                           first_cpu(group->cpumask) >
- -                            first_cpu(group_leader->cpumask))) {
+ +                           cpumask_first(sched_group_cpus(group)) <
+ +                           cpumask_first(sched_group_cpus(group_leader)))) {
                                 group_leader = group;
                                 leader_nr_running = sum_nr_running;
                         }
@@@ -3392,10 -3394,6 +3395,10 @@@ out_balanced
   
         if (this == group_leader && group_leader != group_min) {
                 *imbalance = min_load_per_task;
+ +              if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ +                      cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ +                              cpumask_first(sched_group_cpus(group_leader));
+ +              }
                 return group_min;
         }
   #endif
@@@ -3409,16 -3407,16 +3412,16 @@@ ret
    */
   static struct rq *
   find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- -                 unsigned long imbalance, const cpumask_t *cpus)
+ +                 unsigned long imbalance, const struct cpumask *cpus)
   {
         struct rq *busiest = NULL, *rq;
         unsigned long max_load = 0;
         int i;
   
- -      for_each_cpu_mask_nr(i, group->cpumask) {
+ +      for_each_cpu(i, sched_group_cpus(group)) {
                 unsigned long wl;
   
- -              if (!cpu_isset(i, *cpus))
+ +              if (!cpumask_test_cpu(i, cpus))
                         continue;
   
                 rq = cpu_rq(i);
@@@ -3448,7 -3446,7 +3451,7 @@@
    */
   static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
- -                      int *balance, cpumask_t *cpus)
+ +                      int *balance, struct cpumask *cpus)
   {
         int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
         struct sched_group *group;
@@@ -3456,7 -3454,7 +3459,7 @@@
         struct rq *busiest;
         unsigned long flags;
   
- -      cpus_setall(*cpus);
+ +      cpumask_setall(cpus);
   
         /*
          * When power savings policy is enabled for the parent domain, idle
@@@ -3516,8 -3514,8 +3519,8 @@@ redo
   
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(all_pinned)) {
- -                      cpu_clear(cpu_of(busiest), *cpus);
- -                      if (!cpus_empty(*cpus))
+ +                      cpumask_clear_cpu(cpu_of(busiest), cpus);
+ +                      if (!cpumask_empty(cpus))
                                 goto redo;
                         goto out_balanced;
                 }
@@@ -3534,8 -3532,7 +3537,8 @@@
                         /* don't kick the migration_thread, if the curr
                          * task on busiest cpu can't be moved to this_cpu
                          */
- -                      if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ +                      if (!cpumask_test_cpu(this_cpu,
+ +                                            &busiest->curr->cpus_allowed)) {
                                 spin_unlock_irqrestore(&busiest->lock, flags);
                                 all_pinned = 1;
                                 goto out_one_pinned;
@@@ -3610,7 -3607,7 +3613,7 @@@ out
    */
   static int
   load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
- -                      cpumask_t *cpus)
+ +                      struct cpumask *cpus)
   {
         struct sched_group *group;
         struct rq *busiest = NULL;
@@@ -3619,7 -3616,7 +3622,7 @@@
         int sd_idle = 0;
         int all_pinned = 0;
   
- -      cpus_setall(*cpus);
+ +      cpumask_setall(cpus);
   
         /*
          * When power savings policy is enabled for the parent domain, idle
@@@ -3663,71 -3660,17 +3666,71 @@@ redo
                 double_unlock_balance(this_rq, busiest);
   
                 if (unlikely(all_pinned)) {
- -                      cpu_clear(cpu_of(busiest), *cpus);
- -                      if (!cpus_empty(*cpus))
+ +                      cpumask_clear_cpu(cpu_of(busiest), cpus);
+ +                      if (!cpumask_empty(cpus))
                                 goto redo;
                 }
         }
   
         if (!ld_moved) {
+ +              int active_balance = 0;
+ +
                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                         return -1;
+ +
+ +              if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+ +                      return -1;
+ +
+ +              if (sd->nr_balance_failed++ < 2)
+ +                      return -1;
+ +
+ +              /*
+ +               * The only task running on a non-idle cpu can be moved to this
+ +               * cpu in an attempt to completely free up the other CPU
+ +               * package. The same method used to move tasks in load_balance()
+ +               * has been extended to load_balance_newidle() to speed up
+ +               * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
+ +               *
+ +               * The package power saving logic comes from
+ +               * find_busiest_group().  If there is no imbalance, then
+ +               * f_b_g() will return NULL.  However when sched_mc={1,2} then
+ +               * f_b_g() will select a group from which a running task may be
+ +               * pulled to this cpu in order to make the other package idle.
+ +               * If there is no opportunity to make a package idle and if
+ +               * there is no imbalance, then f_b_g() will return NULL and no
+ +               * action will be taken in load_balance_newidle().
+ +               *
+ +               * Under normal task pull operation due to imbalance, there
+ +               * will be more than one task in the source run queue and
+ +               * move_tasks() will succeed.  ld_moved will be true and this
+ +               * active balance code will not be triggered.
+ +               */
+ +
+ +              /* Lock busiest in correct order while this_rq is held */
+ +              double_lock_balance(this_rq, busiest);
+ +
+ +              /*
+ +               * don't kick the migration_thread, if the curr
+ +               * task on busiest cpu can't be moved to this_cpu
+ +               */
+ +              if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ +                      double_unlock_balance(this_rq, busiest);
+ +                      all_pinned = 1;
+ +                      return ld_moved;
+ +              }
+ +
+ +              if (!busiest->active_balance) {
+ +                      busiest->active_balance = 1;
+ +                      busiest->push_cpu = this_cpu;
+ +                      active_balance = 1;
+ +              }
+ +
+ +              double_unlock_balance(this_rq, busiest);
+ +              if (active_balance)
+ +                      wake_up_process(busiest->migration_thread);
+ +
         } else
                 sd->nr_balance_failed = 0;
   
@@@ -3753,10 -3696,7 +3756,10 @@@ static void idle_balance(int this_cpu, 
         struct sched_domain *sd;
         int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
- -      cpumask_t tmpmask;
+ +      cpumask_var_t tmpmask;
+ +
+ +      if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+ +              return;
   
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
@@@ -3767,7 -3707,7 +3770,7 @@@
                 if (sd->flags & SD_BALANCE_NEWIDLE)
                         /* If we've pulled tasks over stop searching: */
                         pulled_task = load_balance_newidle(this_cpu, this_rq,
- -                                                         sd, &tmpmask);
+ +                                                         sd, tmpmask);
   
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
@@@ -3782,7 -3722,6 +3785,7 @@@
                  */
                 this_rq->next_balance = next_balance;
         }
+ +      free_cpumask_var(tmpmask);
   }
   
   /*
@@@ -3820,7 -3759,7 +3823,7 @@@ static void active_load_balance(struct 
         /* Search for an sd spanning us and the target CPU. */
         for_each_domain(target_cpu, sd) {
                 if ((sd->flags & SD_LOAD_BALANCE) &&
- -                  cpu_isset(busiest_cpu, sd->span))
+ +                  cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                 break;
         }
   
@@@ -3839,9 -3778,10 +3842,9 @@@
   #ifdef CONFIG_NO_HZ
   static struct {
         atomic_t load_balancer;
- -      cpumask_t cpu_mask;
+ +      cpumask_var_t cpu_mask;
   } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
- -      .cpu_mask = CPU_MASK_NONE,
   };
   
   /*
@@@ -3869,7 -3809,7 +3872,7 @@@ int select_nohz_load_balancer(int stop_
         int cpu = smp_processor_id();
   
         if (stop_tick) {
- -              cpu_set(cpu, nohz.cpu_mask);
+ +              cpumask_set_cpu(cpu, nohz.cpu_mask);
                 cpu_rq(cpu)->in_nohz_recently = 1;
   
                 /*
@@@ -3883,7 -3823,7 +3886,7 @@@
                 }
   
                 /* time for ilb owner also to sleep */
- -              if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ +              if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                         if (atomic_read(&nohz.load_balancer) == cpu)
                                 atomic_set(&nohz.load_balancer, -1);
                         return 0;
@@@ -3896,10 -3836,10 +3899,10 @@@
                 } else if (atomic_read(&nohz.load_balancer) == cpu)
                         return 1;
         } else {
- -              if (!cpu_isset(cpu, nohz.cpu_mask))
+ +              if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
   
- -              cpu_clear(cpu, nohz.cpu_mask);
+ +              cpumask_clear_cpu(cpu, nohz.cpu_mask);
   
                 if (atomic_read(&nohz.load_balancer) == cpu)
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3927,11 -3867,7 +3930,11 @@@ static void rebalance_domains(int cpu, 
         unsigned long next_balance = jiffies + 60*HZ;
         int update_next_balance = 0;
         int need_serialize;
- -      cpumask_t tmp;
+ +      cpumask_var_t tmp;
+ +
+ +      /* Fails alloc?  Rebalancing probably not a priority right now. */
+ +      if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+ +              return;
   
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
@@@ -3956,7 -3892,7 +3959,7 @@@
                 }
   
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
- -                      if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+ +                      if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                 /*
                                  * We've pulled tasks over so either we're no
                                  * longer idle, or one of our SMT siblings is
@@@ -3990,8 -3926,6 +3993,8 @@@ out
          */
         if (likely(update_next_balance))
                 rq->next_balance = next_balance;
+ +
+ +      free_cpumask_var(tmp);
   }
   
   /*
@@@ -4016,13 -3950,12 +4019,13 @@@ static void run_rebalance_domains(struc
          */
         if (this_rq->idle_at_tick &&
             atomic_read(&nohz.load_balancer) == this_cpu) {
- -              cpumask_t cpus = nohz.cpu_mask;
                 struct rq *rq;
                 int balance_cpu;
   
- -              cpu_clear(this_cpu, cpus);
- -              for_each_cpu_mask_nr(balance_cpu, cpus) {
+ +              for_each_cpu(balance_cpu, nohz.cpu_mask) {
+ +                      if (balance_cpu == this_cpu)
+ +                              continue;
+ +
                         /*
                          * If this cpu gets work to do, stop the load balancing
                          * work being done for other cpus. Next load
@@@ -4060,7 -3993,7 +4063,7 @@@ static inline void trigger_load_balance
                 rq->in_nohz_recently = 0;
   
                 if (atomic_read(&nohz.load_balancer) == cpu) {
- -                      cpu_clear(cpu, nohz.cpu_mask);
+ +                      cpumask_clear_cpu(cpu, nohz.cpu_mask);
                         atomic_set(&nohz.load_balancer, -1);
                 }
   
@@@ -4073,7 -4006,7 +4076,7 @@@
                          * TBD: Traverse the sched domains and nominate
                          * the nearest cpu in the nohz.cpu_mask.
                          */
- -                      int ilb = first_cpu(nohz.cpu_mask);
+ +                      int ilb = cpumask_first(nohz.cpu_mask);
   
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@@ -4085,7 -4018,7 +4088,7 @@@
          * cpus with ticks stopped, is it time for that to stop?
          */
         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- -          cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ +          cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                 resched_cpu(cpu);
                 return;
         }
@@@ -4095,7 -4028,7 +4098,7 @@@
          * someone else, then no need raise the SCHED_SOFTIRQ
          */
         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- -          cpu_isset(cpu, nohz.cpu_mask))
+ +          cpumask_test_cpu(cpu, nohz.cpu_mask))
                 return;
   #endif
         if (time_after_eq(jiffies, rq->next_balance))
@@@ -5187,6 -5120,22 +5190,22 @@@ __setscheduler(struct rq *rq, struct ta
         set_load_weight(p);
   }
   
+ /*
+  * check that the current process's EUID matches the target's EUID or UID
+  */
+ static bool check_same_owner(struct task_struct *p)
+ {
+       const struct cred *cred = current_cred(), *pcred;
+       bool match;
+ 
+       rcu_read_lock();
+       pcred = __task_cred(p);
+       match = (cred->euid == pcred->euid ||
+                cred->euid == pcred->uid);
+       rcu_read_unlock();
+       return match;
+ }
+ 
   static int __sched_setscheduler(struct task_struct *p, int policy,
                                 struct sched_param *param, bool user)
   {
@@@ -5246,8 -5195,7 +5265,7 @@@ recheck
                         return -EPERM;
   
                 /* can't change other user's priorities */
-               if ((current->euid != p->euid) &&
-                   (current->euid != p->uid))
+               if (!check_same_owner(p))
                         return -EPERM;
         }
   
@@@ -5453,9 -5401,10 +5471,9 @@@ out_unlock
         return retval;
   }
   
- -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
   {
- -      cpumask_t cpus_allowed;
- -      cpumask_t new_mask = *in_mask;
+ +      cpumask_var_t cpus_allowed, new_mask;
         struct task_struct *p;
         int retval;
   
@@@ -5477,58 -5426,45 +5495,57 @@@
         get_task_struct(p);
         read_unlock(&tasklist_lock);
   
+ +      if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ +              retval = -ENOMEM;
+ +              goto out_put_task;
+ +      }
+ +      if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ +              retval = -ENOMEM;
+ +              goto out_free_cpus_allowed;
+ +      }
         retval = -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-                       !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
   
         retval = security_task_setscheduler(p, 0, NULL);
         if (retval)
                 goto out_unlock;
   
- -      cpuset_cpus_allowed(p, &cpus_allowed);
- -      cpus_and(new_mask, new_mask, cpus_allowed);
+ +      cpuset_cpus_allowed(p, cpus_allowed);
+ +      cpumask_and(new_mask, in_mask, cpus_allowed);
    again:
- -      retval = set_cpus_allowed_ptr(p, &new_mask);
+ +      retval = set_cpus_allowed_ptr(p, new_mask);
   
         if (!retval) {
- -              cpuset_cpus_allowed(p, &cpus_allowed);
- -              if (!cpus_subset(new_mask, cpus_allowed)) {
+ +              cpuset_cpus_allowed(p, cpus_allowed);
+ +              if (!cpumask_subset(new_mask, cpus_allowed)) {
                         /*
                          * We must have raced with a concurrent cpuset
                          * update. Just reset the cpus_allowed to the
                          * cpuset's cpus_allowed
                          */
- -                      new_mask = cpus_allowed;
+ +                      cpumask_copy(new_mask, cpus_allowed);
                         goto again;
                 }
         }
   out_unlock:
+ +      free_cpumask_var(new_mask);
+ +out_free_cpus_allowed:
+ +      free_cpumask_var(cpus_allowed);
+ +out_put_task:
         put_task_struct(p);
         put_online_cpus();
         return retval;
   }
   
   static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- -                           cpumask_t *new_mask)
+ +                           struct cpumask *new_mask)
   {
- -      if (len < sizeof(cpumask_t)) {
- -              memset(new_mask, 0, sizeof(cpumask_t));
- -      } else if (len > sizeof(cpumask_t)) {
- -              len = sizeof(cpumask_t);
- -      }
+ +      if (len < cpumask_size())
+ +              cpumask_clear(new_mask);
+ +      else if (len > cpumask_size())
+ +              len = cpumask_size();
+ +
         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
   }
   
@@@ -5541,20 -5477,17 +5558,20 @@@
   asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                       unsigned long __user *user_mask_ptr)
   {
- -      cpumask_t new_mask;
+ +      cpumask_var_t new_mask;
         int retval;
   
- -      retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
- -      if (retval)
- -              return retval;
+ +      if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ +              return -ENOMEM;
   
- -      return sched_setaffinity(pid, &new_mask);
+ +      retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ +      if (retval == 0)
+ +              retval = sched_setaffinity(pid, new_mask);
+ +      free_cpumask_var(new_mask);
+ +      return retval;
   }
   
- -long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ +long sched_getaffinity(pid_t pid, struct cpumask *mask)
   {
         struct task_struct *p;
         int retval;
@@@ -5571,7 -5504,7 +5588,7 @@@
         if (retval)
                 goto out_unlock;
   
- -      cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+ +      cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
   
   out_unlock:
         read_unlock(&tasklist_lock);
@@@ -5590,24 -5523,19 +5607,24 @@@ asmlinkage long sys_sched_getaffinity(p
                                       unsigned long __user *user_mask_ptr)
   {
         int ret;
- -      cpumask_t mask;
+ +      cpumask_var_t mask;
   
- -      if (len < sizeof(cpumask_t))
+ +      if (len < cpumask_size())
                 return -EINVAL;
   
- -      ret = sched_getaffinity(pid, &mask);
- -      if (ret < 0)
- -              return ret;
+ +      if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ +              return -ENOMEM;
   
- -      if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
- -              return -EFAULT;
+ +      ret = sched_getaffinity(pid, mask);
+ +      if (ret == 0) {
+ +              if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ +                      ret = -EFAULT;
+ +              else
+ +                      ret = cpumask_size();
+ +      }
+ +      free_cpumask_var(mask);
   
- -      return sizeof(cpumask_t);
+ +      return ret;
   }
   
   /**
@@@ -5949,7 -5877,7 +5966,7 @@@ void __cpuinit init_idle(struct task_st
         idle->se.exec_start = sched_clock();
   
         idle->prio = idle->normal_prio = MAX_PRIO;
- -      idle->cpus_allowed = cpumask_of_cpu(cpu);
+ +      cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
         __set_task_cpu(idle, cpu);
   
         rq->curr = rq->idle = idle;
@@@ -5976,9 -5904,9 +5993,9 @@@
    * indicates which cpus entered this state. This is used
    * in the rcu update to wait only for active cpus. For system
    * which do not switch off the HZ timer nohz_cpu_mask should
- - * always be CPU_MASK_NONE.
+ + * always be CPU_BITS_NONE.
    */
- -cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ +cpumask_var_t nohz_cpu_mask;
   
   /*
    * Increase the granularity value when there are more CPUs,
@@@ -6033,7 -5961,7 +6050,7 @@@ static inline void sched_init_granulari
    * task must not exit() & deallocate itself prematurely. The
    * call is not atomic; no spinlocks may be held.
    */
- -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
   {
         struct migration_req req;
         unsigned long flags;
@@@ -6041,13 -5969,13 +6058,13 @@@
         int ret = 0;
   
         rq = task_rq_lock(p, &flags);
- -      if (!cpus_intersects(*new_mask, cpu_online_map)) {
+ +      if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
   
         if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
- -                   !cpus_equal(p->cpus_allowed, *new_mask))) {
+ +                   !cpumask_equal(&p->cpus_allowed, new_mask))) {
                 ret = -EINVAL;
                 goto out;
         }
@@@ -6055,15 -5983,15 +6072,15 @@@
         if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);
         else {
- -              p->cpus_allowed = *new_mask;
- -              p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+ +              cpumask_copy(&p->cpus_allowed, new_mask);
+ +              p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
         }
   
         /* Can the task run on the task's current CPU? If so, we're done */
- -      if (cpu_isset(task_cpu(p), *new_mask))
+ +      if (cpumask_test_cpu(task_cpu(p), new_mask))
                 goto out;
   
- -      if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+ +      if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                 /* Need help from migration thread: drop lock and wait. */
                 task_rq_unlock(rq, &flags);
                 wake_up_process(rq->migration_thread);
@@@ -6105,7 -6033,7 +6122,7 @@@ static int __migrate_task(struct task_s
         if (task_cpu(p) != src_cpu)
                 goto done;
         /* Affinity changed (again). */
- -      if (!cpu_isset(dest_cpu, p->cpus_allowed))
+ +      if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                 goto fail;
   
         on_rq = p->se.on_rq;
@@@ -6202,43 -6130,50 +6219,43 @@@ static int __migrate_task_irq(struct ta
    */
   static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
   {
- -      unsigned long flags;
- -      cpumask_t mask;
- -      struct rq *rq;
         int dest_cpu;
+ +      /* FIXME: Use cpumask_of_node here. */
+ +      cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+ +      const struct cpumask *nodemask = &_nodemask;
+ +
+ +again:
+ +      /* Look for allowed, online CPU in same node. */
+ +      for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+ +              if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ +                      goto move;
+ +
+ +      /* Any allowed, online CPU? */
+ +      dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+ +      if (dest_cpu < nr_cpu_ids)
+ +              goto move;
+ +
+ +      /* No more Mr. Nice Guy. */
+ +      if (dest_cpu >= nr_cpu_ids) {
+ +              cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+ +              dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
   
- -      do {
- -              /* On same node? */
- -              mask = node_to_cpumask(cpu_to_node(dead_cpu));
- -              cpus_and(mask, mask, p->cpus_allowed);
- -              dest_cpu = any_online_cpu(mask);
- -
- -              /* On any allowed CPU? */
- -              if (dest_cpu >= nr_cpu_ids)
- -                      dest_cpu = any_online_cpu(p->cpus_allowed);
- -
- -              /* No more Mr. Nice Guy. */
- -              if (dest_cpu >= nr_cpu_ids) {
- -                      cpumask_t cpus_allowed;
- -
- -                      cpuset_cpus_allowed_locked(p, &cpus_allowed);
- -                      /*
- -                       * Try to stay on the same cpuset, where the
- -                       * current cpuset may be a subset of all cpus.
- -                       * The cpuset_cpus_allowed_locked() variant of
- -                       * cpuset_cpus_allowed() will not block. It must be
- -                       * called within calls to cpuset_lock/cpuset_unlock.
- -                       */
- -                      rq = task_rq_lock(p, &flags);
- -                      p->cpus_allowed = cpus_allowed;
- -                      dest_cpu = any_online_cpu(p->cpus_allowed);
- -                      task_rq_unlock(rq, &flags);
- -
- -                      /*
- -                       * Don't tell them about moving exiting tasks or
- -                       * kernel threads (both mm NULL), since they never
- -                       * leave kernel.
- -                       */
- -                      if (p->mm && printk_ratelimit()) {
- -                              printk(KERN_INFO "process %d (%s) no "
- -                                     "longer affine to cpu%d\n",
- -                                      task_pid_nr(p), p->comm, dead_cpu);
- -                      }
+ +              /*
+ +               * Don't tell them about moving exiting tasks or
+ +               * kernel threads (both mm NULL), since they never
+ +               * leave kernel.
+ +               */
+ +              if (p->mm && printk_ratelimit()) {
+ +                      printk(KERN_INFO "process %d (%s) no "
+ +                             "longer affine to cpu%d\n",
+ +                             task_pid_nr(p), p->comm, dead_cpu);
                 }
- -      } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+ +      }
+ +
+ +move:
+ +      /* It can have affinity changed while we were choosing. */
+ +      if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+ +              goto again;
   }
   
   /*
@@@ -6250,7 -6185,7 +6267,7 @@@
    */
   static void migrate_nr_uninterruptible(struct rq *rq_src)
   {
- -      struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+ +      struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
         unsigned long flags;
   
         local_irq_save(flags);
@@@ -6540,7 -6475,7 +6557,7 @@@ static void set_rq_online(struct rq *rq
         if (!rq->online) {
                 const struct sched_class *class;
   
- -              cpu_set(rq->cpu, rq->rd->online);
+ +              cpumask_set_cpu(rq->cpu, rq->rd->online);
                 rq->online = 1;
   
                 for_each_class(class) {
@@@ -6560,7 -6495,7 +6577,7 @@@ static void set_rq_offline(struct rq *r
                                 class->rq_offline(rq);
                 }
   
- -              cpu_clear(rq->cpu, rq->rd->online);
+ +              cpumask_clear_cpu(rq->cpu, rq->rd->online);
                 rq->online = 0;
         }
   }
@@@ -6601,7 -6536,7 +6618,7 @@@ migration_call(struct notifier_block *n
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
- -                      BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ +                      BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
   
                         set_rq_online(rq);
                 }
@@@ -6615,7 -6550,7 +6632,7 @@@
                         break;
                 /* Unbind it from offline cpu so it can run. Fall thru. */
                 kthread_bind(cpu_rq(cpu)->migration_thread,
- -                           any_online_cpu(cpu_online_map));
+ +                           cpumask_any(cpu_online_mask));
                 kthread_stop(cpu_rq(cpu)->migration_thread);
                 cpu_rq(cpu)->migration_thread = NULL;
                 break;
@@@ -6665,7 -6600,7 +6682,7 @@@
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
- -                      BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ +                      BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
                 spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6704,13 -6639,13 +6721,13 @@@ early_initcall(migration_init)
   #ifdef CONFIG_SCHED_DEBUG
   
   static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
- -                                cpumask_t *groupmask)
+ +                                struct cpumask *groupmask)
   {
         struct sched_group *group = sd->groups;
         char str[256];
   
- -      cpulist_scnprintf(str, sizeof(str), &sd->span);
- -      cpus_clear(*groupmask);
+ +      cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+ +      cpumask_clear(groupmask);
   
         printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
   
@@@ -6724,11 -6659,11 +6741,11 @@@
   
         printk(KERN_CONT "span %s level %s\n", str, sd->name);
   
- -      if (!cpu_isset(cpu, sd->span)) {
+ +      if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                 printk(KERN_ERR "ERROR: domain->span does not contain "
                                 "CPU%d\n", cpu);
         }
- -      if (!cpu_isset(cpu, group->cpumask)) {
+ +      if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                 printk(KERN_ERR "ERROR: domain->groups does not contain"
                                 " CPU%d\n", cpu);
         }
@@@ -6748,32 -6683,31 +6765,32 @@@
                         break;
                 }
   
- -              if (!cpus_weight(group->cpumask)) {
+ +              if (!cpumask_weight(sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: empty group\n");
                         break;
                 }
   
- -              if (cpus_intersects(*groupmask, group->cpumask)) {
+ +              if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: repeated CPUs\n");
                         break;
                 }
   
- -              cpus_or(*groupmask, *groupmask, group->cpumask);
+ +              cpumask_or(groupmask, groupmask, sched_group_cpus(group));
   
- -              cpulist_scnprintf(str, sizeof(str), &group->cpumask);
+ +              cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                 printk(KERN_CONT " %s", str);
   
                 group = group->next;
         } while (group != sd->groups);
         printk(KERN_CONT "\n");
   
- -      if (!cpus_equal(sd->span, *groupmask))
+ +      if (!cpumask_equal(sched_domain_span(sd), groupmask))
                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
   
- -      if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+ +      if (sd->parent &&
+ +          !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                 printk(KERN_ERR "ERROR: parent span is not a superset "
                         "of domain->span\n");
         return 0;
@@@ -6781,7 -6715,7 +6798,7 @@@
   
   static void sched_domain_debug(struct sched_domain *sd, int cpu)
   {
- -      cpumask_t *groupmask;
+ +      cpumask_var_t groupmask;
         int level = 0;
   
         if (!sd) {
@@@ -6791,7 -6725,8 +6808,7 @@@
   
         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
   
- -      groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
- -      if (!groupmask) {
+ +      if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                 return;
         }
@@@ -6804,7 -6739,7 +6821,7 @@@
                 if (!sd)
                         break;
         }
- -      kfree(groupmask);
+ +      free_cpumask_var(groupmask);
   }
   #else /* !CONFIG_SCHED_DEBUG */
   # define sched_domain_debug(sd, cpu) do { } while (0)
@@@ -6812,7 -6747,7 +6829,7 @@@
   
   static int sd_degenerate(struct sched_domain *sd)
   {
- -      if (cpus_weight(sd->span) == 1)
+ +      if (cpumask_weight(sched_domain_span(sd)) == 1)
                 return 1;
   
         /* Following flags need at least 2 groups */
@@@ -6843,7 -6778,7 +6860,7 @@@ sd_parent_degenerate(struct sched_domai
         if (sd_degenerate(parent))
                 return 1;
   
- -      if (!cpus_equal(sd->span, parent->span))
+ +      if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                 return 0;
   
         /* Does parent contain flags not in child? */
@@@ -6867,16 -6802,6 +6884,16 @@@
         return 1;
   }
   
+ +static void free_rootdomain(struct root_domain *rd)
+ +{
+ +      cpupri_cleanup(&rd->cpupri);
+ +
+ +      free_cpumask_var(rd->rto_mask);
+ +      free_cpumask_var(rd->online);
+ +      free_cpumask_var(rd->span);
+ +      kfree(rd);
+ +}
+ +
   static void rq_attach_root(struct rq *rq, struct root_domain *rd)
   {
         unsigned long flags;
@@@ -6886,63 -6811,38 +6903,63 @@@
         if (rq->rd) {
                 struct root_domain *old_rd = rq->rd;
   
- -              if (cpu_isset(rq->cpu, old_rd->online))
+ +              if (cpumask_test_cpu(rq->cpu, old_rd->online))
                         set_rq_offline(rq);
   
- -              cpu_clear(rq->cpu, old_rd->span);
+ +              cpumask_clear_cpu(rq->cpu, old_rd->span);
   
                 if (atomic_dec_and_test(&old_rd->refcount))
- -                      kfree(old_rd);
+ +                      free_rootdomain(old_rd);
         }
   
         atomic_inc(&rd->refcount);
         rq->rd = rd;
   
- -      cpu_set(rq->cpu, rd->span);
- -      if (cpu_isset(rq->cpu, cpu_online_map))
+ +      cpumask_set_cpu(rq->cpu, rd->span);
+ +      if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                 set_rq_online(rq);
   
         spin_unlock_irqrestore(&rq->lock, flags);
   }
   
- -static void init_rootdomain(struct root_domain *rd)
+ +static int init_rootdomain(struct root_domain *rd, bool bootmem)
   {
         memset(rd, 0, sizeof(*rd));
   
- -      cpus_clear(rd->span);
- -      cpus_clear(rd->online);
+ +      if (bootmem) {
+ +              alloc_bootmem_cpumask_var(&def_root_domain.span);
+ +              alloc_bootmem_cpumask_var(&def_root_domain.online);
+ +              alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+ +              cpupri_init(&rd->cpupri, true);
+ +              return 0;
+ +      }
+ +
+ +      if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ +              goto out;
+ +      if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ +              goto free_span;
+ +      if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ +              goto free_online;
+ +
+ +      if (cpupri_init(&rd->cpupri, false) != 0)
+ +              goto free_rto_mask;
+ +      return 0;
   
- -      cpupri_init(&rd->cpupri);
+ +free_rto_mask:
+ +      free_cpumask_var(rd->rto_mask);
+ +free_online:
+ +      free_cpumask_var(rd->online);
+ +free_span:
+ +      free_cpumask_var(rd->span);
+ +out:
+ +      /* rd is owned by the caller; freeing it here would double-free it */
+ +      return -ENOMEM;
   }
   
   static void init_defrootdomain(void)
   {
- -      init_rootdomain(&def_root_domain);
+ +      init_rootdomain(&def_root_domain, true);
+ +
         atomic_set(&def_root_domain.refcount, 1);
   }
   
@@@ -6954,10 -6854,7 +6971,10 @@@ static struct root_domain *alloc_rootdo
         if (!rd)
                 return NULL;
   
- -      init_rootdomain(rd);
+ +      if (init_rootdomain(rd, false) != 0) {
+ +              kfree(rd);
+ +              return NULL;
+ +      }
   
         return rd;
   }
@@@ -6999,12 -6896,19 +7016,12 @@@ cpu_attach_domain(struct sched_domain *
   }
   
   /* cpus with isolated domains */
- -static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ +static cpumask_var_t cpu_isolated_map;
   
   /* Setup the mask of cpus configured for isolated domains */
   static int __init isolated_cpu_setup(char *str)
   {
- -      static int __initdata ints[NR_CPUS];
- -      int i;
- -
- -      str = get_options(str, ARRAY_SIZE(ints), ints);
- -      cpus_clear(cpu_isolated_map);
- -      for (i = 1; i <= ints[0]; i++)
- -              if (ints[i] < NR_CPUS)
- -                      cpu_set(ints[i], cpu_isolated_map);
+ +      cpulist_parse(str, cpu_isolated_map);
         return 1;
   }
   
@@@ -7013,43 -6917,42 +7030,43 @@@ __setup("isolcpus=", isolated_cpu_setup
   /*
    * init_sched_build_groups takes the cpumask we wish to span, and a pointer
    * to a function which identifies what group(along with sched group) a CPU
- - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
- - * (due to the fact that we keep track of groups covered with a cpumask_t).
+ + * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
+ + * (due to the fact that we keep track of groups covered with a struct cpumask).
    *
    * init_sched_build_groups will build a circular linked list of the groups
    * covered by the given span, and will set each group's ->cpumask correctly,
    * and ->cpu_power to 0.
    */
   static void
- -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
- -                      int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ +init_sched_build_groups(const struct cpumask *span,
+ +                      const struct cpumask *cpu_map,
+ +                      int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                         struct sched_group **sg,
- -                                      cpumask_t *tmpmask),
- -                      cpumask_t *covered, cpumask_t *tmpmask)
+ +                                      struct cpumask *tmpmask),
+ +                      struct cpumask *covered, struct cpumask *tmpmask)
   {
         struct sched_group *first = NULL, *last = NULL;
         int i;
   
- -      cpus_clear(*covered);
+ +      cpumask_clear(covered);
   
- -      for_each_cpu_mask_nr(i, *span) {
+ +      for_each_cpu(i, span) {
                 struct sched_group *sg;
                 int group = group_fn(i, cpu_map, &sg, tmpmask);
                 int j;
   
- -              if (cpu_isset(i, *covered))
+ +              if (cpumask_test_cpu(i, covered))
                         continue;
   
- -              cpus_clear(sg->cpumask);
+ +              cpumask_clear(sched_group_cpus(sg));
                 sg->__cpu_power = 0;
   
- -              for_each_cpu_mask_nr(j, *span) {
+ +              for_each_cpu(j, span) {
                         if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                 continue;
   
- -                      cpu_set(j, *covered);
- -                      cpu_set(j, sg->cpumask);
+ +                      cpumask_set_cpu(j, covered);
+ +                      cpumask_set_cpu(j, sched_group_cpus(sg));
                 }
                 if (!first)
                         first = sg;
@@@ -7113,10 -7016,9 +7130,10 @@@ static int find_next_best_node(int node
    * should be one that prevents unnecessary balancing, but also spreads tasks
    * out optimally.
    */
- -static void sched_domain_node_span(int node, cpumask_t *span)
+ +static void sched_domain_node_span(int node, struct cpumask *span)
   {
         nodemask_t used_nodes;
+ +      /* FIXME: use cpumask_of_node() */
         node_to_cpumask_ptr(nodemask, node);
         int i;
   
@@@ -7137,34 -7039,19 +7154,34 @@@
   
   int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
   
+ +/*
+ + * The cpus mask in sched_group and sched_domain hangs off the end.
+ + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ + * for nr_cpu_ids < CONFIG_NR_CPUS.
+ + */
+ +struct static_sched_group {
+ +      struct sched_group sg;
+ +      DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+ +};
+ +
+ +struct static_sched_domain {
+ +      struct sched_domain sd;
+ +      DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ +};
+ +
   /*
    * SMT sched-domains:
    */
   #ifdef CONFIG_SCHED_SMT
- -static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
   
   static int
- -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- -               cpumask_t *unused)
+ +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+ +               struct sched_group **sg, struct cpumask *unused)
   {
         if (sg)
- -              *sg = &per_cpu(sched_group_cpus, cpu);
+ +              *sg = &per_cpu(sched_group_cpus, cpu).sg;
         return cpu;
   }
   #endif /* CONFIG_SCHED_SMT */
@@@ -7173,55 -7060,56 +7190,55 @@@
    * multi-core sched-domains:
    */
   #ifdef CONFIG_SCHED_MC
- -static DEFINE_PER_CPU(struct sched_domain, core_domains);
- -static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ +static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
   #endif /* CONFIG_SCHED_MC */
   
   #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
   static int
- -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- -                cpumask_t *mask)
+ +cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+ +                struct sched_group **sg, struct cpumask *mask)
   {
         int group;
   
- -      *mask = per_cpu(cpu_sibling_map, cpu);
- -      cpus_and(*mask, *mask, *cpu_map);
- -      group = first_cpu(*mask);
+ +      cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ +      group = cpumask_first(mask);
         if (sg)
- -              *sg = &per_cpu(sched_group_core, group);
+ +              *sg = &per_cpu(sched_group_core, group).sg;
         return group;
   }
   #elif defined(CONFIG_SCHED_MC)
   static int
- -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- -                cpumask_t *unused)
+ +cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+ +                struct sched_group **sg, struct cpumask *unused)
   {
         if (sg)
- -              *sg = &per_cpu(sched_group_core, cpu);
+ +              *sg = &per_cpu(sched_group_core, cpu).sg;
         return cpu;
   }
   #endif
   
- -static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- -static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
   
   static int
- -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- -                cpumask_t *mask)
+ +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+ +                struct sched_group **sg, struct cpumask *mask)
   {
         int group;
   #ifdef CONFIG_SCHED_MC
- -      *mask = *cpu_coregroup_mask(cpu);
+ +      /* FIXME: Use cpu_coregroup_mask. */
+ +      *mask = cpu_coregroup_map(cpu);
         cpus_and(*mask, *mask, *cpu_map);
- -      group = first_cpu(*mask);
+ +      group = cpumask_first(mask);
   #elif defined(CONFIG_SCHED_SMT)
- -      *mask = per_cpu(cpu_sibling_map, cpu);
- -      cpus_and(*mask, *mask, *cpu_map);
- -      group = first_cpu(*mask);
+ +      cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ +      group = cpumask_first(mask);
   #else
         group = cpu;
   #endif
         if (sg)
- -              *sg = &per_cpu(sched_group_phys, group);
+ +              *sg = &per_cpu(sched_group_phys, group).sg;
         return group;
   }
   
@@@ -7235,21 -7123,19 +7252,21 @@@ static DEFINE_PER_CPU(struct sched_doma
   static struct sched_group ***sched_group_nodes_bycpu;
   
   static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
   
- -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
- -                               struct sched_group **sg, cpumask_t *nodemask)
+ +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+ +                               struct sched_group **sg,
+ +                               struct cpumask *nodemask)
   {
         int group;
+ +      /* FIXME: use cpumask_of_node */
+ +      node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
   
- -      *nodemask = node_to_cpumask(cpu_to_node(cpu));
- -      cpus_and(*nodemask, *nodemask, *cpu_map);
- -      group = first_cpu(*nodemask);
+ +      cpumask_and(nodemask, pnodemask, cpu_map);
+ +      group = cpumask_first(nodemask);
   
         if (sg)
- -              *sg = &per_cpu(sched_group_allnodes, group);
+ +              *sg = &per_cpu(sched_group_allnodes, group).sg;
         return group;
   }
   
@@@ -7261,11 -7147,11 +7278,11 @@@ static void init_numa_sched_groups_powe
         if (!sg)
                 return;
         do {
- -              for_each_cpu_mask_nr(j, sg->cpumask) {
+ +              for_each_cpu(j, sched_group_cpus(sg)) {
                         struct sched_domain *sd;
   
- -                      sd = &per_cpu(phys_domains, j);
- -                      if (j != first_cpu(sd->groups->cpumask)) {
+ +                      sd = &per_cpu(phys_domains, j).sd;
+ +                      if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@@ -7282,12 -7168,11 +7299,12 @@@
   
   #ifdef CONFIG_NUMA
   /* Free memory allocated for various sched_group structures */
- -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ +static void free_sched_groups(const struct cpumask *cpu_map,
+ +                            struct cpumask *nodemask)
   {
         int cpu, i;
   
- -      for_each_cpu_mask_nr(cpu, *cpu_map) {
+ +      for_each_cpu(cpu, cpu_map) {
                 struct sched_group **sched_group_nodes
                         = sched_group_nodes_bycpu[cpu];
   
@@@ -7296,11 -7181,10 +7313,11 @@@
   
                 for (i = 0; i < nr_node_ids; i++) {
                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ +                      /* FIXME: Use cpumask_of_node */
+ +                      node_to_cpumask_ptr(pnodemask, i);
   
- -                      *nodemask = node_to_cpumask(i);
- -                      cpus_and(*nodemask, *nodemask, *cpu_map);
- -                      if (cpus_empty(*nodemask))
+ +                      cpus_and(*nodemask, *pnodemask, *cpu_map);
+ +                      if (cpumask_empty(nodemask))
                                 continue;
   
                         if (sg == NULL)
@@@ -7318,8 -7202,7 +7335,8 @@@ next_sg
         }
   }
   #else /* !CONFIG_NUMA */
- -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ +static void free_sched_groups(const struct cpumask *cpu_map,
+ +                            struct cpumask *nodemask)
   {
   }
   #endif /* CONFIG_NUMA */
@@@ -7345,7 -7228,7 +7362,7 @@@ static void init_sched_groups_power(in
   
         WARN_ON(!sd || !sd->groups);
   
- -      if (cpu != first_cpu(sd->groups->cpumask))
+ +      if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                 return;
   
         child = sd->child;
@@@ -7410,6 -7293,48 +7427,6 @@@ SD_INIT_FUNC(CPU
    SD_INIT_FUNC(MC)
   #endif
   
- -/*
- - * To minimize stack usage kmalloc room for cpumasks and share the
- - * space as the usage in build_sched_domains() dictates.  Used only
- - * if the amount of space is significant.
- - */
- -struct allmasks {
- -      cpumask_t tmpmask;                      /* make this one first */
- -      union {
- -              cpumask_t nodemask;
- -              cpumask_t this_sibling_map;
- -              cpumask_t this_core_map;
- -      };
- -      cpumask_t send_covered;
- -
- -#ifdef CONFIG_NUMA
- -      cpumask_t domainspan;
- -      cpumask_t covered;
- -      cpumask_t notcovered;
- -#endif
- -};
- -
- -#if   NR_CPUS > 128
- -#define SCHED_CPUMASK_DECLARE(v)      struct allmasks *v
- -static inline void sched_cpumask_alloc(struct allmasks **masks)
- -{
- -      *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
- -}
- -static inline void sched_cpumask_free(struct allmasks *masks)
- -{
- -      kfree(masks);
- -}
- -#else
- -#define SCHED_CPUMASK_DECLARE(v)      struct allmasks _v, *v = &_v
- -static inline void sched_cpumask_alloc(struct allmasks **masks)
- -{ }
- -static inline void sched_cpumask_free(struct allmasks *masks)
- -{ }
- -#endif
- -
- -#define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
- -                      ((unsigned long)(a) + offsetof(struct allmasks, v))
- -
   static int default_relax_domain_level = -1;
   
   static int __init setup_relax_domain_level(char *str)
@@@ -7449,38 -7374,17 +7466,38 @@@ static void set_domain_attribute(struc
    * Build sched domains for a given set of cpus and attach the sched domains
    * to the individual cpus
    */
- -static int __build_sched_domains(const cpumask_t *cpu_map,
+ +static int __build_sched_domains(const struct cpumask *cpu_map,
                                  struct sched_domain_attr *attr)
   {
- -      int i;
+ +      int i, err = -ENOMEM;
         struct root_domain *rd;
- -      SCHED_CPUMASK_DECLARE(allmasks);
- -      cpumask_t *tmpmask;
+ +      cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+ +              tmpmask;
   #ifdef CONFIG_NUMA
+ +      cpumask_var_t domainspan, covered, notcovered;
         struct sched_group **sched_group_nodes = NULL;
         int sd_allnodes = 0;
   
+ +      if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+ +              goto out;
+ +      if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+ +              goto free_domainspan;
+ +      if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+ +              goto free_covered;
+ +#endif
+ +
+ +      if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+ +              goto free_notcovered;
+ +      if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+ +              goto free_nodemask;
+ +      if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+ +              goto free_this_sibling_map;
+ +      if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+ +              goto free_this_core_map;
+ +      if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ +              goto free_send_covered;
+ +
+ +#ifdef CONFIG_NUMA
         /*
          * Allocate the per-node list of sched groups
          */
@@@ -7488,37 -7392,54 +7505,37 @@@
                                     GFP_KERNEL);
         if (!sched_group_nodes) {
                 printk(KERN_WARNING "Can not alloc sched group node list\n");
- -              return -ENOMEM;
+ +              goto free_tmpmask;
         }
   #endif
   
         rd = alloc_rootdomain();
         if (!rd) {
                 printk(KERN_WARNING "Cannot alloc root domain\n");
- -#ifdef CONFIG_NUMA
- -              kfree(sched_group_nodes);
- -#endif
- -              return -ENOMEM;
+ +              goto free_sched_groups;
         }
   
- -      /* get space for all scratch cpumask variables */
- -      sched_cpumask_alloc(&allmasks);
- -      if (!allmasks) {
- -              printk(KERN_WARNING "Cannot alloc cpumask array\n");
- -              kfree(rd);
   #ifdef CONFIG_NUMA
- -              kfree(sched_group_nodes);
- -#endif
- -              return -ENOMEM;
- -      }
- -
- -      tmpmask = (cpumask_t *)allmasks;
- -
- -
- -#ifdef CONFIG_NUMA
- -      sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+ +      sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
   #endif
   
         /*
          * Set up domains for cpus specified by the cpu_map.
          */
- -      for_each_cpu_mask_nr(i, *cpu_map) {
+ +      for_each_cpu(i, cpu_map) {
                 struct sched_domain *sd = NULL, *p;
- -              SCHED_CPUMASK_VAR(nodemask, allmasks);
   
+ +              /* FIXME: use cpumask_of_node */
                 *nodemask = node_to_cpumask(cpu_to_node(i));
                 cpus_and(*nodemask, *nodemask, *cpu_map);
   
   #ifdef CONFIG_NUMA
- -              if (cpus_weight(*cpu_map) >
- -                              SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+ +              if (cpumask_weight(cpu_map) >
+ +                              SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
                         sd = &per_cpu(allnodes_domains, i);
                         SD_INIT(sd, ALLNODES);
                         set_domain_attribute(sd, attr);
- -                      sd->span = *cpu_map;
+ +                      cpumask_copy(sched_domain_span(sd), cpu_map);
                         cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                         p = sd;
                         sd_allnodes = 1;
@@@ -7528,19 -7449,18 +7545,19 @@@
                 sd = &per_cpu(node_domains, i);
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
- -              sched_domain_node_span(cpu_to_node(i), &sd->span);
+ +              sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                 sd->parent = p;
                 if (p)
                         p->child = sd;
- -              cpus_and(sd->span, sd->span, *cpu_map);
+ +              cpumask_and(sched_domain_span(sd),
+ +                          sched_domain_span(sd), cpu_map);
   #endif
   
                 p = sd;
- -              sd = &per_cpu(phys_domains, i);
+ +              sd = &per_cpu(phys_domains, i).sd;
                 SD_INIT(sd, CPU);
                 set_domain_attribute(sd, attr);
- -              sd->span = *nodemask;
+ +              cpumask_copy(sched_domain_span(sd), nodemask);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@@ -7548,12 -7468,11 +7565,12 @@@
   
   #ifdef CONFIG_SCHED_MC
                 p = sd;
- -              sd = &per_cpu(core_domains, i);
+ +              sd = &per_cpu(core_domains, i).sd;
                 SD_INIT(sd, MC);
                 set_domain_attribute(sd, attr);
- -              sd->span = *cpu_coregroup_mask(i);
- -              cpus_and(sd->span, sd->span, *cpu_map);
+ +              *sched_domain_span(sd) = cpu_coregroup_map(i);
+ +              cpumask_and(sched_domain_span(sd),
+ +                          sched_domain_span(sd), cpu_map);
                 sd->parent = p;
                 p->child = sd;
                 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@@ -7561,11 -7480,11 +7578,11 @@@
   
   #ifdef CONFIG_SCHED_SMT
                 p = sd;
- -              sd = &per_cpu(cpu_domains, i);
+ +              sd = &per_cpu(cpu_domains, i).sd;
                 SD_INIT(sd, SIBLING);
                 set_domain_attribute(sd, attr);
- -              sd->span = per_cpu(cpu_sibling_map, i);
- -              cpus_and(sd->span, sd->span, *cpu_map);
+ +              cpumask_and(sched_domain_span(sd),
+ +                          &per_cpu(cpu_sibling_map, i), cpu_map);
                 sd->parent = p;
                 p->child = sd;
                 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@@ -7574,10 -7493,13 +7591,10 @@@
   
   #ifdef CONFIG_SCHED_SMT
         /* Set up CPU (sibling) groups */
- -      for_each_cpu_mask_nr(i, *cpu_map) {
- -              SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
- -              SCHED_CPUMASK_VAR(send_covered, allmasks);
- -
- -              *this_sibling_map = per_cpu(cpu_sibling_map, i);
- -              cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
- -              if (i != first_cpu(*this_sibling_map))
+ +      for_each_cpu(i, cpu_map) {
+ +              cpumask_and(this_sibling_map,
+ +                          &per_cpu(cpu_sibling_map, i), cpu_map);
+ +              if (i != cpumask_first(this_sibling_map))
                         continue;
   
                 init_sched_build_groups(this_sibling_map, cpu_map,
@@@ -7588,11 -7510,13 +7605,11 @@@
   
   #ifdef CONFIG_SCHED_MC
         /* Set up multi-core groups */
- -      for_each_cpu_mask_nr(i, *cpu_map) {
- -              SCHED_CPUMASK_VAR(this_core_map, allmasks);
- -              SCHED_CPUMASK_VAR(send_covered, allmasks);
- -
- -              *this_core_map = *cpu_coregroup_mask(i);
+ +      for_each_cpu(i, cpu_map) {
+ +              /* FIXME: Use cpu_coregroup_mask */
+ +              *this_core_map = cpu_coregroup_map(i);
                 cpus_and(*this_core_map, *this_core_map, *cpu_map);
- -              if (i != first_cpu(*this_core_map))
+ +              if (i != cpumask_first(this_core_map))
                         continue;
   
                 init_sched_build_groups(this_core_map, cpu_map,
@@@ -7603,10 -7527,12 +7620,10 @@@
   
         /* Set up physical groups */
         for (i = 0; i < nr_node_ids; i++) {
- -              SCHED_CPUMASK_VAR(nodemask, allmasks);
- -              SCHED_CPUMASK_VAR(send_covered, allmasks);
- -
+ +              /* FIXME: Use cpumask_of_node */
                 *nodemask = node_to_cpumask(i);
                 cpus_and(*nodemask, *nodemask, *cpu_map);
- -              if (cpus_empty(*nodemask))
+ +              if (cpumask_empty(nodemask))
                         continue;
   
                 init_sched_build_groups(nodemask, cpu_map,
@@@ -7617,6 -7543,8 +7634,6 @@@
   #ifdef CONFIG_NUMA
         /* Set up node groups */
         if (sd_allnodes) {
- -              SCHED_CPUMASK_VAR(send_covered, allmasks);
- -
                 init_sched_build_groups(cpu_map, cpu_map,
                                         &cpu_to_allnodes_group,
                                         send_covered, tmpmask);
@@@ -7625,58 -7553,58 +7642,58 @@@
         for (i = 0; i < nr_node_ids; i++) {
                 /* Set up node groups */
                 struct sched_group *sg, *prev;
- -              SCHED_CPUMASK_VAR(nodemask, allmasks);
- -              SCHED_CPUMASK_VAR(domainspan, allmasks);
- -              SCHED_CPUMASK_VAR(covered, allmasks);
                 int j;
   
+ +              /* FIXME: Use cpumask_of_node */
                 *nodemask = node_to_cpumask(i);
- -              cpus_clear(*covered);
+ +              cpumask_clear(covered);
   
                 cpus_and(*nodemask, *nodemask, *cpu_map);
- -              if (cpus_empty(*nodemask)) {
+ +              if (cpumask_empty(nodemask)) {
                         sched_group_nodes[i] = NULL;
                         continue;
                 }
   
                 sched_domain_node_span(i, domainspan);
- -              cpus_and(*domainspan, *domainspan, *cpu_map);
+ +              cpumask_and(domainspan, domainspan, cpu_map);
   
- -              sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+ +              sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ +                                GFP_KERNEL, i);
                 if (!sg) {
                         printk(KERN_WARNING "Can not alloc domain group for "
                                 "node %d\n", i);
                         goto error;
                 }
                 sched_group_nodes[i] = sg;
- -              for_each_cpu_mask_nr(j, *nodemask) {
+ +              for_each_cpu(j, nodemask) {
                         struct sched_domain *sd;
   
                         sd = &per_cpu(node_domains, j);
                         sd->groups = sg;
                 }
                 sg->__cpu_power = 0;
- -              sg->cpumask = *nodemask;
+ +              cpumask_copy(sched_group_cpus(sg), nodemask);
                 sg->next = sg;
- -              cpus_or(*covered, *covered, *nodemask);
+ +              cpumask_or(covered, covered, nodemask);
                 prev = sg;
   
                 for (j = 0; j < nr_node_ids; j++) {
- -                      SCHED_CPUMASK_VAR(notcovered, allmasks);
                         int n = (i + j) % nr_node_ids;
+ +                      /* FIXME: Use cpumask_of_node */
                         node_to_cpumask_ptr(pnodemask, n);
   
- -                      cpus_complement(*notcovered, *covered);
- -                      cpus_and(*tmpmask, *notcovered, *cpu_map);
- -                      cpus_and(*tmpmask, *tmpmask, *domainspan);
- -                      if (cpus_empty(*tmpmask))
+ +                      cpumask_complement(notcovered, covered);
+ +                      cpumask_and(tmpmask, notcovered, cpu_map);
+ +                      cpumask_and(tmpmask, tmpmask, domainspan);
+ +                      if (cpumask_empty(tmpmask))
                                 break;
   
- -                      cpus_and(*tmpmask, *tmpmask, *pnodemask);
- -                      if (cpus_empty(*tmpmask))
+ +                      cpumask_and(tmpmask, tmpmask, pnodemask);
+ +                      if (cpumask_empty(tmpmask))
                                 continue;
   
- -                      sg = kmalloc_node(sizeof(struct sched_group),
+ +                      sg = kmalloc_node(sizeof(struct sched_group) +
+ +                                        cpumask_size(),
                                           GFP_KERNEL, i);
                         if (!sg) {
                                 printk(KERN_WARNING
@@@ -7684,9 -7612,9 +7701,9 @@@
                                 goto error;
                         }
                         sg->__cpu_power = 0;
- -                      sg->cpumask = *tmpmask;
+ +                      cpumask_copy(sched_group_cpus(sg), tmpmask);
                         sg->next = prev->next;
- -                      cpus_or(*covered, *covered, *tmpmask);
+ +                      cpumask_or(covered, covered, tmpmask);
                         prev->next = sg;
                         prev = sg;
                 }
@@@ -7695,22 -7623,22 +7712,22 @@@
   
         /* Calculate CPU power for physical packages and nodes */
   #ifdef CONFIG_SCHED_SMT
- -      for_each_cpu_mask_nr(i, *cpu_map) {
- -              struct sched_domain *sd = &per_cpu(cpu_domains, i);
+ +      for_each_cpu(i, cpu_map) {
+ +              struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
   
                 init_sched_groups_power(i, sd);
         }
   #endif
   #ifdef CONFIG_SCHED_MC
- -      for_each_cpu_mask_nr(i, *cpu_map) {
- -              struct sched_domain *sd = &per_cpu(core_domains, i);
+ +      for_each_cpu(i, cpu_map) {
+ +              struct sched_domain *sd = &per_cpu(core_domains, i).sd;
   
                 init_sched_groups_power(i, sd);
         }
   #endif
   
- -      for_each_cpu_mask_nr(i, *cpu_map) {
- -              struct sched_domain *sd = &per_cpu(phys_domains, i);
+ +      for_each_cpu(i, cpu_map) {
+ +              struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
   
                 init_sched_groups_power(i, sd);
         }
@@@ -7722,78 -7650,53 +7739,78 @@@
         if (sd_allnodes) {
                 struct sched_group *sg;
   
- -              cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+ +              cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                 tmpmask);
                 init_numa_sched_groups_power(sg);
         }
   #endif
   
         /* Attach the domains */
- -      for_each_cpu_mask_nr(i, *cpu_map) {
+ +      for_each_cpu(i, cpu_map) {
                 struct sched_domain *sd;
   #ifdef CONFIG_SCHED_SMT
- -              sd = &per_cpu(cpu_domains, i);
+ +              sd = &per_cpu(cpu_domains, i).sd;
   #elif defined(CONFIG_SCHED_MC)
- -              sd = &per_cpu(core_domains, i);
+ +              sd = &per_cpu(core_domains, i).sd;
   #else
- -              sd = &per_cpu(phys_domains, i);
+ +              sd = &per_cpu(phys_domains, i).sd;
   #endif
                 cpu_attach_domain(sd, rd, i);
         }
   
- -      sched_cpumask_free(allmasks);
- -      return 0;
+ +      err = 0;
+ +
+ +free_tmpmask:
+ +      free_cpumask_var(tmpmask);
+ +free_send_covered:
+ +      free_cpumask_var(send_covered);
+ +free_this_core_map:
+ +      free_cpumask_var(this_core_map);
+ +free_this_sibling_map:
+ +      free_cpumask_var(this_sibling_map);
+ +free_nodemask:
+ +      free_cpumask_var(nodemask);
+ +free_notcovered:
+ +#ifdef CONFIG_NUMA
+ +      free_cpumask_var(notcovered);
+ +free_covered:
+ +      free_cpumask_var(covered);
+ +free_domainspan:
+ +      free_cpumask_var(domainspan);
+ +out:
+ +#endif
+ +      return err;
+ +
+ +free_sched_groups:
+ +#ifdef CONFIG_NUMA
+ +      kfree(sched_group_nodes);
+ +#endif
+ +      goto free_tmpmask;
   
   #ifdef CONFIG_NUMA
   error:
         free_sched_groups(cpu_map, tmpmask);
- -      sched_cpumask_free(allmasks);
- -      kfree(rd);
- -      return -ENOMEM;
+ +      free_rootdomain(rd);
+ +      goto free_tmpmask;
   #endif
   }
   
- -static int build_sched_domains(const cpumask_t *cpu_map)
+ +static int build_sched_domains(const struct cpumask *cpu_map)
   {
         return __build_sched_domains(cpu_map, NULL);
   }
   
- -static cpumask_t *doms_cur;   /* current sched domains */
+ +static struct cpumask *doms_cur;      /* current sched domains */
   static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
   static struct sched_domain_attr *dattr_cur;
                                 /* attributes of custom domains in 'doms_cur' */
   
   /*
    * Special case: If a kmalloc of a doms_cur partition (array of
- - * cpumask_t) fails, then fallback to a single sched domain,
- - * as determined by the single cpumask_t fallback_doms.
+ + * cpumask) fails, then fallback to a single sched domain,
+ + * as determined by the single cpumask fallback_doms.
    */
- -static cpumask_t fallback_doms;
+ +static cpumask_var_t fallback_doms;
   
   /*
    * arch_update_cpu_topology lets virtualized architectures update the
@@@ -7810,16 -7713,16 +7827,16 @@@ int __attribute__((weak)) arch_update_c
    * For now this just excludes isolated cpus, but could be used to
    * exclude other special cases in the future.
    */
- -static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ +static int arch_init_sched_domains(const struct cpumask *cpu_map)
   {
         int err;
   
         arch_update_cpu_topology();
         ndoms_cur = 1;
- -      doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ +      doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
         if (!doms_cur)
- -              doms_cur = &fallback_doms;
- -      cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+ +              doms_cur = fallback_doms;
+ +      cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
         dattr_cur = NULL;
         err = build_sched_domains(doms_cur);
         register_sched_domain_sysctl();
@@@ -7827,8 -7730,8 +7844,8 @@@
         return err;
   }
   
- -static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
- -                                     cpumask_t *tmpmask)
+ +static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+ +                                     struct cpumask *tmpmask)
   {
         free_sched_groups(cpu_map, tmpmask);
   }
@@@ -7837,16 -7740,15 +7854,16 @@@
    * Detach sched domains from a group of cpus specified in cpu_map
    * These cpus will now be attached to the NULL domain
    */
- -static void detach_destroy_domains(const cpumask_t *cpu_map)
+ +static void detach_destroy_domains(const struct cpumask *cpu_map)
   {
- -      cpumask_t tmpmask;
+ +      /* Safe because hotplug lock held. */
+ +      static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
         int i;
   
- -      for_each_cpu_mask_nr(i, *cpu_map)
+ +      for_each_cpu(i, cpu_map)
                 cpu_attach_domain(NULL, &def_root_domain, i);
         synchronize_sched();
- -      arch_destroy_sched_domains(cpu_map, &tmpmask);
+ +      arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
   }
   
   /* handle null as "default" */
@@@ -7871,7 -7773,7 +7888,7 @@@ static int dattrs_equal(struct sched_do
    * doms_new[] to the current sched domain partitioning, doms_cur[].
    * It destroys each deleted domain and builds each new domain.
    *
- - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ + * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
    * The masks don't intersect (don't overlap.) We should setup one
    * sched domain for each mask. CPUs not in any of the cpumasks will
    * not be load balanced. If the same cpumask appears both in the
@@@ -7885,14 -7787,13 +7902,14 @@@
    * the single partition 'fallback_doms', it also forces the domains
    * to be rebuilt.
    *
- - * If doms_new == NULL it will be replaced with cpu_online_map.
+ + * If doms_new == NULL it will be replaced with cpu_online_mask.
    * ndoms_new == 0 is a special case for destroying existing domains,
    * and it will not create the default domain.
    *
    * Call with hotplug lock held
    */
- -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ +/* FIXME: Change to struct cpumask *doms_new[] */
+ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                              struct sched_domain_attr *dattr_new)
   {
         int i, j, n;
@@@ -7911,7 -7812,7 +7928,7 @@@
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
                 for (j = 0; j < n && !new_topology; j++) {
- -                      if (cpus_equal(doms_cur[i], doms_new[j])
+ +                      if (cpumask_equal(&doms_cur[i], &doms_new[j])
                             && dattrs_equal(dattr_cur, i, dattr_new, j))
                                 goto match1;
                 }
@@@ -7923,15 -7824,15 +7940,15 @@@ match1
   
         if (doms_new == NULL) {
                 ndoms_cur = 0;
- -              doms_new = &fallback_doms;
- -              cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+ +              doms_new = fallback_doms;
+ +              cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
                 WARN_ON_ONCE(dattr_new);
         }
   
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < ndoms_cur && !new_topology; j++) {
- -                      if (cpus_equal(doms_new[i], doms_cur[j])
+ +                      if (cpumask_equal(&doms_new[i], &doms_cur[j])
                             && dattrs_equal(dattr_new, i, dattr_cur, j))
                                 goto match2;
                 }
@@@ -7943,7 -7844,7 +7960,7 @@@ match2
         }
   
         /* Remember the new sched domains */
- -      if (doms_cur != &fallback_doms)
+ +      if (doms_cur != fallback_doms)
                 kfree(doms_cur);
         kfree(dattr_cur);       /* kfree(NULL) is safe */
         doms_cur = doms_new;
@@@ -7972,25 -7873,14 +7989,25 @@@ int arch_reinit_sched_domains(void
   static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
   {
         int ret;
+ +      unsigned int level = 0;
   
- -      if (buf[0] != '0' && buf[0] != '1')
+ +      if (sscanf(buf, "%u", &level) != 1)
+ +              return -EINVAL;
+ +
+ +      /*
+ +       * level is unsigned and never negative, so don't check for
+ +       * level < POWERSAVINGS_BALANCE_NONE which is 0
+ +       * What happens on 0 or 1 byte write,
+ +       * need to check for count as well?
+ +       */
+ +
+ +      if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                 return -EINVAL;
   
         if (smt)
- -              sched_smt_power_savings = (buf[0] == '1');
+ +              sched_smt_power_savings = level;
         else
- -              sched_mc_power_savings = (buf[0] == '1');
+ +              sched_mc_power_savings = level;
   
         ret = arch_reinit_sched_domains();
   
@@@ -8094,9 -7984,7 +8111,9 @@@ static int update_runtime(struct notifi
   
   void __init sched_init_smp(void)
   {
- -      cpumask_t non_isolated_cpus;
+ +      cpumask_var_t non_isolated_cpus;
+ +
+ +      alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
   
   #if defined(CONFIG_NUMA)
         sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@@ -8105,10 -7993,10 +8122,10 @@@
   #endif
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
- -      arch_init_sched_domains(&cpu_online_map);
- -      cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
- -      if (cpus_empty(non_isolated_cpus))
- -              cpu_set(smp_processor_id(), non_isolated_cpus);
+ +      arch_init_sched_domains(cpu_online_mask);
+ +      cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+ +      if (cpumask_empty(non_isolated_cpus))
+ +              cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
   
@@@ -8123,13 -8011,9 +8140,13 @@@
         init_hrtick();
   
         /* Move init over to a non-isolated CPU */
- -      if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+ +      if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                 BUG();
         sched_init_granularity();
+ +      free_cpumask_var(non_isolated_cpus);
+ +
+ +      alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ +      init_sched_rt_class();
   }
   #else
   void __init sched_init_smp(void)
@@@ -8444,15 -8328,6 +8461,15 @@@ void __init sched_init(void
          */
         current->sched_class = &fair_sched_class;
   
+ +      /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+ +      alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ +#ifdef CONFIG_SMP
+ +#ifdef CONFIG_NO_HZ
+ +      alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ +#endif
+ +      alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ +#endif /* SMP */
+ +
         scheduler_running = 1;
   }
   
@@@ -9423,6 -9298,41 +9440,41 @@@ cpuacct_destroy(struct cgroup_subsys *s
         kfree(ca);
   }
   
+ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+ {
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 data;
+ 
+ #ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       data = *cpuusage;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+ #else
+       data = *cpuusage;
+ #endif
+ 
+       return data;
+ }
+ 
+ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+ {
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ 
+ #ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       *cpuusage = val;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+ #else
+       *cpuusage = val;
+ #endif
+ }
+ 
   /* return total cpu usage (in nanoseconds) of a group */
   static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
   {
@@@ -9430,17 -9340,8 +9482,8 @@@
         u64 totalcpuusage = 0;
         int i;
   
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
- 
-               /*
-                * Take rq->lock to make 64-bit addition safe on 32-bit
-                * platforms.
-                */
-               spin_lock_irq(&cpu_rq(i)->lock);
-               totalcpuusage += *cpuusage;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
+       for_each_present_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i);
   
         return totalcpuusage;
   }
@@@ -9457,23 -9358,39 +9500,39 @@@ static int cpuusage_write(struct cgrou
                 goto out;
         }
   
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+       for_each_present_cpu(i)
+               cpuacct_cpuusage_write(ca, i, 0);
   
-               spin_lock_irq(&cpu_rq(i)->lock);
-               *cpuusage = 0;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
   out:
         return err;
   }
   
+ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                  struct seq_file *m)
+ {
+       struct cpuacct *ca = cgroup_ca(cgroup);
+       u64 percpu;
+       int i;
+ 
+       for_each_present_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i);
+               seq_printf(m, "%llu ", (unsigned long long) percpu);
+       }
+       seq_printf(m, "\n");
+       return 0;
+ }
+ 
   static struct cftype files[] = {
         {
                 .name = "usage",
                 .read_u64 = cpuusage_read,
                 .write_u64 = cpuusage_write,
         },
+       {
+               .name = "usage_percpu",
+               .read_seq_string = cpuacct_percpu_seq_read,
+       },
+ 
   };
   
   static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
diff --combined kernel/sched_fair.c

index 36b5e34,5ad4440..56c0efe
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -492,6 -492,8 +492,8 @@@ static void update_curr(struct cfs_rq *
          * overflow on 32 bits):
          */
         delta_exec = (unsigned long)(now - curr->exec_start);
+       if (!delta_exec)
+               return;
   
         __update_curr(cfs_rq, curr, delta_exec);
         curr->exec_start = now;
@@@ -1017,33 -1019,16 +1019,33 @@@ static void yield_task_fair(struct rq *
    * search starts with cpus closest then further out as needed,
    * so we always favor a closer, idle cpu.
    * Domains may include CPUs that are not usable for migration,
- - * hence we need to mask them out (cpu_active_map)
+ + * hence we need to mask them out (cpu_active_mask)
    *
    * Returns the CPU we should wake onto.
    */
   #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
   static int wake_idle(int cpu, struct task_struct *p)
   {
- -      cpumask_t tmp;
         struct sched_domain *sd;
         int i;
+ +      unsigned int chosen_wakeup_cpu;
+ +      int this_cpu;
+ +
+ +      /*
+ +       * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
+ +       * are idle and this is not a kernel thread and this task's affinity
+ +       * allows it to be moved to preferred cpu, then just move!
+ +       */
+ +
+ +      this_cpu = smp_processor_id();
+ +      chosen_wakeup_cpu =
+ +              cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
+ +
+ +      if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
+ +              idle_cpu(cpu) && idle_cpu(this_cpu) &&
+ +              p->mm && !(p->flags & PF_KTHREAD) &&
+ +              cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
+ +              return chosen_wakeup_cpu;
   
         /*
          * If it is idle, then it is the best cpu to run this task.
@@@ -1061,9 -1046,10 +1063,9 @@@
                 if ((sd->flags & SD_WAKE_IDLE)
                     || ((sd->flags & SD_WAKE_IDLE_FAR)
                         && !task_hot(p, task_rq(p)->clock, sd))) {
- -                      cpus_and(tmp, sd->span, p->cpus_allowed);
- -                      cpus_and(tmp, tmp, cpu_active_map);
- -                      for_each_cpu_mask_nr(i, tmp) {
- -                              if (idle_cpu(i)) {
+ +                      for_each_cpu_and(i, sched_domain_span(sd),
+ +                                       &p->cpus_allowed) {
+ +                              if (cpu_active(i) && idle_cpu(i)) {
                                         if (i != task_cpu(p)) {
                                                 schedstat_inc(p,
                                                        se.nr_wakeups_idle);
@@@ -1256,13 -1242,13 +1258,13 @@@ static int select_task_rq_fair(struct t
          * this_cpu and prev_cpu are present in:
          */
         for_each_domain(this_cpu, sd) {
- -              if (cpu_isset(prev_cpu, sd->span)) {
+ +              if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
                         this_sd = sd;
                         break;
                 }
         }
   
- -      if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ +      if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
                 goto out;
   
         /*
@@@ -1361,12 -1347,11 +1363,11 @@@ static void check_preempt_wakeup(struc
   {
         struct task_struct *curr = rq->curr;
         struct sched_entity *se = &curr->se, *pse = &p->se;
+       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
   
-       if (unlikely(rt_prio(p->prio))) {
-               struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+       update_curr(cfs_rq);
   
-               update_rq_clock(rq);
-               update_curr(cfs_rq);
+       if (unlikely(rt_prio(p->prio))) {
                 resched_task(curr);
                 return;
         }
diff --combined kernel/sched_rt.c

index 1bbd990,51d2af3..833b6d4
--- 1/kernel/sched_rt.c
--- 2/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@ -15,7 -15,7 +15,7 @@@ static inline void rt_set_overload(stru
         if (!rq->online)
                 return;
   
- -      cpu_set(rq->cpu, rq->rd->rto_mask);
+ +      cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
         /*
          * Make sure the mask is visible before we set
          * the overload count. That is checked to determine
@@@ -34,7 -34,7 +34,7 @@@ static inline void rt_clear_overload(st
   
         /* the order here really doesn't matter */
         atomic_dec(&rq->rd->rto_count);
- -      cpu_clear(rq->cpu, rq->rd->rto_mask);
+ +      cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
   }
   
   static void update_rt_migration(struct rq *rq)
@@@ -77,7 -77,7 +77,7 @@@ static inline u64 sched_rt_period(struc
   }
   
   #define for_each_leaf_rt_rq(rt_rq, rq) \
-       list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+       list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
   
   static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
   {
@@@ -139,14 -139,14 +139,14 @@@ static int rt_se_boosted(struct sched_r
   }
   
   #ifdef CONFIG_SMP
- -static inline cpumask_t sched_rt_period_mask(void)
+ +static inline const struct cpumask *sched_rt_period_mask(void)
   {
         return cpu_rq(smp_processor_id())->rd->span;
   }
   #else
- -static inline cpumask_t sched_rt_period_mask(void)
+ +static inline const struct cpumask *sched_rt_period_mask(void)
   {
- -      return cpu_online_map;
+ +      return cpu_online_mask;
   }
   #endif
   
@@@ -212,9 -212,9 +212,9 @@@ static inline int rt_rq_throttled(struc
         return rt_rq->rt_throttled;
   }
   
- -static inline cpumask_t sched_rt_period_mask(void)
+ +static inline const struct cpumask *sched_rt_period_mask(void)
   {
- -      return cpu_online_map;
+ +      return cpu_online_mask;
   }
   
   static inline
@@@ -241,11 -241,11 +241,11 @@@ static int do_balance_runtime(struct rt
         int i, weight, more = 0;
         u64 rt_period;
   
- -      weight = cpus_weight(rd->span);
+ +      weight = cpumask_weight(rd->span);
   
         spin_lock(&rt_b->rt_runtime_lock);
         rt_period = ktime_to_ns(rt_b->rt_period);
- -      for_each_cpu_mask_nr(i, rd->span) {
+ +      for_each_cpu(i, rd->span) {
                 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                 s64 diff;
   
@@@ -324,7 -324,7 +324,7 @@@ static void __disable_runtime(struct r
                 /*
                  * Greedy reclaim, take back as much as we can.
                  */
- -              for_each_cpu_mask(i, rd->span) {
+ +              for_each_cpu(i, rd->span) {
                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                         s64 diff;
   
@@@ -429,13 -429,13 +429,13 @@@ static inline int balance_runtime(struc
   static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
   {
         int i, idle = 1;
- -      cpumask_t span;
+ +      const struct cpumask *span;
   
         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                 return 1;
   
         span = sched_rt_period_mask();
- -      for_each_cpu_mask(i, span) {
+ +      for_each_cpu(i, span) {
                 int enqueue = 0;
                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
                 struct rq *rq = rq_of_rt_rq(rt_rq);
@@@ -805,20 -805,17 +805,20 @@@ static int select_task_rq_rt(struct tas
   
   static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
   {
- -      cpumask_t mask;
+ +      cpumask_var_t mask;
   
         if (rq->curr->rt.nr_cpus_allowed == 1)
                 return;
   
- -      if (p->rt.nr_cpus_allowed != 1
- -          && cpupri_find(&rq->rd->cpupri, p, &mask))
+ +      if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
                 return;
   
- -      if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
- -              return;
+ +      if (p->rt.nr_cpus_allowed != 1
+ +          && cpupri_find(&rq->rd->cpupri, p, mask))
+ +              goto free;
+ +
+ +      if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
+ +              goto free;
   
         /*
          * There appears to be other cpus that can accept
@@@ -827,8 -824,6 +827,8 @@@
          */
         requeue_task_rt(rq, p, 1);
         resched_task(rq->curr);
+ +free:
+ +      free_cpumask_var(mask);
   }
   
   #endif /* CONFIG_SMP */
@@@ -919,7 -914,7 +919,7 @@@ static void deactivate_task(struct rq *
   static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
   {
         if (!task_running(rq, p) &&
- -          (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+ +          (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
             (p->rt.nr_cpus_allowed > 1))
                 return 1;
         return 0;
@@@ -958,7 -953,7 +958,7 @@@ static struct task_struct *pick_next_hi
         return next;
   }
   
- -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+ +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
   
   static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
   {
@@@ -978,7 -973,7 +978,7 @@@
   static int find_lowest_rq(struct task_struct *task)
   {
         struct sched_domain *sd;
- -      cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+ +      struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
         int this_cpu = smp_processor_id();
         int cpu      = task_cpu(task);
   
@@@ -993,7 -988,7 +993,7 @@@
          * I guess we might want to change cpupri_find() to ignore those
          * in the first place.
          */
- -      cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+ +      cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
   
         /*
          * At this point we have built a mask of cpus representing the
@@@ -1003,7 -998,7 +1003,7 @@@
          * We prioritize the last cpu that the task executed on since
          * it is most likely cache-hot in that location.
          */
- -      if (cpu_isset(cpu, *lowest_mask))
+ +      if (cpumask_test_cpu(cpu, lowest_mask))
                 return cpu;
   
         /*
@@@ -1018,8 -1013,7 +1018,8 @@@
                         cpumask_t domain_mask;
                         int       best_cpu;
   
- -                      cpus_and(domain_mask, sd->span, *lowest_mask);
+ +                      cpumask_and(&domain_mask, sched_domain_span(sd),
+ +                                  lowest_mask);
   
                         best_cpu = pick_optimal_cpu(this_cpu,
                                                     &domain_mask);
@@@ -1060,8 -1054,8 +1060,8 @@@ static struct rq *find_lock_lowest_rq(s
                          * Also make sure that it wasn't scheduled on its rq.
                          */
                         if (unlikely(task_rq(task) != rq ||
- -                                   !cpu_isset(lowest_rq->cpu,
- -                                              task->cpus_allowed) ||
+ +                                   !cpumask_test_cpu(lowest_rq->cpu,
+ +                                                     &task->cpus_allowed) ||
                                      task_running(rq, task) ||
                                      !task->se.on_rq)) {
   
@@@ -1182,7 -1176,7 +1182,7 @@@ static int pull_rt_task(struct rq *this
   
         next = pick_next_task_rt(this_rq);
   
- -      for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
+ +      for_each_cpu(cpu, this_rq->rd->rto_mask) {
                 if (this_cpu == cpu)
                         continue;
   
@@@ -1311,9 -1305,9 +1311,9 @@@ move_one_task_rt(struct rq *this_rq, in
   }
   
   static void set_cpus_allowed_rt(struct task_struct *p,
- -                              const cpumask_t *new_mask)
+ +                              const struct cpumask *new_mask)
   {
- -      int weight = cpus_weight(*new_mask);
+ +      int weight = cpumask_weight(new_mask);
   
         BUG_ON(!rt_task(p));
   
@@@ -1334,7 -1328,7 +1334,7 @@@
                 update_rt_migration(rq);
         }
   
- -      p->cpus_allowed    = *new_mask;
+ +      cpumask_copy(&p->cpus_allowed, new_mask);
         p->rt.nr_cpus_allowed = weight;
   }
   
@@@ -1377,14 -1371,6 +1377,14 @@@ static void switched_from_rt(struct rq 
         if (!rq->rt.rt_nr_running)
                 pull_rt_task(rq);
   }
+ +
+ +static inline void init_sched_rt_class(void)
+ +{
+ +      unsigned int i;
+ +
+ +      for_each_possible_cpu(i)
+ +              alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+ +}
   #endif /* CONFIG_SMP */
   
   /*
@@@ -1555,4 -1541,3 +1555,4 @@@ static void print_rt_stats(struct seq_f
         rcu_read_unlock();
   }
   #endif /* CONFIG_SCHED_DEBUG */
+ +
diff --combined kernel/sched_stats.h

index 5fcf0e1,b59fd9c..f2773b5
--- 1/kernel/sched_stats.h
--- 2/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@@ -31,7 -31,7 +31,7 @@@ static int show_schedstat(struct seq_fi
                     rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
                     rq->sched_switch, rq->sched_count, rq->sched_goidle,
                     rq->ttwu_count, rq->ttwu_local,
-                   rq->rq_sched_info.cpu_time,
+                   rq->rq_cpu_time,
                     rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
   
                 seq_printf(seq, "\n");
@@@ -42,8 -42,7 +42,8 @@@
                 for_each_domain(cpu, sd) {
                         enum cpu_idle_type itype;
   
- -                      cpumask_scnprintf(mask_str, mask_len, &sd->span);
+ +                      cpumask_scnprintf(mask_str, mask_len,
+ +                                        sched_domain_span(sd));
                         seq_printf(seq, "domain%d %s", dcount++, mask_str);
                         for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                         itype++) {
@@@ -124,7 -123,7 +124,7 @@@ static inline voi
   rq_sched_info_depart(struct rq *rq, unsigned long long delta)
   {
         if (rq)
-               rq->rq_sched_info.cpu_time += delta;
+               rq->rq_cpu_time += delta;
   }
   
   static inline void
@@@ -237,7 -236,6 +237,6 @@@ static inline void sched_info_depart(st
         unsigned long long delta = task_rq(t)->clock -
                                         t->sched_info.last_arrival;
   
-       t->sched_info.cpu_time += delta;
         rq_sched_info_depart(task_rq(t), delta);
   
         if (t->state == TASK_RUNNING)
diff --combined kernel/time/tick-sched.c

index 70f872c,8f3fc25..76a574b
--- 1/kernel/time/tick-sched.c
--- 2/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@@ -144,7 -144,7 +144,7 @@@ void tick_nohz_update_jiffies(void
         if (!ts->tick_stopped)
                 return;
   
- -      cpu_clear(cpu, nohz_cpu_mask);
+ +      cpumask_clear_cpu(cpu, nohz_cpu_mask);
         now = ktime_get();
         ts->idle_waketime = now;
   
@@@ -247,7 -247,7 +247,7 @@@ void tick_nohz_stop_sched_tick(int inid
         if (need_resched())
                 goto end;
   
-       if (unlikely(local_softirq_pending())) {
+       if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
                 static int ratelimit;
   
                 if (ratelimit < 10) {
@@@ -282,8 -282,31 +282,31 @@@
         /* Schedule the tick, if we are at least one jiffie off */
         if ((long)delta_jiffies >= 1) {
   
+               /*
+               * calculate the expiry time for the next timer wheel
+               * timer
+               */
+               expires = ktime_add_ns(last_update, tick_period.tv64 *
+                                  delta_jiffies);
+ 
+               /*
+                * If this cpu is the one which updates jiffies, then
+                * give up the assignment and let it be taken by the
+                * cpu which runs the tick timer next, which might be
+                * this cpu as well. If we don't drop this here the
+                * jiffies might be stale and do_timer() never
+                * invoked.
+                */
+               if (cpu == tick_do_timer_cpu)
+                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ 
                 if (delta_jiffies > 1)
- -                      cpu_set(cpu, nohz_cpu_mask);
+ +                      cpumask_set_cpu(cpu, nohz_cpu_mask);
+ 
+               /* Skip reprogram of event if its not changed */
+               if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+                       goto out;
+ 
                 /*
                  * nohz_stop_sched_tick can be called several times before
                  * the nohz_restart_sched_tick is called. This happens when
@@@ -296,7 -319,7 +319,7 @@@
                                 /*
                                  * sched tick not stopped!
                                  */
- -                              cpu_clear(cpu, nohz_cpu_mask);
+ +                              cpumask_clear_cpu(cpu, nohz_cpu_mask);
                                 goto out;
                         }
   
@@@ -306,17 -329,6 +329,6 @@@
                         rcu_enter_nohz();
                 }
   
-               /*
-                * If this cpu is the one which updates jiffies, then
-                * give up the assignment and let it be taken by the
-                * cpu which runs the tick timer next, which might be
-                * this cpu as well. If we don't drop this here the
-                * jiffies might be stale and do_timer() never
-                * invoked.
-                */
-               if (cpu == tick_do_timer_cpu)
-                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- 
                 ts->idle_sleeps++;
   
                 /*
@@@ -332,12 -344,7 +344,7 @@@
                         goto out;
                 }
   
-               /*
-                * calculate the expiry time for the next timer wheel
-                * timer
-                */
-               expires = ktime_add_ns(last_update, tick_period.tv64 *
-                                      delta_jiffies);
+               /* Mark expiries */
                 ts->idle_expires = expires;
   
                 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@@ -354,7 -361,7 +361,7 @@@
                  * softirq.
                  */
                 tick_do_update_jiffies64(ktime_get());
- -              cpu_clear(cpu, nohz_cpu_mask);
+ +              cpumask_clear_cpu(cpu, nohz_cpu_mask);
         }
         raise_softirq_irqoff(TIMER_SOFTIRQ);
   out:
@@@ -432,7 -439,7 +439,7 @@@ void tick_nohz_restart_sched_tick(void
         select_nohz_load_balancer(0);
         now = ktime_get();
         tick_do_update_jiffies64(now);
- -      cpu_clear(cpu, nohz_cpu_mask);
+ +      cpumask_clear_cpu(cpu, nohz_cpu_mask);
   
         /*
          * We stopped the tick in idle. Update process times would miss the
@@@ -681,7 -688,6 +688,6 @@@ void tick_setup_sched_timer(void
          */
         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
         ts->sched_timer.function = tick_sched_timer;
-       ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
   
         /* Get the next period (per cpu) */
         hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
author	Mike Travis <travis@sgi.com>
	Thu, 1 Jan 2009 01:34:16 +0000 (17:34 -0800)
committer	Ingo Molnar <mingo@elte.hu>
	Sat, 3 Jan 2009 17:53:31 +0000 (18:53 +0100)
		1	2
arch/ia64/include/asm/topology.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/mips/include/asm/mach-ip27/topology.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/include/asm/topology.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/sh/include/asm/topology.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/irq.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/topology.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/intel_cacheinfo.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/mcheck/mce_amd_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/genx2apic_uv_x.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/io_apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irq_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irqinit_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irqinit_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/setup_percpu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/smp.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/smpboot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/tlb_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/tlb_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/traps.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/xen/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcuclassic.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_rt.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_stats.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/time/tick-sched.c	patch \|	diff1 \|	diff2 \|	blob \| history