Merge branch 'linus' into cpus4096
author    Ingo Molnar <mingo@elte.hu>
Tue, 15 Jul 2008 22:29:07 +0000 (00:29 +0200)
committer Ingo Molnar <mingo@elte.hu>
Tue, 15 Jul 2008 22:29:07 +0000 (00:29 +0200)
Conflicts:

arch/x86/xen/smp.c
kernel/sched_rt.c
net/iucv/iucv.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
15 files changed:
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/io_apic_64.c
arch/x86/kernel/smpboot.c
arch/x86/xen/smp.c
drivers/infiniband/hw/ehca/ehca_irq.c
include/asm-x86/ipi.h
kernel/cpu.c
kernel/rcuclassic.c
kernel/rcupreempt.c
kernel/sched.c
kernel/sched_fair.c
kernel/sched_rt.c
kernel/time/tick-broadcast.c
net/core/dev.c
net/iucv/iucv.c

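The conflicts above come mostly from the cpus4096 work, which converts for_each_cpu_mask() users to for_each_cpu_mask_nr() so that iteration is bounded by nr_cpu_ids (the number of possible CPUs) rather than scanning all NR_CPUS bit positions. The toy program below is an illustration only, not the kernel's cpumask implementation; the identifiers toy_for_each_cpu_mask and toy_for_each_cpu_mask_nr are made up for this sketch.

/*
 * Toy illustration only -- not the kernel's cpumask code.  A plain
 * byte array stands in for cpumask_t, and the two macros mimic the
 * shape of for_each_cpu_mask() vs. for_each_cpu_mask_nr().
 */
#include <stdio.h>

#define NR_CPUS 4096                 /* compile-time maximum */
static int nr_cpu_ids = 8;           /* CPUs actually possible on this box */
static unsigned char cpu_mask[NR_CPUS];

/* old style: always walks all NR_CPUS positions */
#define toy_for_each_cpu_mask(cpu, mask) \
	for ((cpu) = 0; (cpu) < NR_CPUS; (cpu)++) if ((mask)[(cpu)])

/* new style: bounded by nr_cpu_ids, cheap even when NR_CPUS is 4096 */
#define toy_for_each_cpu_mask_nr(cpu, mask) \
	for ((cpu) = 0; (cpu) < nr_cpu_ids; (cpu)++) if ((mask)[(cpu)])

int main(void)
{
	int cpu;

	cpu_mask[1] = cpu_mask[3] = 1;

	/* Both loops visit the same set CPUs (1 and 3); only the
	 * number of bit positions examined differs. */
	toy_for_each_cpu_mask(cpu, cpu_mask)
		printf("unbounded iterator visits cpu %d\n", cpu);

	toy_for_each_cpu_mask_nr(cpu, cpu_mask)
		printf("nr-bounded iterator visits cpu %d\n", cpu);

	return 0;
}

In the hunks below the conversion is mechanical: for_each_cpu_mask(x, mask) becomes for_each_cpu_mask_nr(x, mask), and open-coded NR_CPUS bounds (such as the next_cpu() check in ehca_irq.c) become nr_cpu_ids checks.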
@@@ -62,6 -62,7 +62,7 @@@ static struct _cache_table cache_table[
        { 0x4b, LVL_3,      8192 },     /* 16-way set assoc, 64 byte line size */
        { 0x4c, LVL_3,     12288 },     /* 12-way set assoc, 64 byte line size */
        { 0x4d, LVL_3,     16384 },     /* 16-way set assoc, 64 byte line size */
+       { 0x4e, LVL_2,      6144 },     /* 24-way set assoc, 64 byte line size */
        { 0x60, LVL_1_DATA, 16 },       /* 8-way set assoc, sectored cache, 64 byte line size */
        { 0x66, LVL_1_DATA, 8 },        /* 4-way set assoc, sectored cache, 64 byte line size */
        { 0x67, LVL_1_DATA, 16 },       /* 4-way set assoc, sectored cache, 64 byte line size */
@@@ -488,7 -489,7 +489,7 @@@ static void __cpuinit cache_remove_shar
        int sibling;
  
        this_leaf = CPUID4_INFO_IDX(cpu, index);
 -      for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) {
 +      for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
                sibling_leaf = CPUID4_INFO_IDX(sibling, index); 
                cpu_clear(cpu, sibling_leaf->shared_cpu_map);
        }
@@@ -61,7 -61,7 +61,7 @@@ struct irq_cfg 
  };
  
  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
- struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+ static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
        [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
        [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
        [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
  
  static int assign_irq_vector(int irq, cpumask_t mask);
  
+ int first_system_vector = 0xfe;
+ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
  #define __apicdebuginit  __init
  
  int sis_apic_bug; /* not actually supported, dummy for compile */
@@@ -90,7 -94,7 +94,7 @@@ static int no_timer_check
  
  static int disable_timer_pin_1 __initdata;
  
- int timer_over_8254 __initdata = 1;
+ int timer_through_8259 __initdata;
  
  /* Where if anywhere is the i8259 connect in external int mode */
  static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@@ -104,15 -108,17 +108,17 @@@ DEFINE_SPINLOCK(vector_lock)
  int nr_ioapic_registers[MAX_IO_APICS];
  
  /* I/O APIC entries */
- struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
  int nr_ioapics;
  
  /* MP IRQ source entries */
- struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
  
  /* # of MP IRQ source entries */
  int mp_irq_entries;
  
+ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
  /*
   * Rough estimation of how many shared IRQs there are, can
   * be changed anytime.
@@@ -140,7 -146,7 +146,7 @@@ struct io_apic 
  static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
  {
        return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+               + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
  }
  
  static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@@ -183,7 -189,7 +189,7 @@@ static bool io_apic_level_ack_pending(u
                        break;
                reg = io_apic_read(entry->apic, 0x10 + pin*2);
                /* Is the remote IRR bit set? */
-               if ((reg >> 14) & 1) {
+               if (reg & IO_APIC_REDIR_REMOTE_IRR) {
                        spin_unlock_irqrestore(&ioapic_lock, flags);
                        return true;
                }
@@@ -298,7 -304,7 +304,7 @@@ static void __target_IO_APIC_irq(unsign
                        break;
                io_apic_write(apic, 0x11 + pin*2, dest);
                reg = io_apic_read(apic, 0x10 + pin*2);
-               reg &= ~0x000000ff;
+               reg &= ~IO_APIC_REDIR_VECTOR_MASK;
                reg |= vector;
                io_apic_modify(apic, reg);
                if (!entry->next)
@@@ -360,16 -366,37 +366,37 @@@ static void add_pin_to_irq(unsigned in
        entry->pin = pin;
  }
  
+ /*
+  * Reroute an IRQ to a different pin.
+  */
+ static void __init replace_pin_at_irq(unsigned int irq,
+                                     int oldapic, int oldpin,
+                                     int newapic, int newpin)
+ {
+       struct irq_pin_list *entry = irq_2_pin + irq;
+       while (1) {
+               if (entry->apic == oldapic && entry->pin == oldpin) {
+                       entry->apic = newapic;
+                       entry->pin = newpin;
+               }
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+ }
  
  #define DO_ACTION(name,R,ACTION, FINAL)                                       \
                                                                        \
        static void name##_IO_APIC_irq (unsigned int irq)               \
        __DO_ACTION(R, ACTION, FINAL)
  
- DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-                                               /* mask = 1 */
- DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-                                               /* mask = 0 */
+ /* mask = 1 */
+ DO_ACTION(__mask,     0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+ /* mask = 0 */
+ DO_ACTION(__unmask,   0, &= ~IO_APIC_REDIR_MASKED, )
  
  static void mask_IO_APIC_irq (unsigned int irq)
  {
@@@ -430,20 -457,6 +457,6 @@@ static int __init disable_timer_pin_set
  }
  __setup("disable_timer_pin_1", disable_timer_pin_setup);
  
- static int __init setup_disable_8254_timer(char *s)
- {
-       timer_over_8254 = -1;
-       return 1;
- }
- static int __init setup_enable_8254_timer(char *s)
- {
-       timer_over_8254 = 2;
-       return 1;
- }
- __setup("disable_8254_timer", setup_disable_8254_timer);
- __setup("enable_8254_timer", setup_enable_8254_timer);
  
  /*
   * Find the IRQ entry number of a certain pin.
@@@ -453,10 -466,10 +466,10 @@@ static int find_irq_entry(int apic, in
        int i;
  
        for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mpc_irqtype == type &&
-                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-                   mp_irqs[i].mpc_dstirq == pin)
+               if (mp_irqs[i].mp_irqtype == type &&
+                   (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+                    mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+                   mp_irqs[i].mp_dstirq == pin)
                        return i;
  
        return -1;
@@@ -470,13 -483,13 +483,13 @@@ static int __init find_isa_irq_pin(int 
        int i;
  
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
+               int lbus = mp_irqs[i].mp_srcbus;
  
                if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
+                   (mp_irqs[i].mp_irqtype == type) &&
+                   (mp_irqs[i].mp_srcbusirq == irq))
  
-                       return mp_irqs[i].mpc_dstirq;
+                       return mp_irqs[i].mp_dstirq;
        }
        return -1;
  }
@@@ -486,17 -499,17 +499,17 @@@ static int __init find_isa_irq_apic(in
        int i;
  
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
+               int lbus = mp_irqs[i].mp_srcbus;
  
                if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
+                   (mp_irqs[i].mp_irqtype == type) &&
+                   (mp_irqs[i].mp_srcbusirq == irq))
                        break;
        }
        if (i < mp_irq_entries) {
                int apic;
                for(apic = 0; apic < nr_ioapics; apic++) {
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+                       if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
                                return apic;
                }
        }
@@@ -516,28 -529,28 +529,28 @@@ int IO_APIC_get_PCI_irq_vector(int bus
  
        apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
                bus, slot, pin);
-       if (mp_bus_id_to_pci_bus[bus] == -1) {
+       if (test_bit(bus, mp_bus_not_pci)) {
                apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
                return -1;
        }
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
+               int lbus = mp_irqs[i].mp_srcbus;
  
                for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+                       if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+                           mp_irqs[i].mp_dstapic == MP_APIC_ALL)
                                break;
  
                if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].mpc_irqtype &&
+                   !mp_irqs[i].mp_irqtype &&
                    (bus == lbus) &&
-                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+                   (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
  
                        if (!(apic || IO_APIC_IRQ(irq)))
                                continue;
  
-                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+                       if (pin == (mp_irqs[i].mp_srcbusirq & 3))
                                return irq;
                        /*
                         * Use the first all-but-pin matching entry as a
  
  static int MPBIOS_polarity(int idx)
  {
-       int bus = mp_irqs[idx].mpc_srcbus;
+       int bus = mp_irqs[idx].mp_srcbus;
        int polarity;
  
        /*
         * Determine IRQ line polarity (high active or low active):
         */
-       switch (mp_irqs[idx].mpc_irqflag & 3)
+       switch (mp_irqs[idx].mp_irqflag & 3)
        {
                case 0: /* conforms, ie. bus-type dependent polarity */
                        if (test_bit(bus, mp_bus_not_pci))
  
  static int MPBIOS_trigger(int idx)
  {
-       int bus = mp_irqs[idx].mpc_srcbus;
+       int bus = mp_irqs[idx].mp_srcbus;
        int trigger;
  
        /*
         * Determine IRQ trigger mode (edge or level sensitive):
         */
-       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+       switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
        {
                case 0: /* conforms, ie. bus-type dependent */
                        if (test_bit(bus, mp_bus_not_pci))
@@@ -660,16 -673,16 +673,16 @@@ static inline int irq_trigger(int idx
  static int pin_2_irq(int idx, int apic, int pin)
  {
        int irq, i;
-       int bus = mp_irqs[idx].mpc_srcbus;
+       int bus = mp_irqs[idx].mp_srcbus;
  
        /*
         * Debugging check, we are in big trouble if this message pops up!
         */
-       if (mp_irqs[idx].mpc_dstirq != pin)
+       if (mp_irqs[idx].mp_dstirq != pin)
                printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
  
        if (test_bit(bus, mp_bus_not_pci)) {
-               irq = mp_irqs[idx].mpc_srcbusirq;
+               irq = mp_irqs[idx].mp_srcbusirq;
        } else {
                /*
                 * PCI IRQs are mapped in order
@@@ -718,7 -731,7 +731,7 @@@ static int __assign_irq_vector(int irq
                        return 0;
        }
  
 -      for_each_cpu_mask(cpu, mask) {
 +      for_each_cpu_mask_nr(cpu, mask) {
                cpumask_t domain, new_mask;
                int new_cpu;
                int vector, offset;
                offset = current_offset;
  next:
                vector += 8;
-               if (vector >= FIRST_SYSTEM_VECTOR) {
+               if (vector >= first_system_vector) {
                        /* If we run out of vectors on large boxen, must share them. */
                        offset = (offset + 1) % 8;
                        vector = FIRST_DEVICE_VECTOR + offset;
                        continue;
                if (vector == IA32_SYSCALL_VECTOR)
                        goto next;
 -              for_each_cpu_mask(new_cpu, new_mask)
 +              for_each_cpu_mask_nr(new_cpu, new_mask)
                        if (per_cpu(vector_irq, new_cpu)[vector] != -1)
                                goto next;
                /* Found one! */
                        cfg->move_in_progress = 1;
                        cfg->old_domain = cfg->domain;
                }
 -              for_each_cpu_mask(new_cpu, new_mask)
 +              for_each_cpu_mask_nr(new_cpu, new_mask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq;
                cfg->vector = vector;
                cfg->domain = domain;
@@@ -781,14 -794,14 +794,14 @@@ static void __clear_irq_vector(int irq
  
        vector = cfg->vector;
        cpus_and(mask, cfg->domain, cpu_online_map);
 -      for_each_cpu_mask(cpu, mask)
 +      for_each_cpu_mask_nr(cpu, mask)
                per_cpu(vector_irq, cpu)[vector] = -1;
  
        cfg->vector = 0;
        cpus_clear(cfg->domain);
  }
  
- void __setup_vector_irq(int cpu)
static void __setup_vector_irq(int cpu)
  {
        /* Initialize vector_irq on a new cpu */
        /* This function must be called with vector_lock held */
        }
  }
  
+ void setup_vector_irq(int cpu)
+ {
+       spin_lock(&vector_lock);
+       __setup_vector_irq(smp_processor_id());
+       spin_unlock(&vector_lock);
+ }
  
  static struct irq_chip ioapic_chip;
  
@@@ -846,7 -866,7 +866,7 @@@ static void setup_IO_APIC_irq(int apic
        apic_printk(APIC_VERBOSE,KERN_DEBUG
                    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
                    "IRQ %d Mode:%i Active:%i)\n",
-                   apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+                   apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
                    irq, trigger, polarity);
  
        /*
@@@ -887,10 -907,10 +907,10 @@@ static void __init setup_IO_APIC_irqs(v
                idx = find_irq_entry(apic,pin,mp_INT);
                if (idx == -1) {
                        if (first_notcon) {
-                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
                                first_notcon = 0;
                        } else
-                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
                        continue;
                }
                if (!first_notcon) {
  }
  
  /*
-  * Set up the 8259A-master output pin as broadcast to all
-  * CPUs.
+  * Set up the timer pin, possibly with the 8259A-master behind.
   */
- static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+                                       int vector)
  {
        struct IO_APIC_route_entry entry;
  
        memset(&entry, 0, sizeof(entry));
  
-       disable_8259A_irq(0);
-       /* mask LVT0 */
-       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        /*
         * We use logical delivery to get the timer IRQ
         * to the first CPU.
         */
        entry.dest_mode = INT_DEST_MODE;
-       entry.mask = 0;                                 /* unmask IRQ now */
+       entry.mask = 1;                                 /* mask IRQ now */
        entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
        entry.delivery_mode = INT_DELIVERY_MODE;
        entry.polarity = 0;
  
        /*
         * The timer IRQ doesn't have to know that behind the
-        * scene we have a 8259A-master in AEOI mode ...
+        * scene we may have a 8259A-master in AEOI mode ...
         */
        set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
  
         * Add it to the IO-APIC irq-routing table:
         */
        ioapic_write_entry(apic, pin, entry);
-       enable_8259A_irq(0);
  }
  
  void __apicdebuginit print_IO_APIC(void)
        printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
        for (i = 0; i < nr_ioapics; i++)
                printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+                      mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
  
        /*
         * We are a bit conservative about what we expect.  We have to
        spin_unlock_irqrestore(&ioapic_lock, flags);
  
        printk("\n");
-       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
        printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
        printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
  
@@@ -1077,6 -1090,7 +1090,7 @@@ void __apicdebuginit print_local_APIC(v
  
        printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
                smp_processor_id(), hard_smp_processor_id());
+       v = apic_read(APIC_ID);
        printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
        v = apic_read(APIC_LVR);
        printk(KERN_INFO "... APIC VERSION: %08x\n", v);
  
  void print_all_local_APICs (void)
  {
-       on_each_cpu(print_local_APIC, NULL, 1, 1);
+       on_each_cpu(print_local_APIC, NULL, 1);
  }
  
  void __apicdebuginit print_PIC(void)
@@@ -1540,7 -1554,7 +1554,7 @@@ static inline void init_IO_APIC_traps(v
        }
  }
  
- static void enable_lapic_irq (unsigned int irq)
+ static void unmask_lapic_irq(unsigned int irq)
  {
        unsigned long v;
  
        apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
  }
  
- static void disable_lapic_irq (unsigned int irq)
+ static void mask_lapic_irq(unsigned int irq)
  {
        unsigned long v;
  
@@@ -1561,19 -1575,20 +1575,20 @@@ static void ack_lapic_irq (unsigned in
        ack_APIC_irq();
  }
  
- static void end_lapic_irq (unsigned int i) { /* nothing */ }
- static struct hw_interrupt_type lapic_irq_type __read_mostly = {
-       .name = "local-APIC",
-       .typename = "local-APIC-edge",
-       .startup = NULL, /* startup_irq() not used for IRQ0 */
-       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
-       .enable = enable_lapic_irq,
-       .disable = disable_lapic_irq,
-       .ack = ack_lapic_irq,
-       .end = end_lapic_irq,
+ static struct irq_chip lapic_chip __read_mostly = {
+       .name           = "local-APIC",
+       .mask           = mask_lapic_irq,
+       .unmask         = unmask_lapic_irq,
+       .ack            = ack_lapic_irq,
  };
  
+ static void lapic_register_intr(int irq)
+ {
+       irq_desc[irq].status &= ~IRQ_LEVEL;
+       set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+                                     "edge");
+ }
  static void __init setup_nmi(void)
  {
        /*
@@@ -1659,6 -1674,7 +1674,7 @@@ static inline void __init check_timer(v
        struct irq_cfg *cfg = irq_cfg + 0;
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
+       int no_pin1 = 0;
  
        local_irq_save(flags);
  
        assign_irq_vector(0, TARGET_CPUS);
  
        /*
-        * Subtle, code in do_timer_interrupt() expects an AEOI
-        * mode for the 8259A whenever interrupts are routed
-        * through I/O APICs.  Also IRQ0 has to be enabled in
-        * the 8259A which implies the virtual wire has to be
-        * disabled in the local APIC.
+        * As IRQ0 is to be enabled in the 8259A, the virtual
+        * wire has to be disabled in the local APIC.
         */
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        init_8259A(1);
-       if (timer_over_8254 > 0)
-               enable_8259A_irq(0);
  
        pin1  = find_isa_irq_pin(0, mp_INT);
        apic1 = find_isa_irq_apic(0, mp_INT);
        apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
                cfg->vector, apic1, pin1, apic2, pin2);
  
+       /*
+        * Some BIOS writers are clueless and report the ExtINTA
+        * I/O APIC input from the cascaded 8259A as the timer
+        * interrupt input.  So just in case, if only one pin
+        * was found above, try it both directly and through the
+        * 8259A.
+        */
+       if (pin1 == -1) {
+               pin1 = pin2;
+               apic1 = apic2;
+               no_pin1 = 1;
+       } else if (pin2 == -1) {
+               pin2 = pin1;
+               apic2 = apic1;
+       }
        if (pin1 != -1) {
                /*
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
+               if (no_pin1) {
+                       add_pin_to_irq(0, apic1, pin1);
+                       setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+               }
                unmask_IO_APIC_irq(0);
                if (!no_timer_check && timer_irq_works()) {
-                       nmi_watchdog_default();
                        if (nmi_watchdog == NMI_IO_APIC) {
-                               disable_8259A_irq(0);
                                setup_nmi();
                                enable_8259A_irq(0);
                        }
                        goto out;
                }
                clear_IO_APIC_pin(apic1, pin1);
-               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-                               "connected to IO-APIC\n");
-       }
+               if (!no_pin1)
+                       apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+                                   "8254 timer not connected to IO-APIC\n");
  
-       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-                               "through the 8259A ... ");
-       if (pin2 != -1) {
+               apic_printk(APIC_VERBOSE,KERN_INFO
+                       "...trying to set up timer (IRQ0) "
+                       "through the 8259A ... ");
                apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
                        apic2, pin2);
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-               setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+               setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+               unmask_IO_APIC_irq(0);
+               enable_8259A_irq(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_VERBOSE," works.\n");
-                       nmi_watchdog_default();
+                       timer_through_8259 = 1;
                        if (nmi_watchdog == NMI_IO_APIC) {
+                               disable_8259A_irq(0);
                                setup_nmi();
+                               enable_8259A_irq(0);
                        }
                        goto out;
                }
                /*
                 * Cleanup, just in case ...
                 */
+               disable_8259A_irq(0);
                clear_IO_APIC_pin(apic2, pin2);
+               apic_printk(APIC_VERBOSE," failed.\n");
        }
-       apic_printk(APIC_VERBOSE," failed.\n");
  
        if (nmi_watchdog == NMI_IO_APIC) {
                printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-               nmi_watchdog = 0;
+               nmi_watchdog = NMI_NONE;
        }
  
        apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
  
-       disable_8259A_irq(0);
-       irq_desc[0].chip = &lapic_irq_type;
+       lapic_register_intr(0);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
  
                apic_printk(APIC_VERBOSE," works.\n");
                goto out;
        }
+       disable_8259A_irq(0);
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
        apic_printk(APIC_VERBOSE," failed.\n");
  
@@@ -1778,11 -1813,21 +1813,21 @@@ static int __init notimercheck(char *s
  __setup("no_timer_check", notimercheck);
  
  /*
-  *
-  * IRQs that are handled by the PIC in the MPS IOAPIC case.
-  * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
-  *   Linux doesn't really care, as it's not actually used
-  *   for any interrupt handling anyway.
+  * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+  * to devices.  However there may be an I/O APIC pin available for
+  * this interrupt regardless.  The pin may be left unconnected, but
+  * typically it will be reused as an ExtINT cascade interrupt for
+  * the master 8259A.  In the MPS case such a pin will normally be
+  * reported as an ExtINT interrupt in the MP table.  With ACPI
+  * there is no provision for ExtINT interrupts, and in the absence
+  * of an override it would be treated as an ordinary ISA I/O APIC
+  * interrupt, that is edge-triggered and unmasked by default.  We
+  * used to do this, but it caused problems on some systems because
+  * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+  * the same ExtINT cascade interrupt to drive the local APIC of the
+  * bootstrap processor.  Therefore we refrain from routing IRQ2 to
+  * the I/O APIC in all cases now.  No actual device should request
+  * it anyway.  --macro
   */
  #define PIC_IRQS      (1<<2)
  
@@@ -1793,10 -1838,7 +1838,7 @@@ void __init setup_IO_APIC(void
         * calling enable_IO_APIC() is moved to setup_local_APIC for BP
         */
  
-       if (acpi_ioapic)
-               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
-       else
-               io_apic_irqs = ~PIC_IRQS;
+       io_apic_irqs = ~PIC_IRQS;
  
        apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
  
@@@ -1841,8 -1883,8 +1883,8 @@@ static int ioapic_resume(struct sys_dev
  
        spin_lock_irqsave(&ioapic_lock, flags);
        reg_00.raw = io_apic_read(dev->id, 0);
-       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+       if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+               reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
                io_apic_write(dev->id, 0, reg_00.raw);
        }
        spin_unlock_irqrestore(&ioapic_lock, flags);
@@@ -2242,8 -2284,8 +2284,8 @@@ int acpi_get_override_irq(int bus_irq, 
                return -1;
  
        for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mpc_irqtype == mp_INT &&
-                   mp_irqs[i].mpc_srcbusirq == bus_irq)
+               if (mp_irqs[i].mp_irqtype == mp_INT &&
+                   mp_irqs[i].mp_srcbusirq == bus_irq)
                        break;
        if (i >= mp_irq_entries)
                return -1;
@@@ -2336,7 -2378,7 +2378,7 @@@ void __init ioapic_init_mappings(void
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
-                       ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+                       ioapic_phys = mp_ioapics[i].mp_apicaddr;
                } else {
                        ioapic_phys = (unsigned long)
                                alloc_bootmem_pages(PAGE_SIZE);
@@@ -59,7 -59,6 +59,6 @@@
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
  #include <asm/mtrr.h>
- #include <asm/nmi.h>
  #include <asm/vmi.h>
  #include <asm/genapic.h>
  #include <linux/mc146818rtc.h>
  #include <mach_wakecpu.h>
  #include <smpboot_hooks.h>
  
- /*
-  * FIXME: For x86_64, those are defined in other files. But moving them here,
-  * would make the setup areas dependent on smp, which is a loss. When we
-  * integrate apic between arches, we can probably do a better job, but
-  * right now, they'll stay here -- glommer
-  */
- /* which logical CPU number maps to which CPU (physical APIC ID) */
- u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
-                       { [0 ... NR_CPUS-1] = BAD_APICID };
- void *x86_cpu_to_apicid_early_ptr;
- u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
-                               = { [0 ... NR_CPUS-1] = BAD_APICID };
- void *x86_bios_cpu_apicid_early_ptr;
  #ifdef CONFIG_X86_32
  u8 apicid_2_node[MAX_APICID];
  static int low_mappings;
@@@ -198,13 -181,12 +181,12 @@@ static void map_cpu_to_logical_apicid(v
        map_cpu_to_node(cpu, node);
  }
  
static void unmap_cpu_to_logical_apicid(int cpu)
void numa_remove_cpu(int cpu)
  {
        cpu_2_logical_apicid[cpu] = BAD_APICID;
        unmap_cpu_to_node(cpu);
  }
  #else
- #define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
  #define map_cpu_to_logical_apicid()  do {} while (0)
  #endif
  
@@@ -345,19 -327,12 +327,12 @@@ static void __cpuinit start_secondary(v
         * lock helps us to not include this cpu in a currently in progress
         * smp_call_function().
         */
-       lock_ipi_call_lock();
- #ifdef CONFIG_X86_64
-       spin_lock(&vector_lock);
-       /* Setup the per cpu irq handling data structures */
-       __setup_vector_irq(smp_processor_id());
-       /*
-        * Allow the master to continue.
-        */
-       spin_unlock(&vector_lock);
+       ipi_call_lock_irq();
+ #ifdef CONFIG_X86_IO_APIC
+       setup_vector_irq(smp_processor_id());
  #endif
        cpu_set(smp_processor_id(), cpu_online_map);
-       unlock_ipi_call_lock();
+       ipi_call_unlock_irq();
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
  
        setup_secondary_clock();
        cpu_idle();
  }
  
- #ifdef CONFIG_X86_32
- /*
-  * Everything has been set up for the secondary
-  * CPUs - they just need to reload everything
-  * from the task structure
-  * This function must not return.
-  */
- void __devinit initialize_secondary(void)
- {
-       /*
-        * We don't actually need to load the full TSS,
-        * basically just the stack pointer and the ip.
-        */
-       asm volatile(
-               "movl %0,%%esp\n\t"
-               "jmp *%1"
-               :
-               :"m" (current->thread.sp), "m" (current->thread.ip));
- }
- #endif
  static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
  {
- #ifdef CONFIG_X86_32
        /*
         * Mask B, Pentium, but not Pentium MMX
         */
  
  valid_k7:
        ;
- #endif
  }
  
  static void __cpuinit smp_checks(void)
@@@ -487,7 -438,7 +438,7 @@@ void __cpuinit set_cpu_sibling_map(int 
        cpu_set(cpu, cpu_sibling_setup_map);
  
        if (smp_num_siblings > 1) {
 -              for_each_cpu_mask(i, cpu_sibling_setup_map) {
 +              for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
                        if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
                            c->cpu_core_id == cpu_data(i).cpu_core_id) {
                                cpu_set(i, per_cpu(cpu_sibling_map, cpu));
                return;
        }
  
 -      for_each_cpu_mask(i, cpu_sibling_setup_map) {
 +      for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
                if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
                    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
                        cpu_set(i, c->llc_shared_map);
@@@ -555,23 -506,6 +506,6 @@@ cpumask_t cpu_coregroup_map(int cpu
                return c->llc_shared_map;
  }
  
- #ifdef CONFIG_X86_32
- /*
-  * We are called very early to get the low memory for the
-  * SMP bootup trampoline page.
-  */
- void __init smp_alloc_memory(void)
- {
-       trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-       /*
-        * Has to be in very low memory so we can execute
-        * real-mode AP code.
-        */
-       if (__pa(trampoline_base) >= 0x9F000)
-               BUG();
- }
- #endif
  static void impress_friends(void)
  {
        int cpu;
@@@ -748,11 -682,7 +682,7 @@@ wakeup_secondary_cpu(int phys_apicid, u
         * target processor state.
         */
        startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- #ifdef CONFIG_X86_64
-                        (unsigned long)init_rsp);
- #else
                         (unsigned long)stack_start.sp);
- #endif
  
        /*
         * Run STARTUP IPI loop.
@@@ -832,6 -762,45 +762,45 @@@ static void __cpuinit do_fork_idle(stru
        complete(&c_idle->done);
  }
  
+ #ifdef CONFIG_X86_64
+ /*
+  * Allocate node local memory for the AP pda.
+  *
+  * Must be called after the _cpu_pda pointer table is initialized.
+  */
+ static int __cpuinit get_local_pda(int cpu)
+ {
+       struct x8664_pda *oldpda, *newpda;
+       unsigned long size = sizeof(struct x8664_pda);
+       int node = cpu_to_node(cpu);
+       if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+               return 0;
+       oldpda = cpu_pda(cpu);
+       newpda = kmalloc_node(size, GFP_ATOMIC, node);
+       if (!newpda) {
+               printk(KERN_ERR "Could not allocate node local PDA "
+                       "for CPU %d on node %d\n", cpu, node);
+               if (oldpda)
+                       return 0;       /* have a usable pda */
+               else
+                       return -1;
+       }
+       if (oldpda) {
+               memcpy(newpda, oldpda, size);
+               if (!after_bootmem)
+                       free_bootmem((unsigned long)oldpda, size);
+       }
+       newpda->in_bootmem = 0;
+       cpu_pda(cpu) = newpda;
+       return 0;
+ }
+ #endif /* CONFIG_X86_64 */
  static int __cpuinit do_boot_cpu(int apicid, int cpu)
  /*
   * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
                .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
        };
        INIT_WORK(&c_idle.work, do_fork_idle);
- #ifdef CONFIG_X86_64
-       /* allocate memory for gdts of secondary cpus. Hotplug is considered */
-       if (!cpu_gdt_descr[cpu].address &&
-               !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-               printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
-               return -1;
-       }
  
+ #ifdef CONFIG_X86_64
        /* Allocate node local memory for AP pdas */
-       if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-               struct x8664_pda *newpda, *pda;
-               int node = cpu_to_node(cpu);
-               pda = cpu_pda(cpu);
-               newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
-                                     node);
-               if (newpda) {
-                       memcpy(newpda, pda, sizeof(struct x8664_pda));
-                       cpu_pda(cpu) = newpda;
-               } else
-                       printk(KERN_ERR
-               "Could not allocate node local PDA for CPU %d on node %d\n",
-                               cpu, node);
+       if (cpu > 0) {
+               boot_error = get_local_pda(cpu);
+               if (boot_error)
+                       goto restore_state;
+                       /* if can't get pda memory, can't start cpu */
        }
  #endif
  
@@@ -905,18 -860,15 +860,15 @@@ do_rest
  #ifdef CONFIG_X86_32
        per_cpu(current_task, cpu) = c_idle.idle;
        init_gdt(cpu);
-       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
-       c_idle.idle->thread.ip = (unsigned long) start_secondary;
        /* Stack for startup_32 can be just as for start_secondary onwards */
-       stack_start.sp = (void *) c_idle.idle->thread.sp;
        irq_ctx_init(cpu);
  #else
        cpu_pda(cpu)->pcurrent = c_idle.idle;
-       init_rsp = c_idle.idle->thread.sp;
-       load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
-       initial_code = (unsigned long)start_secondary;
        clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
  #endif
+       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+       initial_code = (unsigned long)start_secondary;
+       stack_start.sp = (void *) c_idle.idle->thread.sp;
  
        /* start_ip had better be page-aligned! */
        start_ip = setup_trampoline();
                                inquire_remote_apic(apicid);
                }
        }
-       if (boot_error) {
-               /* Try to put things back the way they were before ... */
-               unmap_cpu_to_logical_apicid(cpu);
  #ifdef CONFIG_X86_64
-               clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+ restore_state:
  #endif
+       if (boot_error) {
+               /* Try to put things back the way they were before ... */
+               numa_remove_cpu(cpu); /* was set by numa_add_cpu */
                cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
                cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
                cpu_clear(cpu, cpu_present_map);
@@@ -1087,14 -1038,12 +1038,12 @@@ static __init void disable_smp(void
  {
        cpu_present_map = cpumask_of_cpu(0);
        cpu_possible_map = cpumask_of_cpu(0);
- #ifdef CONFIG_X86_32
        smpboot_clear_io_apic_irqs();
- #endif
        if (smp_found_config)
-               phys_cpu_present_map =
-                               physid_mask_of_physid(boot_cpu_physical_apicid);
+               physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
        else
-               phys_cpu_present_map = physid_mask_of_physid(0);
+               physid_set_mask_of_physid(0, &phys_cpu_present_map);
        map_cpu_to_logical_apicid();
        cpu_set(0, per_cpu(cpu_sibling_map, 0));
        cpu_set(0, per_cpu(cpu_core_map, 0));
@@@ -1157,12 -1106,12 +1106,12 @@@ static int __init smp_sanity_check(unsi
         * If SMP should be disabled, then really disable it!
         */
        if (!max_cpus) {
-               printk(KERN_INFO "SMP mode deactivated,"
-                                "forcing use of dummy APIC emulation.\n");
+               printk(KERN_INFO "SMP mode deactivated.\n");
                smpboot_clear_io_apic();
- #ifdef CONFIG_X86_32
+               localise_nmi_watchdog();
                connect_bsp_APIC();
- #endif
                setup_local_APIC();
                end_local_APIC_setup();
                return -1;
@@@ -1190,7 -1139,6 +1139,6 @@@ static void __init smp_cpu_index_defaul
  void __init native_smp_prepare_cpus(unsigned int max_cpus)
  {
        preempt_disable();
-       nmi_watchdog_default();
        smp_cpu_index_default();
        current_cpu_data = boot_cpu_data;
        cpu_callin_map = cpumask_of_cpu(0);
        }
        preempt_enable();
  
- #ifdef CONFIG_X86_32
        connect_bsp_APIC();
- #endif
        /*
         * Switch from PIC to APIC mode.
         */
@@@ -1257,8 -1204,8 +1204,8 @@@ void __init native_smp_prepare_boot_cpu
        int me = smp_processor_id();
  #ifdef CONFIG_X86_32
        init_gdt(me);
-       switch_to_new_gdt();
  #endif
+       switch_to_new_gdt();
        /* already set me in cpu_online_map in boot_cpu_init() */
        cpu_set(me, cpu_callout_map);
        per_cpu(cpu_state, me) = CPU_ONLINE;
@@@ -1278,29 -1225,12 +1225,12 @@@ void __init native_smp_cpus_done(unsign
  
  #ifdef CONFIG_HOTPLUG_CPU
  
- #  ifdef CONFIG_X86_32
- void cpu_exit_clear(void)
- {
-       int cpu = raw_smp_processor_id();
-       idle_task_exit();
-       cpu_uninit();
-       irq_ctx_exit(cpu);
-       cpu_clear(cpu, cpu_callout_map);
-       cpu_clear(cpu, cpu_callin_map);
-       unmap_cpu_to_logical_apicid(cpu);
- }
- #  endif /* CONFIG_X86_32 */
  static void remove_siblinginfo(int cpu)
  {
        int sibling;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
  
 -      for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
 +      for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
                cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
                /*/
                 * last thread sibling in this cpu core going down
                        cpu_data(sibling).booted_cores--;
        }
  
 -      for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
 +      for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
                cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
        cpus_clear(per_cpu(cpu_sibling_map, cpu));
        cpus_clear(per_cpu(cpu_core_map, cpu));
@@@ -1348,12 -1278,20 +1278,20 @@@ __init void prefill_possible_map(void
        int i;
        int possible;
  
+       /* no processor from mptable or madt */
+       if (!num_processors)
+               num_processors = 1;
+ #ifdef CONFIG_HOTPLUG_CPU
        if (additional_cpus == -1) {
                if (disabled_cpus > 0)
                        additional_cpus = disabled_cpus;
                else
                        additional_cpus = 0;
        }
+ #else
+       additional_cpus = 0;
+ #endif
        possible = num_processors + additional_cpus;
        if (possible > NR_CPUS)
                possible = NR_CPUS;
  
        for (i = 0; i < possible; i++)
                cpu_set(i, cpu_possible_map);
+       nr_cpu_ids = possible;
  }
  
  static void __ref remove_cpu_from_maps(int cpu)
  {
        cpu_clear(cpu, cpu_online_map);
- #ifdef CONFIG_X86_64
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
        /* was set by cpu_init() */
        clear_bit(cpu, (unsigned long *)&cpu_initialized);
-       clear_node_cpumask(cpu);
- #endif
+       numa_remove_cpu(cpu);
  }
  
  int __cpu_disable(void)
diff --combined arch/x86/xen/smp.c
  #include "xen-ops.h"
  #include "mmu.h"
  
- static cpumask_t xen_cpu_initialized_map;
- static DEFINE_PER_CPU(int, resched_irq) = -1;
- static DEFINE_PER_CPU(int, callfunc_irq) = -1;
- static DEFINE_PER_CPU(int, debug_irq) = -1;
- /*
-  * Structure and data for smp_call_function(). This is designed to minimise
-  * static memory requirements. It also looks cleaner.
-  */
- static DEFINE_SPINLOCK(call_lock);
+ cpumask_t xen_cpu_initialized_map;
  
- struct call_data_struct {
-       void (*func) (void *info);
-       void *info;
-       atomic_t started;
-       atomic_t finished;
-       int wait;
- };
+ static DEFINE_PER_CPU(int, resched_irq);
+ static DEFINE_PER_CPU(int, callfunc_irq);
+ static DEFINE_PER_CPU(int, callfuncsingle_irq);
+ static DEFINE_PER_CPU(int, debug_irq) = -1;
  
  static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
- static struct call_data_struct *call_data;
+ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
  
  /*
   * Reschedule call back. Nothing to do,
   */
  static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
  {
+ #ifdef CONFIG_X86_32
+       __get_cpu_var(irq_stat).irq_resched_count++;
+ #else
+       add_pda(irq_resched_count, 1);
+ #endif
        return IRQ_HANDLED;
  }
  
@@@ -122,6 -115,17 +115,17 @@@ static int xen_smp_intr_init(unsigned i
                goto fail;
        per_cpu(debug_irq, cpu) = rc;
  
+       callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+       rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
+                                   cpu,
+                                   xen_call_function_single_interrupt,
+                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   callfunc_name,
+                                   NULL);
+       if (rc < 0)
+               goto fail;
+       per_cpu(callfuncsingle_irq, cpu) = rc;
        return 0;
  
   fail:
                unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
        if (per_cpu(debug_irq, cpu) >= 0)
                unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+       if (per_cpu(callfuncsingle_irq, cpu) >= 0)
+               unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
        return rc;
  }
  
@@@ -330,7 -337,7 +337,7 @@@ static void stop_self(void *v
  
  void xen_smp_send_stop(void)
  {
-       smp_call_function(stop_self, NULL, 0, 0);
+       smp_call_function(stop_self, NULL, 0);
  }
  
  void xen_smp_send_reschedule(int cpu)
        xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
  }
  
  static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
  {
        unsigned cpu;
  
        cpus_and(mask, mask, cpu_online_map);
  
 -      for_each_cpu_mask(cpu, mask)
 +      for_each_cpu_mask_nr(cpu, mask)
                xen_send_IPI_one(cpu, vector);
  }
  
 -      for_each_cpu_mask(cpu, mask) {
+ void xen_smp_send_call_function_ipi(cpumask_t mask)
+ {
+       int cpu;
+       xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+       /* Make sure other vcpus get a chance to run if they need to. */
++      for_each_cpu_mask_nr(cpu, mask) {
+               if (xen_vcpu_stolen(cpu)) {
+                       HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+                       break;
+               }
+       }
+ }
+ void xen_smp_send_call_function_single_ipi(int cpu)
+ {
+       xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
+ }
  static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
  {
-       void (*func) (void *info) = call_data->func;
-       void *info = call_data->info;
-       int wait = call_data->wait;
-       /*
-        * Notify initiating CPU that I've grabbed the data and am
-        * about to execute the function
-        */
-       mb();
-       atomic_inc(&call_data->started);
-       /*
-        * At this point the info structure may be out of scope unless wait==1
-        */
        irq_enter();
-       (*func)(info);
+       generic_smp_call_function_interrupt();
        __get_cpu_var(irq_stat).irq_call_count++;
        irq_exit();
  
-       if (wait) {
-               mb();           /* commit everything before setting finished */
-               atomic_inc(&call_data->finished);
-       }
        return IRQ_HANDLED;
  }
  
- int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
-                              void *info, int wait)
+ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
  {
-       struct call_data_struct data;
-       int cpus, cpu;
-       bool yield;
-       /* Holding any lock stops cpus from going down. */
-       spin_lock(&call_lock);
-       cpu_clear(smp_processor_id(), mask);
-       cpus = cpus_weight(mask);
-       if (!cpus) {
-               spin_unlock(&call_lock);
-               return 0;
-       }
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-       call_data = &data;
-       mb();                   /* write everything before IPI */
-       /* Send a message to other CPUs and wait for them to respond */
-       xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
-       /* Make sure other vcpus get a chance to run if they need to. */
-       yield = false;
-       for_each_cpu_mask_nr(cpu, mask)
-               if (xen_vcpu_stolen(cpu))
-                       yield = true;
-       if (yield)
-               HYPERVISOR_sched_op(SCHEDOP_yield, 0);
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus ||
-              (wait && atomic_read(&data.finished) != cpus))
-               cpu_relax();
-       spin_unlock(&call_lock);
+       irq_enter();
+       generic_smp_call_function_single_interrupt();
+       __get_cpu_var(irq_stat).irq_call_count++;
+       irq_exit();
  
-       return 0;
+       return IRQ_HANDLED;
  }
@@@ -531,7 -531,7 +531,7 @@@ void ehca_process_eq(struct ehca_shca *
  {
        struct ehca_eq *eq = &shca->eq;
        struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
-       u64 eqe_value;
+       u64 eqe_value, ret;
        unsigned long flags;
        int eqe_cnt, i;
        int eq_empty = 0;
                        ehca_dbg(&shca->ib_device,
                                 "No eqe found for irq event");
                goto unlock_irq_spinlock;
-       } else if (!is_irq)
+       } else if (!is_irq) {
+               ret = hipz_h_eoi(eq->ist);
+               if (ret != H_SUCCESS)
+                       ehca_err(&shca->ib_device,
+                                "bad return code EOI -rc = %ld\n", ret);
                ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+       }
        if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
                ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
        /* enable irq for new packets */
@@@ -641,8 -646,8 +646,8 @@@ static inline int find_next_online_cpu(
                ehca_dmp(&cpu_online_map, sizeof(cpumask_t), "");
  
        spin_lock_irqsave(&pool->last_cpu_lock, flags);
 -      cpu = next_cpu(pool->last_cpu, cpu_online_map);
 -      if (cpu == NR_CPUS)
 +      cpu = next_cpu_nr(pool->last_cpu, cpu_online_map);
 +      if (cpu >= nr_cpu_ids)
                cpu = first_cpu(cpu_online_map);
        pool->last_cpu = cpu;
        spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
diff --combined include/asm-x86/ipi.h
@@@ -20,6 -20,7 +20,7 @@@
  
  #include <asm/hw_irq.h>
  #include <asm/apic.h>
+ #include <asm/smp.h>
  
  /*
   * the following functions deal with sending IPIs between CPUs.
@@@ -121,7 -122,7 +122,7 @@@ static inline void send_IPI_mask_sequen
         * - mbligh
         */
        local_irq_save(flags);
 -      for_each_cpu_mask(query_cpu, mask) {
 +      for_each_cpu_mask_nr(query_cpu, mask) {
                __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
                                      vector, APIC_DEST_PHYSICAL);
        }
diff --combined kernel/cpu.c
  #include <linux/stop_machine.h>
  #include <linux/mutex.h>
  
+ /*
+  * Represents all cpu's present in the system
+  * In systems capable of hotplug, this map could dynamically grow
+  * as new cpu's are detected in the system via any platform specific
+  * method, such as ACPI for e.g.
+  */
+ cpumask_t cpu_present_map __read_mostly;
+ EXPORT_SYMBOL(cpu_present_map);
+ #ifndef CONFIG_SMP
+ /*
+  * Represents all cpu's that are currently online.
+  */
+ cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+ EXPORT_SYMBOL(cpu_online_map);
+ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+ EXPORT_SYMBOL(cpu_possible_map);
+ #else /* CONFIG_SMP */
  /* Serializes the updates to cpu_online_map, cpu_present_map */
  static DEFINE_MUTEX(cpu_add_remove_lock);
  
@@@ -390,7 -412,7 +412,7 @@@ void __ref enable_nonboot_cpus(void
                goto out;
  
        printk("Enabling non-boot CPUs ...\n");
 -      for_each_cpu_mask(cpu, frozen_cpus) {
 +      for_each_cpu_mask_nr(cpu, frozen_cpus) {
                error = _cpu_up(cpu, 1);
                if (!error) {
                        printk("CPU%d is up\n", cpu);
@@@ -403,3 -425,5 +425,5 @@@ out
        cpu_maps_update_done();
  }
  #endif /* CONFIG_PM_SLEEP_SMP */
+ #endif /* CONFIG_SMP */
diff --combined kernel/rcuclassic.c
@@@ -106,7 -106,7 +106,7 @@@ static void force_quiescent_state(struc
                 */
                cpus_and(cpumask, rcp->cpumask, cpu_online_map);
                cpu_clear(rdp->cpu, cpumask);
 -              for_each_cpu_mask(cpu, cpumask)
 +              for_each_cpu_mask_nr(cpu, cpumask)
                        smp_send_reschedule(cpu);
        }
  }
@@@ -387,6 -387,10 +387,10 @@@ static void __rcu_offline_cpu(struct rc
        rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
        rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
        rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+       local_irq_disable();
+       this_rdp->qlen += rdp->qlen;
+       local_irq_enable();
  }
  
  static void rcu_offline_cpu(int cpu)
@@@ -516,10 -520,38 +520,38 @@@ void rcu_check_callbacks(int cpu, int u
        if (user ||
            (idle_cpu(cpu) && !in_softirq() &&
                                hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+               /*
+                * Get here if this CPU took its interrupt from user
+                * mode or from the idle loop, and if this is not a
+                * nested interrupt.  In this case, the CPU is in
+                * a quiescent state, so count it.
+                *
+                * Also do a memory barrier.  This is needed to handle
+                * the case where writes from a preempt-disable section
+                * of code get reordered into schedule() by this CPU's
+                * write buffer.  The memory barrier makes sure that
+                * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+                * by other CPUs to happen after any such write.
+                */
+               smp_mb();  /* See above block comment. */
                rcu_qsctr_inc(cpu);
                rcu_bh_qsctr_inc(cpu);
-       } else if (!in_softirq())
+       } else if (!in_softirq()) {
+               /*
+                * Get here if this CPU did not take its interrupt from
+                * softirq, in other words, if it is not interrupting
+                * a rcu_bh read-side critical section.  This is an _bh
+                * critical section, so count it.  The memory barrier
+                * is needed for the same reason as is the above one.
+                */
+               smp_mb();  /* See above block comment. */
                rcu_bh_qsctr_inc(cpu);
+       }
        raise_rcu_softirq();
  }
  
@@@ -543,7 -575,7 +575,7 @@@ static void __cpuinit rcu_online_cpu(in
  
        rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
        rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  }
  
  static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
diff --combined kernel/rcupreempt.c
  #include <asm/atomic.h>
  #include <linux/bitops.h>
  #include <linux/module.h>
+ #include <linux/kthread.h>
  #include <linux/completion.h>
  #include <linux/moduleparam.h>
  #include <linux/percpu.h>
  #include <linux/notifier.h>
- #include <linux/rcupdate.h>
  #include <linux/cpu.h>
  #include <linux/random.h>
  #include <linux/delay.h>
@@@ -82,14 -82,18 +82,18 @@@ struct rcu_data 
        spinlock_t      lock;           /* Protect rcu_data fields. */
        long            completed;      /* Number of last completed batch. */
        int             waitlistcount;
-       struct tasklet_struct rcu_tasklet;
        struct rcu_head *nextlist;
        struct rcu_head **nexttail;
        struct rcu_head *waitlist[GP_STAGES];
        struct rcu_head **waittail[GP_STAGES];
-       struct rcu_head *donelist;
+       struct rcu_head *donelist;      /* from waitlist & waitschedlist */
        struct rcu_head **donetail;
        long rcu_flipctr[2];
+       struct rcu_head *nextschedlist;
+       struct rcu_head **nextschedtail;
+       struct rcu_head *waitschedlist;
+       struct rcu_head **waitschedtail;
+       int rcu_sched_sleeping;
  #ifdef CONFIG_RCU_TRACE
        struct rcupreempt_trace trace;
  #endif /* #ifdef CONFIG_RCU_TRACE */
@@@ -131,11 -135,24 +135,24 @@@ enum rcu_try_flip_states 
        rcu_try_flip_waitmb_state,
  };
  
+ /*
+  * States for rcu_ctrlblk.rcu_sched_sleep.
+  */
+ enum rcu_sched_sleep_states {
+       rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP.  */
+       rcu_sched_sleep_prep,   /* Thinking of sleeping, rechecking. */
+       rcu_sched_sleeping,     /* Sleeping, awaken if GP needed. */
+ };
  struct rcu_ctrlblk {
        spinlock_t      fliplock;       /* Protect state-machine transitions. */
        long            completed;      /* Number of last completed batch. */
        enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
                                                        the rcu state machine */
+       spinlock_t      schedlock;      /* Protect rcu_sched sleep state. */
+       enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+       wait_queue_head_t sched_wq;     /* Place for rcu_sched to sleep. */
  };
  
  static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@@ -143,8 -160,12 +160,12 @@@ static struct rcu_ctrlblk rcu_ctrlblk 
        .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
        .completed = 0,
        .rcu_try_flip_state = rcu_try_flip_idle_state,
+       .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+       .sched_sleep = rcu_sched_not_sleeping,
+       .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
  };
  
+ static struct task_struct *rcu_sched_grace_period_task;
  
  #ifdef CONFIG_RCU_TRACE
  static char *rcu_try_flip_state_names[] =
@@@ -207,6 -228,8 +228,8 @@@ static DEFINE_PER_CPU_SHARED_ALIGNED(en
   */
  #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
  
+ #define RCU_SCHED_BATCH_TIME (HZ / 50)
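For reference, HZ/50 jiffies is one fiftieth of a second, so the rcu_sched grace-period thread batches its work in roughly 20 ms chunks regardless of the configured HZ (20 jiffies at HZ=1000, 5 at HZ=250, 2 at HZ=100).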
  /*
   * Return the number of RCU batches processed thus far.  Useful
   * for debug and statistics.
@@@ -411,32 -434,34 +434,34 @@@ static void __rcu_advance_callbacks(str
        }
  }
  
- #ifdef CONFIG_NO_HZ
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+       .dynticks = 1,
+ };
  
- DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
- static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+ #ifdef CONFIG_NO_HZ
  static DEFINE_PER_CPU(int, rcu_update_flag);
  
  /**
   * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
   *
   * If the CPU was idle with dynamic ticks active, this updates the
-  * dynticks_progress_counter to let the RCU handling know that the
+  * rcu_dyntick_sched.dynticks to let the RCU handling know that the
   * CPU is active.
   */
  void rcu_irq_enter(void)
  {
        int cpu = smp_processor_id();
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
        if (per_cpu(rcu_update_flag, cpu))
                per_cpu(rcu_update_flag, cpu)++;
  
        /*
         * Only update if we are coming from a stopped ticks mode
-        * (dynticks_progress_counter is even).
+        * (rcu_dyntick_sched.dynticks is even).
         */
        if (!in_interrupt() &&
-           (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+           (rdssp->dynticks & 0x1) == 0) {
                /*
                 * The following might seem like we could have a race
                 * with NMI/SMIs. But this really isn't a problem.
                 * RCU read-side critical sections on this CPU would
                 * have already completed.
                 */
-               per_cpu(dynticks_progress_counter, cpu)++;
+               rdssp->dynticks++;
                /*
                 * The following memory barrier ensures that any
                 * rcu_read_lock() primitives in the irq handler
                 * are seen by other CPUs to follow the above
-                * increment to dynticks_progress_counter. This is
+                * increment to rcu_dyntick_sched.dynticks. This is
                 * required in order for other CPUs to correctly
                 * determine when it is safe to advance the RCU
                 * grace-period state machine.
                smp_mb(); /* see above block comment. */
                /*
                 * Since we can't determine the dynamic tick mode from
-                * the dynticks_progress_counter after this routine,
+                * the rcu_dyntick_sched.dynticks after this routine,
                 * we use a second flag to acknowledge that we came
                 * from an idle state with ticks stopped.
                 */
                /*
                 * If we take an NMI/SMI now, they will also increment
                 * the rcu_update_flag, and will not update the
-                * dynticks_progress_counter on exit. That is for
+                * rcu_dyntick_sched.dynticks on exit. That is for
                 * this IRQ to do.
                 */
        }
   * rcu_irq_exit - Called from exiting Hard irq context.
   *
   * If the CPU was idle with dynamic ticks active, update the
-  * dynticks_progress_counter to put let the RCU handling be
+  * rcu_dyntick_sched.dynticks to let the RCU handling be
   * aware that the CPU is going back to idle with no ticks.
   */
  void rcu_irq_exit(void)
  {
        int cpu = smp_processor_id();
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
        /*
         * rcu_update_flag is set if we interrupted the CPU
         * Once this occurs, we keep track of interrupt nesting
         * because a NMI/SMI could also come in, and we still
         * only want the IRQ that started the increment of the
-        * dynticks_progress_counter to be the one that modifies
+        * rcu_dyntick_sched.dynticks to be the one that modifies
         * it on exit.
         */
        if (per_cpu(rcu_update_flag, cpu)) {
  
                /*
                 * If an NMI/SMI happens now we are still
-                * protected by the dynticks_progress_counter being odd.
+                * protected by the rcu_dyntick_sched.dynticks being odd.
                 */
  
                /*
                 * The following memory barrier ensures that any
                 * rcu_read_unlock() primitives in the irq handler
                 * are seen by other CPUs to precede the following
-                * increment to dynticks_progress_counter. This
+                * increment to rcu_dyntick_sched.dynticks. This
                 * is required in order for other CPUs to determine
                 * when it is safe to advance the RCU grace-period
                 * state machine.
                 */
                smp_mb(); /* see above block comment. */
-               per_cpu(dynticks_progress_counter, cpu)++;
-               WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+               rdssp->dynticks++;
+               WARN_ON(rdssp->dynticks & 0x1);
        }
  }
  
  static void dyntick_save_progress_counter(int cpu)
  {
-       per_cpu(rcu_dyntick_snapshot, cpu) =
-               per_cpu(dynticks_progress_counter, cpu);
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       rdssp->dynticks_snap = rdssp->dynticks;
  }
  
  static inline int
@@@ -544,9 -571,10 +571,10 @@@ rcu_try_flip_waitack_needed(int cpu
  {
        long curr;
        long snap;
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
-       curr = per_cpu(dynticks_progress_counter, cpu);
-       snap = per_cpu(rcu_dyntick_snapshot, cpu);
+       curr = rdssp->dynticks;
+       snap = rdssp->dynticks_snap;
        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  
        /*
         * that this CPU already acknowledged the counter.
         */
  
-       if ((curr - snap) > 2 || (snap & 0x1) == 0)
+       if ((curr - snap) > 2 || (curr & 0x1) == 0)
                return 0;
  
        /* We need this CPU to explicitly acknowledge the counter flip. */
@@@ -580,9 -608,10 +608,10 @@@ rcu_try_flip_waitmb_needed(int cpu
  {
        long curr;
        long snap;
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
-       curr = per_cpu(dynticks_progress_counter, cpu);
-       snap = per_cpu(rcu_dyntick_snapshot, cpu);
+       curr = rdssp->dynticks;
+       snap = rdssp->dynticks_snap;
        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  
        /*
        return 1;
  }
  
+ static void dyntick_save_progress_counter_sched(int cpu)
+ {
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       rdssp->sched_dynticks_snap = rdssp->dynticks;
+ }
+ static int rcu_qsctr_inc_needed_dyntick(int cpu)
+ {
+       long curr;
+       long snap;
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       curr = rdssp->dynticks;
+       snap = rdssp->sched_dynticks_snap;
+       smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+       /*
+        * If the CPU remained in dynticks mode for the entire time
+        * and didn't take any interrupts, NMIs, SMIs, or whatever,
+        * then it cannot be in the middle of an rcu_read_lock(), so
+        * the next rcu_read_lock() it executes must use the new value
+        * of the counter.  Therefore, this CPU has been in a quiescent
+        * state the entire time, and we don't need to wait for it.
+        */
+       if ((curr == snap) && ((curr & 0x1) == 0))
+               return 0;
+       /*
+        * If the CPU passed through or entered a dynticks idle phase with
+        * no active irq handlers, then, as above, this CPU has already
+        * passed through a quiescent state.
+        */
+       if ((curr - snap) > 2 || (snap & 0x1) == 0)
+               return 0;
+       /* We need this CPU to go through a quiescent state. */
+       return 1;
+ }
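As a rough illustration of the two tests above, a minimal user-space model follows; the function name and the example values are made up for illustration, and it ignores the smp_mb() ordering that the kernel version depends on:

/* Returns nonzero if the grace period must still wait on this CPU. */
static int needs_quiescent_state(long curr, long snap)
{
	if (curr == snap && (curr & 0x1) == 0)
		return 0;	/* even and unchanged: dynticks-idle throughout */
	if ((curr - snap) > 2 || (snap & 0x1) == 0)
		return 0;	/* passed through (or started in) dynticks idle */
	return 1;		/* busy the whole time: wait for a quiescent state */
}

/*
 * Example values:  curr=8,  snap=8  -> 0 (idle the whole time)
 *                  curr=11, snap=7  -> 0 (counter advanced past an idle period)
 *                  curr=9,  snap=9  -> 1 (stayed busy, no idle period seen)
 */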
  #else /* !CONFIG_NO_HZ */
  
- # define dyntick_save_progress_counter(cpu)   do { } while (0)
- # define rcu_try_flip_waitack_needed(cpu)     (1)
- # define rcu_try_flip_waitmb_needed(cpu)      (1)
+ # define dyntick_save_progress_counter(cpu)           do { } while (0)
+ # define rcu_try_flip_waitack_needed(cpu)             (1)
+ # define rcu_try_flip_waitmb_needed(cpu)              (1)
+ # define dyntick_save_progress_counter_sched(cpu)     do { } while (0)
+ # define rcu_qsctr_inc_needed_dyntick(cpu)            (1)
  
  #endif /* CONFIG_NO_HZ */
  
+ static void save_qsctr_sched(int cpu)
+ {
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       rdssp->sched_qs_snap = rdssp->sched_qs;
+ }
+ static inline int rcu_qsctr_inc_needed(int cpu)
+ {
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       /*
+        * If there has been a quiescent state, no more need to wait
+        * on this CPU.
+        */
+       if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+               smp_mb(); /* force ordering with cpu entering schedule(). */
+               return 0;
+       }
+       /* We need this CPU to go through a quiescent state. */
+       return 1;
+ }
  /*
   * Get here when RCU is idle.  Decide whether we need to
   * move out of idle state, and return non-zero if so.
@@@ -655,7 -756,7 +756,7 @@@ rcu_try_flip_idle(void
  
        /* Now ask each CPU for acknowledgement of the flip. */
  
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
                dyntick_save_progress_counter(cpu);
        }
@@@ -673,7 -774,7 +774,7 @@@ rcu_try_flip_waitack(void
        int cpu;
  
        RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map)
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitack_needed(cpu) &&
                    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@@ -705,7 -806,7 +806,7 @@@ rcu_try_flip_waitzero(void
        /* Check to see if the sum of the "last" counters is zero. */
  
        RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map)
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
        if (sum != 0) {
                RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
        smp_mb();  /*  ^^^^^^^^^^^^ */
  
        /* Call for a memory barrier from each CPU. */
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
                dyntick_save_progress_counter(cpu);
        }
@@@ -740,7 -841,7 +841,7 @@@ rcu_try_flip_waitmb(void
        int cpu;
  
        RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map)
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitmb_needed(cpu) &&
                    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@@ -819,6 -920,26 +920,26 @@@ void rcu_check_callbacks(int cpu, int u
        unsigned long flags;
        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  
+       /*
+        * If this CPU took its interrupt from user mode or from the
+        * idle loop, and this is not a nested interrupt, then
+        * this CPU has to have exited all prior preempt-disable
+        * sections of code.  So increment the counter to note this.
+        *
+        * The memory barrier is needed to handle the case where
+        * writes from a preempt-disable section of code get reordered
+        * into schedule() by this CPU's write buffer.  So the memory
+        * barrier makes sure that the rcu_qsctr_inc() is seen by other
+        * CPUs to happen after any such write.
+        */
+       if (user ||
+           (idle_cpu(cpu) && !in_softirq() &&
+            hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+               smp_mb();       /* Guard against aggressive schedule(). */
+               rcu_qsctr_inc(cpu);
+       }
        rcu_check_mb(cpu);
        if (rcu_ctrlblk.completed == rdp->completed)
                rcu_try_flip();
@@@ -869,6 -990,8 +990,8 @@@ void rcu_offline_cpu(int cpu
        struct rcu_head *list = NULL;
        unsigned long flags;
        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+       struct rcu_head *schedlist = NULL;
+       struct rcu_head **schedtail = &schedlist;
        struct rcu_head **tail = &list;
  
        /*
                rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
                                                list, tail);
        rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+       rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+                               schedlist, schedtail);
+       rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+                               schedlist, schedtail);
+       rdp->rcu_sched_sleeping = 0;
        spin_unlock_irqrestore(&rdp->lock, flags);
        rdp->waitlistcount = 0;
  
         * fix.
         */
  
-       local_irq_save(flags);
+       local_irq_save(flags);  /* disable preempt till we know which lock. */
        rdp = RCU_DATA_ME();
        spin_lock(&rdp->lock);
        *rdp->nexttail = list;
        if (list)
                rdp->nexttail = tail;
+       *rdp->nextschedtail = schedlist;
+       if (schedlist)
+               rdp->nextschedtail = schedtail;
        spin_unlock_irqrestore(&rdp->lock, flags);
  }
  
- void __devinit rcu_online_cpu(int cpu)
+ #else /* #ifdef CONFIG_HOTPLUG_CPU */
+ void rcu_offline_cpu(int cpu)
+ {
+ }
+ #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+ void __cpuinit rcu_online_cpu(int cpu)
  {
        unsigned long flags;
+       struct rcu_data *rdp;
  
        spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
        cpu_set(cpu, rcu_cpu_online_map);
        spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
- }
  
- #else /* #ifdef CONFIG_HOTPLUG_CPU */
- void rcu_offline_cpu(int cpu)
- {
- }
+       /*
+        * The rcu_sched grace-period processing might have bypassed
+        * this CPU, given that it was not in the rcu_cpu_online_map
+        * when the grace-period scan started.  This means that the
+        * grace-period task might sleep.  So make sure that if this
+        * should happen, the first callback posted to this CPU will
+        * wake up the grace-period task if need be.
+        */
  
- void __devinit rcu_online_cpu(int cpu)
- {
+       rdp = RCU_DATA_CPU(cpu);
+       spin_lock_irqsave(&rdp->lock, flags);
+       rdp->rcu_sched_sleeping = 1;
+       spin_unlock_irqrestore(&rdp->lock, flags);
  }
  
- #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
  static void rcu_process_callbacks(struct softirq_action *unused)
  {
        unsigned long flags;
@@@ -986,31 -1128,196 +1128,196 @@@ void call_rcu(struct rcu_head *head, vo
        *rdp->nexttail = head;
        rdp->nexttail = &head->next;
        RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
-       spin_unlock(&rdp->lock);
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&rdp->lock, flags);
  }
  EXPORT_SYMBOL_GPL(call_rcu);
  
+ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+ {
+       unsigned long flags;
+       struct rcu_data *rdp;
+       int wake_gp = 0;
+       head->func = func;
+       head->next = NULL;
+       local_irq_save(flags);
+       rdp = RCU_DATA_ME();
+       spin_lock(&rdp->lock);
+       *rdp->nextschedtail = head;
+       rdp->nextschedtail = &head->next;
+       if (rdp->rcu_sched_sleeping) {
+               /* Grace-period processing might be sleeping... */
+               rdp->rcu_sched_sleeping = 0;
+               wake_gp = 1;
+       }
+       spin_unlock_irqrestore(&rdp->lock, flags);
+       if (wake_gp) {
+               /* Wake up grace-period processing, unless someone beat us. */
+               spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+               if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+                       wake_gp = 0;
+               rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+               spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+               if (wake_gp)
+                       wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+       }
+ }
+ EXPORT_SYMBOL_GPL(call_rcu_sched);
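For context, a minimal sketch of how a caller would use the new interface; struct my_obj, my_obj_free() and my_obj_release() are hypothetical names, not part of this patch, and a real user would also need <linux/rcupdate.h> and <linux/slab.h>:

struct my_obj {
	int data;
	struct rcu_head rcu;
};

static void my_obj_free(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

/*
 * Called after obj has been unlinked from all reader-visible structures;
 * the actual kfree() is deferred until every preempt-disabled (and
 * irq-disabled) region that might still reference obj has completed.
 */
static void my_obj_release(struct my_obj *obj)
{
	call_rcu_sched(&obj->rcu, my_obj_free);
}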
  /*
   * Wait until all currently running preempt_disable() code segments
   * (including hardware-irq-disable segments) complete.  Note that
   * in -rt this does -not- necessarily result in all currently executing
   * interrupt -handlers- having completed.
   */
- void __synchronize_sched(void)
+ synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+ EXPORT_SYMBOL_GPL(__synchronize_sched);
+ /*
+  * kthread function that manages call_rcu_sched grace periods.
+  */
+ static int rcu_sched_grace_period(void *arg)
  {
-       cpumask_t oldmask;
+       int couldsleep;         /* might sleep after current pass. */
+       int couldsleepnext = 0; /* might sleep after next pass. */
        int cpu;
+       unsigned long flags;
+       struct rcu_data *rdp;
+       int ret;
  
-       if (sched_getaffinity(0, &oldmask) < 0)
-               oldmask = cpu_possible_map;
-       for_each_online_cpu(cpu) {
-               sched_setaffinity(0, &cpumask_of_cpu(cpu));
-               schedule();
-       }
-       sched_setaffinity(0, &oldmask);
+       /*
+        * Each pass through the following loop handles one
+        * rcu_sched grace period cycle.
+        */
+       do {
+               /* Save each CPU's current state. */
+               for_each_online_cpu(cpu) {
+                       dyntick_save_progress_counter_sched(cpu);
+                       save_qsctr_sched(cpu);
+               }
+               /*
+                * Sleep for about an RCU grace-period's worth to
+                * allow better batching and to consume less CPU.
+                */
+               schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+               /*
+                * If there was nothing to do last time, prepare to
+                * sleep at the end of the current grace period cycle.
+                */
+               couldsleep = couldsleepnext;
+               couldsleepnext = 1;
+               if (couldsleep) {
+                       spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+                       rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+                       spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+               }
+               /*
+                * Wait on each CPU in turn to have either visited
+                * a quiescent state or been in dynticks-idle mode.
+                */
+               for_each_online_cpu(cpu) {
+                       while (rcu_qsctr_inc_needed(cpu) &&
+                              rcu_qsctr_inc_needed_dyntick(cpu)) {
+                               /* resched_cpu(cpu); @@@ */
+                               schedule_timeout_interruptible(1);
+                       }
+               }
+               /* Advance callbacks for each CPU.  */
+               for_each_online_cpu(cpu) {
+                       rdp = RCU_DATA_CPU(cpu);
+                       spin_lock_irqsave(&rdp->lock, flags);
+                       /*
+                        * We are running on this CPU irq-disabled, so no
+                        * CPU can go offline until we re-enable irqs.
+                        * The current CPU might have already gone
+                        * offline (between the for_each_online_cpu and
+                        * the spin_lock_irqsave), but in that case all its
+                        * callback lists will be empty, so no harm done.
+                        *
+                        * Advance the callbacks!  We share normal RCU's
+                        * donelist, since callbacks are invoked the
+                        * same way in either case.
+                        */
+                       if (rdp->waitschedlist != NULL) {
+                               *rdp->donetail = rdp->waitschedlist;
+                               rdp->donetail = rdp->waitschedtail;
+                               /*
+                                * Next rcu_check_callbacks() will
+                                * do the required raise_softirq().
+                                */
+                       }
+                       if (rdp->nextschedlist != NULL) {
+                               rdp->waitschedlist = rdp->nextschedlist;
+                               rdp->waitschedtail = rdp->nextschedtail;
+                               couldsleep = 0;
+                               couldsleepnext = 0;
+                       } else {
+                               rdp->waitschedlist = NULL;
+                               rdp->waitschedtail = &rdp->waitschedlist;
+                       }
+                       rdp->nextschedlist = NULL;
+                       rdp->nextschedtail = &rdp->nextschedlist;
+                       /* Mark sleep intention. */
+                       rdp->rcu_sched_sleeping = couldsleep;
+                       spin_unlock_irqrestore(&rdp->lock, flags);
+               }
+               /* If we saw callbacks on the last scan, go deal with them. */
+               if (!couldsleep)
+                       continue;
+               /* Attempt to block... */
+               spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+               if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+                       /*
+                        * Someone posted a callback after we scanned.
+                        * Go take care of it.
+                        */
+                       spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+                       couldsleepnext = 0;
+                       continue;
+               }
+               /* Block until the next person posts a callback. */
+               rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+               spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+               ret = 0;
+               __wait_event_interruptible(rcu_ctrlblk.sched_wq,
+                       rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+                       ret);
+               /*
+                * Signals would prevent us from sleeping, and we cannot
+                * do much with them in any case.  So flush them.
+                */
+               if (ret)
+                       flush_signals(current);
+               couldsleepnext = 0;
+       } while (!kthread_should_stop());
+       return (0);
  }
- EXPORT_SYMBOL_GPL(__synchronize_sched);
  
  /*
   * Check to see if any future RCU-related work will need to be done
@@@ -1027,7 -1334,9 +1334,9 @@@ int rcu_needs_cpu(int cpu
  
        return (rdp->donelist != NULL ||
                !!rdp->waitlistcount ||
-               rdp->nextlist != NULL);
+               rdp->nextlist != NULL ||
+               rdp->nextschedlist != NULL ||
+               rdp->waitschedlist != NULL);
  }
  
  int rcu_pending(int cpu)
  
        if (rdp->donelist != NULL ||
            !!rdp->waitlistcount ||
-           rdp->nextlist != NULL)
+           rdp->nextlist != NULL ||
+           rdp->nextschedlist != NULL ||
+           rdp->waitschedlist != NULL)
                return 1;
  
        /* The RCU core needs an acknowledgement from this CPU. */
@@@ -1105,6 -1416,11 +1416,11 @@@ void __init __rcu_init(void
                rdp->donetail = &rdp->donelist;
                rdp->rcu_flipctr[0] = 0;
                rdp->rcu_flipctr[1] = 0;
+               rdp->nextschedlist = NULL;
+               rdp->nextschedtail = &rdp->nextschedlist;
+               rdp->waitschedlist = NULL;
+               rdp->waitschedtail = &rdp->waitschedlist;
+               rdp->rcu_sched_sleeping = 0;
        }
        register_cpu_notifier(&rcu_nb);
  
        for_each_online_cpu(cpu)
                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
  
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  }
  
  /*
-  * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+  * Late-boot-time RCU initialization that must wait until after the
+  * scheduler has been initialized.
   */
- void synchronize_kernel(void)
+ void __init rcu_init_sched(void)
  {
-       synchronize_rcu();
+       rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+                                                 NULL,
+                                                 "rcu_sched_grace_period");
+       WARN_ON(IS_ERR(rcu_sched_grace_period_task));
  }
  
  #ifdef CONFIG_RCU_TRACE
diff --combined kernel/sched.c
  #include <linux/bootmem.h>
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
+ #include <linux/ftrace.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
  
+ #include "sched_cpupri.h"
  /*
   * Convert user-nice values [ -20 ... 0 ... 19 ]
   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@@ -289,15 -292,15 +292,15 @@@ struct task_group root_task_group
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
  static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
- #endif
- #else
+ #endif /* CONFIG_RT_GROUP_SCHED */
+ #else /* !CONFIG_FAIR_GROUP_SCHED */
  #define root_task_group init_task_group
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@@ -307,9 -310,9 +310,9 @@@ static DEFINE_SPINLOCK(task_group_lock)
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
- #else
+ #else /* !CONFIG_USER_SCHED */
  # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
- #endif
+ #endif /* CONFIG_USER_SCHED */
  
  /*
   * A weight of 0 or 1 can cause arithmetics problems.
@@@ -363,6 -366,10 +366,10 @@@ static inline void set_task_rq(struct t
  #else
  
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+ static inline struct task_group *task_group(struct task_struct *p)
+ {
+       return NULL;
+ }
  
  #endif        /* CONFIG_GROUP_SCHED */
  
@@@ -373,6 -380,7 +380,7 @@@ struct cfs_rq 
  
        u64 exec_clock;
        u64 min_vruntime;
+       u64 pair_start;
  
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
+ #ifdef CONFIG_SMP
+       /*
+        * the part of load.weight contributed by tasks
+        */
+       unsigned long task_weight;
+       /*
+        *   h_load = weight * f(tg)
+        *
+        * Where f(tg) is the recursive weight fraction assigned to
+        * this group.
+        */
+       unsigned long h_load;
+       /*
+        * this cpu's part of tg->shares
+        */
+       unsigned long shares;
+       /*
+        * load.weight at the time we set shares
+        */
+       unsigned long rq_weight;
+ #endif
  #endif
  };
  
@@@ -452,6 -485,9 +485,9 @@@ struct root_domain 
         */
        cpumask_t rto_mask;
        atomic_t rto_count;
+ #ifdef CONFIG_SMP
+       struct cpupri cpupri;
+ #endif
  };
  
  /*
@@@ -526,6 -562,9 +562,9 @@@ struct rq 
        int push_cpu;
        /* cpu of this runqueue: */
        int cpu;
+       int online;
+       unsigned long avg_load_per_task;
  
        struct task_struct *migration_thread;
        struct list_head migration_queue;
@@@ -607,6 -646,24 +646,24 @@@ static inline void update_rq_clock(stru
  # define const_debug static const
  #endif
  
+ /**
+  * runqueue_is_locked
+  *
+  * Returns true if the current cpu runqueue is locked.
+  * This interface allows printk to be called with the runqueue lock
+  * held and know whether or not it is OK to wake up the klogd.
+  */
+ int runqueue_is_locked(void)
+ {
+       int cpu = get_cpu();
+       struct rq *rq = cpu_rq(cpu);
+       int ret;
+       ret = spin_is_locked(&rq->lock);
+       put_cpu();
+       return ret;
+ }
  /*
   * Debugging: various feature bits
   */
@@@ -748,6 -805,12 +805,12 @@@ late_initcall(sched_init_debug)
   */
  const_debug unsigned int sysctl_sched_nr_migrate = 32;
  
+ /*
+  * ratelimit for updating the group shares.
+  * default: 0.5ms
+  */
+ const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
  /*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
@@@ -775,82 -838,6 +838,6 @@@ static inline u64 global_rt_runtime(voi
        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
  }
  
- unsigned long long time_sync_thresh = 100000;
- static DEFINE_PER_CPU(unsigned long long, time_offset);
- static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
- /*
-  * Global lock which we take every now and then to synchronize
-  * the CPUs time. This method is not warp-safe, but it's good
-  * enough to synchronize slowly diverging time sources and thus
-  * it's good enough for tracing:
-  */
- static DEFINE_SPINLOCK(time_sync_lock);
- static unsigned long long prev_global_time;
- static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
- {
-       /*
-        * We want this inlined, to not get tracer function calls
-        * in this critical section:
-        */
-       spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
-       __raw_spin_lock(&time_sync_lock.raw_lock);
-       if (time < prev_global_time) {
-               per_cpu(time_offset, cpu) += prev_global_time - time;
-               time = prev_global_time;
-       } else {
-               prev_global_time = time;
-       }
-       __raw_spin_unlock(&time_sync_lock.raw_lock);
-       spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
-       return time;
- }
- static unsigned long long __cpu_clock(int cpu)
- {
-       unsigned long long now;
-       /*
-        * Only call sched_clock() if the scheduler has already been
-        * initialized (some code might call cpu_clock() very early):
-        */
-       if (unlikely(!scheduler_running))
-               return 0;
-       now = sched_clock_cpu(cpu);
-       return now;
- }
- /*
-  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
-  * clock constructed from sched_clock():
-  */
- unsigned long long cpu_clock(int cpu)
- {
-       unsigned long long prev_cpu_time, time, delta_time;
-       unsigned long flags;
-       local_irq_save(flags);
-       prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-       time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
-       delta_time = time-prev_cpu_time;
-       if (unlikely(delta_time > time_sync_thresh)) {
-               time = __sync_cpu_clock(time, cpu);
-               per_cpu(prev_cpu_time, cpu) = time;
-       }
-       local_irq_restore(flags);
-       return time;
- }
- EXPORT_SYMBOL_GPL(cpu_clock);
  #ifndef prepare_arch_switch
  # define prepare_arch_switch(next)    do { } while (0)
  #endif
@@@ -1313,15 -1300,15 +1300,15 @@@ void wake_up_idle_cpu(int cpu
        if (!tsk_is_polling(rq->idle))
                smp_send_reschedule(cpu);
  }
- #endif
+ #endif /* CONFIG_NO_HZ */
  
- #else
+ #else /* !CONFIG_SMP */
  static void __resched_task(struct task_struct *p, int tif_bit)
  {
        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_thread_flag(p, tif_bit);
  }
- #endif
+ #endif /* CONFIG_SMP */
  
  #if BITS_PER_LONG == 32
  # define WMULT_CONST  (~0UL)
   */
  #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
  
+ /*
+  * delta *= weight / lw
+  */
  static unsigned long
  calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
  }
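As a quick sanity check of the formula above (illustrative numbers only): with weight = NICE_0_LOAD = 1024 and lw->weight = 2048 (say, two nice-0 entities), a delta_exec of 4000000 ns scales to roughly 4000000 * 1024 / 2048 = 2000000 ns; the precomputed lw->inv_weight and the rounding shift are just a way to avoid the 64-bit division.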
  
- static inline unsigned long
- calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
- {
-       return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
- }
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
        lw->weight += inc;
@@@ -1479,17 -1463,211 +1463,211 @@@ static inline void dec_cpu_load(struct 
  #ifdef CONFIG_SMP
  static unsigned long source_load(int cpu, int type);
  static unsigned long target_load(int cpu, int type);
- static unsigned long cpu_avg_load_per_task(int cpu);
  static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- #else /* CONFIG_SMP */
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       if (rq->nr_running)
+               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       return rq->avg_load_per_task;
+ }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
- static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ /*
+  * Iterate the full tree, calling @down when first entering a node and @up when
+  * leaving it for the final time.
+  */
+ static void
+ walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
  {
+       struct task_group *parent, *child;
+       rcu_read_lock();
+       parent = &root_task_group;
+ down:
+       (*down)(parent, cpu, sd);
+       list_for_each_entry_rcu(child, &parent->children, siblings) {
+               parent = child;
+               goto down;
+ up:
+               continue;
+       }
+       (*up)(parent, cpu, sd);
+       child = parent;
+       parent = parent->parent;
+       if (parent)
+               goto up;
+       rcu_read_unlock();
  }
+ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+ /*
+  * Calculate and set the cpu's group shares.
+  */
+ static void
+ __update_group_shares_cpu(struct task_group *tg, int cpu,
+                         unsigned long sd_shares, unsigned long sd_rq_weight)
+ {
+       int boost = 0;
+       unsigned long shares;
+       unsigned long rq_weight;
+       if (!tg->se[cpu])
+               return;
+       rq_weight = tg->cfs_rq[cpu]->load.weight;
+       /*
+        * If there are currently no tasks on the cpu, pretend there is one of
+        * average load so that when a new task gets to run here it will not
+        * get delayed by group starvation.
+        */
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
+       if (unlikely(rq_weight > sd_rq_weight))
+               rq_weight = sd_rq_weight;
+       /*
+        *           \Sum shares * rq_weight
+        * shares =  -----------------------
+        *               \Sum rq_weight
+        *
+        */
+       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       /*
+        * record the actual number of shares, not the boosted amount.
+        */
+       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+       tg->cfs_rq[cpu]->rq_weight = rq_weight;
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       else if (shares > MAX_SHARES)
+               shares = MAX_SHARES;
+       __set_se_shares(tg->se[cpu], shares);
+ }
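A worked example for the formula above (numbers are purely illustrative): with sd_shares = 1024, this cpu's rq_weight = 1024 and sd_rq_weight = 4096, shares = 1024 * 1024 / (4096 + 1), roughly 255 -- this cpu gets about a quarter of the group's shares because it carries about a quarter of the group's load in this domain; the result is then clamped to [MIN_SHARES, MAX_SHARES] before being applied.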
+ /*
+  * Re-compute the task group's per-cpu shares over the given domain.
+  * This needs to be done in a bottom-up fashion because the rq weight of a
+  * parent group depends on the shares of its child groups.
+  */
+ static void
+ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ {
+       unsigned long rq_weight = 0;
+       unsigned long shares = 0;
+       int i;
+       for_each_cpu_mask(i, sd->span) {
+               rq_weight += tg->cfs_rq[i]->load.weight;
+               shares += tg->cfs_rq[i]->shares;
+       }
+       if ((!shares && rq_weight) || shares > tg->shares)
+               shares = tg->shares;
+       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+               shares = tg->shares;
+       if (!rq_weight)
+               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+       for_each_cpu_mask(i, sd->span) {
+               struct rq *rq = cpu_rq(i);
+               unsigned long flags;
+               spin_lock_irqsave(&rq->lock, flags);
+               __update_group_shares_cpu(tg, i, shares, rq_weight);
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
+ }
+ /*
+  * Compute the cpu's hierarchical load factor for each task group.
+  * This needs to be done in a top-down fashion because the load of a child
+  * group is a fraction of its parent's load.
+  */
+ static void
+ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ {
+       unsigned long load;
+       if (!tg->parent) {
+               load = cpu_rq(cpu)->load.weight;
+       } else {
+               load = tg->parent->cfs_rq[cpu]->h_load;
+               load *= tg->cfs_rq[cpu]->shares;
+               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+       }
+       tg->cfs_rq[cpu]->h_load = load;
+ }
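For a non-root group the recursion above works out as, for example (illustrative numbers): parent h_load = 2048, this group's cfs_rq[cpu]->shares = 512, parent cfs_rq[cpu]->load.weight = 2048, giving h_load = 2048 * 512 / (2048 + 1), roughly 511 -- about a quarter of the parent's hierarchical load, matching the quarter of the parent's queue weight that this group contributes.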
+ static void
+ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+ {
+ }
+ static void update_shares(struct sched_domain *sd)
+ {
+       u64 now = cpu_clock(raw_smp_processor_id());
+       s64 elapsed = now - sd->last_update;
+       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+               sd->last_update = now;
+               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+       }
+ }
+ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+ {
+       spin_unlock(&rq->lock);
+       update_shares(sd);
+       spin_lock(&rq->lock);
+ }
+ static void update_h_load(int cpu)
+ {
+       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+ }
+ #else
+ static inline void update_shares(struct sched_domain *sd)
+ {
+ }
+ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+ {
+ }
  #endif
  
- #endif /* CONFIG_SMP */
+ #endif
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ {
+ #ifdef CONFIG_SMP
+       cfs_rq->shares = shares;
+ #endif
+ }
+ #endif
  
  #include "sched_stats.h"
  #include "sched_idletask.c"
  #endif
  
  #define sched_class_highest (&rt_sched_class)
+ #define for_each_class(class) \
+    for (class = sched_class_highest; class; class = class->next)
  
- static inline void inc_load(struct rq *rq, const struct task_struct *p)
- {
-       update_load_add(&rq->load, p->se.load.weight);
- }
- static inline void dec_load(struct rq *rq, const struct task_struct *p)
- {
-       update_load_sub(&rq->load, p->se.load.weight);
- }
- static void inc_nr_running(struct task_struct *p, struct rq *rq)
+ static void inc_nr_running(struct rq *rq)
  {
        rq->nr_running++;
-       inc_load(rq, p);
  }
  
- static void dec_nr_running(struct task_struct *p, struct rq *rq)
+ static void dec_nr_running(struct rq *rq)
  {
        rq->nr_running--;
-       dec_load(rq, p);
  }
  
  static void set_load_weight(struct task_struct *p)
        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
  }
  
+ static void update_avg(u64 *avg, u64 sample)
+ {
+       s64 diff = sample - *avg;
+       *avg += diff >> 3;
+ }
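update_avg() is an exponential moving average with weight 1/8: for example, with *avg = 1000 and sample = 2000, diff = 1000 and *avg becomes 1000 + (1000 >> 3) = 1125, so each new sample pulls the average an eighth of the way toward itself.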
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
        sched_info_queued(p);
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
+       if (sleep && p->se.last_wakeup) {
+               update_avg(&p->se.avg_overlap,
+                          p->se.sum_exec_runtime - p->se.last_wakeup);
+               p->se.last_wakeup = 0;
+       }
+       sched_info_dequeued(p);
        p->sched_class->dequeue_task(rq, p, sleep);
        p->se.on_rq = 0;
  }
@@@ -1612,7 -1793,7 +1793,7 @@@ static void activate_task(struct rq *rq
                rq->nr_uninterruptible--;
  
        enqueue_task(rq, p, wakeup);
-       inc_nr_running(p, rq);
+       inc_nr_running(rq);
  }
  
  /*
@@@ -1624,7 -1805,7 +1805,7 @@@ static void deactivate_task(struct rq *
                rq->nr_uninterruptible++;
  
        dequeue_task(rq, p, sleep);
-       dec_nr_running(p, rq);
+       dec_nr_running(rq);
  }
  
  /**
@@@ -1636,12 -1817,6 +1817,6 @@@ inline int task_curr(const struct task_
        return cpu_curr(task_cpu(p)) == p;
  }
  
- /* Used instead of source_load when we know the type == 0 */
- unsigned long weighted_cpuload(const int cpu)
- {
-       return cpu_rq(cpu)->load.weight;
- }
  static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  {
        set_task_rq(p, cpu);
@@@ -1670,6 -1845,12 +1845,12 @@@ static inline void check_class_changed(
  
  #ifdef CONFIG_SMP
  
+ /* Used instead of source_load when we know the type == 0 */
+ static unsigned long weighted_cpuload(const int cpu)
+ {
+       return cpu_rq(cpu)->load.weight;
+ }
  /*
   * Is this task likely cache-hot:
   */
@@@ -1880,7 -2061,7 +2061,7 @@@ static unsigned long source_load(int cp
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
  
-       if (type == 0)
+       if (type == 0 || !sched_feat(LB_BIAS))
                return total;
  
        return min(rq->cpu_load[type-1], total);
@@@ -1895,24 -2076,12 +2076,12 @@@ static unsigned long target_load(int cp
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
  
-       if (type == 0)
+       if (type == 0 || !sched_feat(LB_BIAS))
                return total;
  
        return max(rq->cpu_load[type-1], total);
  }
  
- /*
-  * Return the average load per task on the cpu's run queue
-  */
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
-       unsigned long n = rq->nr_running;
-       return n ? total / n : SCHED_LOAD_SCALE;
- }
  /*
   * find_idlest_group finds and returns the least busy CPU group within the
   * domain.
@@@ -1939,7 -2108,7 +2108,7 @@@ find_idlest_group(struct sched_domain *
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
  
 -              for_each_cpu_mask(i, group->cpumask) {
 +              for_each_cpu_mask_nr(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@@ -1981,7 -2150,7 +2150,7 @@@ find_idlest_cpu(struct sched_group *gro
        /* Traverse only the allowed CPUs */
        cpus_and(*tmp, group->cpumask, p->cpus_allowed);
  
 -      for_each_cpu_mask(i, *tmp) {
 +      for_each_cpu_mask_nr(i, *tmp) {
                load = weighted_cpuload(i);
  
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2019,6 -2188,9 +2188,9 @@@ static int sched_balance_self(int cpu, 
                        sd = tmp;
        }
  
+       if (sd)
+               update_shares(sd);
        while (sd) {
                cpumask_t span, tmpmask;
                struct sched_group *group;
@@@ -2085,6 -2257,22 +2257,22 @@@ static int try_to_wake_up(struct task_s
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
  
+ #ifdef CONFIG_SMP
+       if (sched_feat(LB_WAKEUP_UPDATE)) {
+               struct sched_domain *sd;
+               this_cpu = raw_smp_processor_id();
+               cpu = task_cpu(p);
+               for_each_domain(this_cpu, sd) {
+                       if (cpu_isset(cpu, sd->span)) {
+                               update_shares(sd);
+                               break;
+                       }
+               }
+       }
+ #endif
        smp_wmb();
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
                        }
                }
        }
- #endif
+ #endif /* CONFIG_SCHEDSTATS */
  
  out_activate:
  #endif /* CONFIG_SMP */
        success = 1;
  
  out_running:
+       trace_mark(kernel_sched_wakeup,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
  
        p->state = TASK_RUNNING;
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
+       current->se.last_wakeup = current->se.sum_exec_runtime;
        task_rq_unlock(rq, &flags);
  
        return success;
@@@ -2277,8 -2470,11 +2470,11 @@@ void wake_up_new_task(struct task_struc
                 * management (if any):
                 */
                p->sched_class->task_new(rq, p);
-               inc_nr_running(p, rq);
+               inc_nr_running(rq);
        }
+       trace_mark(kernel_sched_wakeup_new,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2331,7 -2527,7 +2527,7 @@@ fire_sched_out_preempt_notifiers(struc
                notifier->ops->sched_out(notifier, next);
  }
  
- #else
+ #else /* !CONFIG_PREEMPT_NOTIFIERS */
  
  static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
@@@ -2343,7 -2539,7 +2539,7 @@@ fire_sched_out_preempt_notifiers(struc
  {
  }
  
- #endif
+ #endif /* CONFIG_PREEMPT_NOTIFIERS */
  
  /**
   * prepare_task_switch - prepare to switch tasks
@@@ -2451,6 -2647,11 +2647,11 @@@ context_switch(struct rq *rq, struct ta
        struct mm_struct *mm, *oldmm;
  
        prepare_task_switch(rq, prev, next);
+       trace_mark(kernel_sched_schedule,
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               prev->pid, next->pid, prev->state,
+               rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@@ -2785,7 -2986,7 +2986,7 @@@ balance_tasks(struct rq *this_rq, int t
              enum cpu_idle_type idle, int *all_pinned,
              int *this_best_prio, struct rq_iterator *iterator)
  {
-       int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+       int loops = 0, pulled = 0, pinned = 0;
        struct task_struct *p;
        long rem_load_move = max_load_move;
  
  next:
        if (!p || loops++ > sysctl_sched_nr_migrate)
                goto out;
-       /*
-        * To help distribute high priority tasks across CPUs we don't
-        * skip a task if it will be the highest priority task (i.e. smallest
-        * prio value) on its new queue regardless of its load weight
-        */
-       skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
-                                                        SCHED_LOAD_SCALE_FUZZ;
-       if ((skip_for_load && p->prio >= *this_best_prio) ||
+       if ((p->se.load.weight >> 1) > rem_load_move ||
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
                p = iterator->next(iterator->arg);
                goto next;
@@@ -2863,6 -3058,10 +3058,10 @@@ static int move_tasks(struct rq *this_r
                                max_load_move - total_load_moved,
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
+               if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+                       break;
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -2939,6 -3138,7 +3138,7 @@@ find_busiest_group(struct sched_domain 
        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;
        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
                int __group_imb = 0;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
+               unsigned long sum_avg_load_per_task;
+               unsigned long avg_load_per_task;
  
                local_group = cpu_isset(this_cpu, group->cpumask);
  
  
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
+               sum_avg_load_per_task = avg_load_per_task = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
  
 -              for_each_cpu_mask(i, group->cpumask) {
 +              for_each_cpu_mask_nr(i, group->cpumask) {
                        struct rq *rq;
  
                        if (!cpu_isset(i, *cpus))
                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);
+                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
                }
  
                /*
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);
  
-               if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+               /*
+                * Consider the group unbalanced when the imbalance is larger
+                * than the average weight of two tasks.
+                *
+                * APZ: with cgroup the avg task weight can vary wildly and
+                *      might not be a suitable number - should we keep a
+                *      normalized nr_running number somewhere that negates
+                *      the hierarchy?
+                */
+               avg_load_per_task = sg_div_cpu_power(group,
+                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
+               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                        __group_imb = 1;
  
                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@@ -3156,9 -3375,9 +3375,9 @@@ small_imbalance
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
-                       this_load_per_task = SCHED_LOAD_SCALE;
+                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
  
-               if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+               if (max_load - this_load + 2*busiest_load_per_task >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;
@@@ -3228,7 -3447,7 +3447,7 @@@ find_busiest_queue(struct sched_group *
        unsigned long max_load = 0;
        int i;
  
 -      for_each_cpu_mask(i, group->cpumask) {
 +      for_each_cpu_mask_nr(i, group->cpumask) {
                unsigned long wl;
  
                if (!cpu_isset(i, *cpus))
@@@ -3284,6 -3503,7 +3503,7 @@@ static int load_balance(int this_cpu, s
        schedstat_inc(sd, lb_count[idle]);
  
  redo:
+       update_shares(sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   cpus, balance);
  
  
        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               return -1;
-       return ld_moved;
+               ld_moved = -1;
+       goto out;
  
  out_balanced:
        schedstat_inc(sd, lb_balanced[idle]);
@@@ -3402,8 -3623,13 +3623,13 @@@ out_one_pinned
  
        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               return -1;
-       return 0;
+               ld_moved = -1;
+       else
+               ld_moved = 0;
+ out:
+       if (ld_moved)
+               update_shares(sd);
+       return ld_moved;
  }
  
  /*
@@@ -3438,6 -3664,7 +3664,7 @@@ load_balance_newidle(int this_cpu, stru
  
        schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
  redo:
+       update_shares_locked(this_rq, sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
                                   &sd_idle, cpus, NULL);
        if (!group) {
        } else
                sd->nr_balance_failed = 0;
  
+       update_shares_locked(this_rq, sd);
        return ld_moved;
  
  out_balanced:
@@@ -3672,6 -3900,7 +3900,7 @@@ static void rebalance_domains(int cpu, 
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
+       int need_serialize;
        cpumask_t tmp;
  
        for_each_domain(cpu, sd) {
                if (interval > HZ*NR_CPUS/10)
                        interval = HZ*NR_CPUS/10;
  
+               need_serialize = sd->flags & SD_SERIALIZE;
  
-               if (sd->flags & SD_SERIALIZE) {
+               if (need_serialize) {
                        if (!spin_trylock(&balancing))
                                goto out;
                }
                        }
                        sd->last_balance = jiffies;
                }
-               if (sd->flags & SD_SERIALIZE)
+               if (need_serialize)
                        spin_unlock(&balancing);
  out:
                if (time_after(next_balance, sd->last_balance + interval)) {
@@@ -3759,7 -3989,7 +3989,7 @@@ static void run_rebalance_domains(struc
                int balance_cpu;
  
                cpu_clear(this_cpu, cpus);
 -              for_each_cpu_mask(balance_cpu, cpus) {
 +              for_each_cpu_mask_nr(balance_cpu, cpus) {
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@@ -4021,26 -4251,44 +4251,44 @@@ void scheduler_tick(void
  #endif
  }
  
- #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+ static inline unsigned long get_parent_ip(unsigned long addr)
+ {
+       if (in_lock_functions(addr)) {
+               addr = CALLER_ADDR2;
+               if (in_lock_functions(addr))
+                       addr = CALLER_ADDR3;
+       }
+       return addr;
+ }
  
  void __kprobes add_preempt_count(int val)
  {
+ #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
+ #endif
        preempt_count() += val;
+ #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
         */
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
+ #endif
+       if (preempt_count() == val)
+               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  }
  EXPORT_SYMBOL(add_preempt_count);
  
  void __kprobes sub_preempt_count(int val)
  {
+ #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                        !(preempt_count() & PREEMPT_MASK)))
                return;
+ #endif
  
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
        preempt_count() -= val;
  }
  EXPORT_SYMBOL(sub_preempt_count);
@@@ -4070,6 -4321,7 +4321,7 @@@ static noinline void __schedule_bug(str
                prev->comm, prev->pid, preempt_count());
  
        debug_show_held_locks(prev);
+       print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
  
@@@ -4143,7 -4395,7 +4395,7 @@@ asmlinkage void __sched schedule(void
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
-       int cpu;
+       int cpu, hrtick = sched_feat(HRTICK);
  
  need_resched:
        preempt_disable();
@@@ -4158,7 -4410,8 +4410,8 @@@ need_resched_nonpreemptible
  
        schedule_debug(prev);
  
-       hrtick_clear(rq);
+       if (hrtick)
+               hrtick_clear(rq);
  
        /*
         * Do the rq-clock update outside the rq lock:
        } else
                spin_unlock_irq(&rq->lock);
  
-       hrtick_set(rq);
+       if (hrtick)
+               hrtick_set(rq);
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
@@@ -4586,10 -4840,8 +4840,8 @@@ void set_user_nice(struct task_struct *
                goto out_unlock;
        }
        on_rq = p->se.on_rq;
-       if (on_rq) {
+       if (on_rq)
                dequeue_task(rq, p, 0);
-               dec_load(rq, p);
-       }
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
  
        if (on_rq) {
                enqueue_task(rq, p, 0);
-               inc_load(rq, p);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -4744,16 -4995,8 +4995,8 @@@ __setscheduler(struct rq *rq, struct ta
        set_load_weight(p);
  }
  
- /**
-  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
-  * @p: the task in question.
-  * @policy: new policy.
-  * @param: structure containing the new RT priority.
-  *
-  * NOTE that the task may be already dead.
-  */
- int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+ static int __sched_setscheduler(struct task_struct *p, int policy,
+                               struct sched_param *param, bool user)
  {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
@@@ -4785,7 -5028,7 +5028,7 @@@ recheck
        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
-       if (!capable(CAP_SYS_NICE)) {
+       if (user && !capable(CAP_SYS_NICE)) {
                if (rt_policy(policy)) {
                        unsigned long rlim_rtprio;
  
         * Do not allow realtime tasks into groups that have no runtime
         * assigned.
         */
-       if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+       if (user
+           && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
                return -EPERM;
  #endif
  
  
        return 0;
  }
+ /**
+  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+  * @p: the task in question.
+  * @policy: new policy.
+  * @param: structure containing the new RT priority.
+  *
+  * NOTE that the task may be already dead.
+  */
+ int sched_setscheduler(struct task_struct *p, int policy,
+                      struct sched_param *param)
+ {
+       return __sched_setscheduler(p, policy, param, true);
+ }
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
+ /**
+  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+  * @p: the task in question.
+  * @policy: new policy.
+  * @param: structure containing the new RT priority.
+  *
+  * Just like sched_setscheduler, only don't bother checking if the
+  * current context has permission.  For example, this is needed in
+  * stop_machine(): we create temporary high priority worker threads,
+  * but our caller might not have that capability.
+  */
+ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+                              struct sched_param *param)
+ {
+       return __sched_setscheduler(p, policy, param, false);
+ }
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  {
@@@ -5070,24 -5345,6 +5345,6 @@@ asmlinkage long sys_sched_setaffinity(p
        return sched_setaffinity(pid, &new_mask);
  }
  
- /*
-  * Represents all cpu's present in the system
-  * In systems capable of hotplug, this map could dynamically grow
-  * as new cpu's are detected in the system via any platform specific
-  * method, such as ACPI for e.g.
-  */
- cpumask_t cpu_present_map __read_mostly;
- EXPORT_SYMBOL(cpu_present_map);
- #ifndef CONFIG_SMP
- cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
- EXPORT_SYMBOL(cpu_online_map);
- cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
- EXPORT_SYMBOL(cpu_possible_map);
- #endif
  long sched_getaffinity(pid_t pid, cpumask_t *mask)
  {
        struct task_struct *p;
@@@ -5384,7 -5641,7 +5641,7 @@@ out_unlock
        return retval;
  }
  
- static const char stat_nam[] = "RSDTtZX";
+ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
  
  void sched_show_task(struct task_struct *p)
  {
@@@ -5571,6 -5828,12 +5828,12 @@@ int set_cpus_allowed_ptr(struct task_st
                goto out;
        }
  
+       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+               ret = -EINVAL;
+               goto out;
+       }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
@@@ -5622,10 -5885,10 +5885,10 @@@ static int __migrate_task(struct task_s
        double_rq_lock(rq_src, rq_dest);
        /* Already moved. */
        if (task_cpu(p) != src_cpu)
-               goto out;
+               goto done;
        /* Affinity changed (again). */
        if (!cpu_isset(dest_cpu, p->cpus_allowed))
-               goto out;
+               goto fail;
  
        on_rq = p->se.on_rq;
        if (on_rq)
                activate_task(rq_dest, p, 0);
                check_preempt_curr(rq_dest, p);
        }
+ done:
        ret = 1;
out:
fail:
        double_rq_unlock(rq_src, rq_dest);
        return ret;
  }
@@@ -6059,6 -6323,36 +6323,36 @@@ static void unregister_sched_domain_sys
  }
  #endif
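+ /*
+  * set_rq_online()/set_rq_offline() below keep rq->rd->online in sync with
+  * the runqueue's state and give every scheduling class a chance to react
+  * via its rq_online/rq_offline callbacks.
+  */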
  
+ static void set_rq_online(struct rq *rq)
+ {
+       if (!rq->online) {
+               const struct sched_class *class;
+               cpu_set(rq->cpu, rq->rd->online);
+               rq->online = 1;
+               for_each_class(class) {
+                       if (class->rq_online)
+                               class->rq_online(rq);
+               }
+       }
+ }
+ static void set_rq_offline(struct rq *rq)
+ {
+       if (rq->online) {
+               const struct sched_class *class;
+               for_each_class(class) {
+                       if (class->rq_offline)
+                               class->rq_offline(rq);
+               }
+               cpu_clear(rq->cpu, rq->rd->online);
+               rq->online = 0;
+       }
+ }
  /*
   * migration_call - callback that gets triggered when a CPU is added.
   * Here we can start up the necessary migration thread for the new CPU.
@@@ -6096,7 -6390,8 +6390,8 @@@ migration_call(struct notifier_block *n
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpu_isset(cpu, rq->rd->span));
-                       cpu_set(cpu, rq->rd->online);
+                       set_rq_online(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
                break;
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpu_isset(cpu, rq->rd->span));
-                       cpu_clear(cpu, rq->rd->online);
+                       set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
                break;
@@@ -6191,6 -6486,28 +6486,28 @@@ void __init migration_init(void
  
  #ifdef CONFIG_SCHED_DEBUG
  
+ static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+ {
+       switch (lvl) {
+       case SD_LV_NONE:
+                       return "NONE";
+       case SD_LV_SIBLING:
+                       return "SIBLING";
+       case SD_LV_MC:
+                       return "MC";
+       case SD_LV_CPU:
+                       return "CPU";
+       case SD_LV_NODE:
+                       return "NODE";
+       case SD_LV_ALLNODES:
+                       return "ALLNODES";
+       case SD_LV_MAX:
+                       return "MAX";
+       }
+       return "MAX";
+ }
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
  {
                return -1;
        }
  
-       printk(KERN_CONT "span %s\n", str);
+       printk(KERN_CONT "span %s level %s\n",
+               str, sd_level_to_string(sd->level));
  
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@@ -6294,9 -6612,9 +6612,9 @@@ static void sched_domain_debug(struct s
        }
        kfree(groupmask);
  }
- #else
+ #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
- #endif
+ #endif /* CONFIG_SCHED_DEBUG */
  
  static int sd_degenerate(struct sched_domain *sd)
  {
@@@ -6356,20 -6674,16 +6674,16 @@@ sd_parent_degenerate(struct sched_domai
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
        unsigned long flags;
-       const struct sched_class *class;
  
        spin_lock_irqsave(&rq->lock, flags);
  
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
  
-               for (class = sched_class_highest; class; class = class->next) {
-                       if (class->leave_domain)
-                               class->leave_domain(rq);
-               }
+               if (cpu_isset(rq->cpu, old_rd->online))
+                       set_rq_offline(rq);
  
                cpu_clear(rq->cpu, old_rd->span);
-               cpu_clear(rq->cpu, old_rd->online);
  
                if (atomic_dec_and_test(&old_rd->refcount))
                        kfree(old_rd);
  
        cpu_set(rq->cpu, rd->span);
        if (cpu_isset(rq->cpu, cpu_online_map))
-               cpu_set(rq->cpu, rd->online);
-       for (class = sched_class_highest; class; class = class->next) {
-               if (class->join_domain)
-                       class->join_domain(rq);
-       }
+               set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
  }
@@@ -6396,6 -6705,8 +6705,8 @@@ static void init_rootdomain(struct root
  
        cpus_clear(rd->span);
        cpus_clear(rd->online);
+       cpupri_init(&rd->cpupri);
  }
  
  static void init_defrootdomain(void)
@@@ -6491,7 -6802,7 +6802,7 @@@ init_sched_build_groups(const cpumask_
  
        cpus_clear(*covered);
  
 -      for_each_cpu_mask(i, *span) {
 +      for_each_cpu_mask_nr(i, *span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
                cpus_clear(sg->cpumask);
                sg->__cpu_power = 0;
  
 -              for_each_cpu_mask(j, *span) {
 +              for_each_cpu_mask_nr(j, *span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
  
@@@ -6590,7 -6901,7 +6901,7 @@@ static void sched_domain_node_span(int 
                cpus_or(*span, *span, *nodemask);
        }
  }
- #endif
+ #endif /* CONFIG_NUMA */
  
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
@@@ -6609,7 -6920,7 +6920,7 @@@ cpu_to_cpu_group(int cpu, const cpumask
                *sg = &per_cpu(sched_group_cpus, cpu);
        return cpu;
  }
- #endif
+ #endif /* CONFIG_SCHED_SMT */
  
  /*
   * multi-core sched-domains:
  #ifdef CONFIG_SCHED_MC
  static DEFINE_PER_CPU(struct sched_domain, core_domains);
  static DEFINE_PER_CPU(struct sched_group, sched_group_core);
- #endif
+ #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
@@@ -6702,7 -7013,7 +7013,7 @@@ static void init_numa_sched_groups_powe
        if (!sg)
                return;
        do {
 -              for_each_cpu_mask(j, sg->cpumask) {
 +              for_each_cpu_mask_nr(j, sg->cpumask) {
                        struct sched_domain *sd;
  
                        sd = &per_cpu(phys_domains, j);
                sg = sg->next;
        } while (sg != group_head);
  }
- #endif
+ #endif /* CONFIG_NUMA */
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
@@@ -6727,7 -7038,7 +7038,7 @@@ static void free_sched_groups(const cpu
  {
        int cpu, i;
  
 -      for_each_cpu_mask(cpu, *cpu_map) {
 +      for_each_cpu_mask_nr(cpu, *cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
  
@@@ -6756,11 -7067,11 +7067,11 @@@ next_sg
                sched_group_nodes_bycpu[cpu] = NULL;
        }
  }
- #else
+ #else /* !CONFIG_NUMA */
  static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
  {
  }
- #endif
+ #endif /* CONFIG_NUMA */
  
  /*
   * Initialize sched groups cpu_power.
@@@ -6966,7 -7277,7 +7277,7 @@@ static int __build_sched_domains(const 
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = NULL, *p;
                SCHED_CPUMASK_VAR(nodemask, allmasks);
  
  
  #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
  
  
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_core_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
  
                        goto error;
                }
                sched_group_nodes[i] = sg;
 -              for_each_cpu_mask(j, *nodemask) {
 +              for_each_cpu_mask_nr(j, *nodemask) {
                        struct sched_domain *sd;
  
                        sd = &per_cpu(node_domains, j);
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(cpu_domains, i);
  
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(core_domains, i);
  
                init_sched_groups_power(i, sd);
        }
  #endif
  
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(phys_domains, i);
  
                init_sched_groups_power(i, sd);
  #endif
  
        /* Attach the domains */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
@@@ -7292,7 -7603,7 +7603,7 @@@ static void detach_destroy_domains(cons
  
        unregister_sched_domain_sysctl();
  
 -      for_each_cpu_mask(i, *cpu_map)
 +      for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
        arch_destroy_sched_domains(cpu_map, &tmpmask);
@@@ -7469,7 -7780,7 +7780,7 @@@ int sched_create_sysfs_power_savings_en
  #endif
        return err;
  }
- #endif
+ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
  /*
   * Force a reinitialization of the sched domains hierarchy. The domains
  static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
  {
+       int cpu = (int)(long)hcpu;
        switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
+               disable_runtime(cpu_rq(cpu));
+               /* fall-through */
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
                detach_destroy_domains(&cpu_online_map);
                free_sched_domains();
                return NOTIFY_OK;
  
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
+               enable_runtime(cpu_rq(cpu));
+               /* fall-through */
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                /*
@@@ -7694,8 -8012,8 +8012,8 @@@ void __init sched_init(void
  
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
  
                root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_RT_GROUP_SCHED */
        }
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_USER_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), RUNTIME_INF);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_GROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
        INIT_LIST_HEAD(&root_task_group.children);
        init_task_group.parent = &root_task_group;
        list_add(&init_task_group.siblings, &root_task_group.children);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_GROUP_SCHED */
  
        for_each_possible_cpu(i) {
                struct rq *rq;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
                rq->cpu = i;
+               rq->online = 0;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
  #endif
  
  #ifdef CONFIG_SMP
-       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  #endif
  
  #ifdef CONFIG_RT_MUTEXES
@@@ -8057,7 -8376,7 +8376,7 @@@ static inline void unregister_fair_sche
  {
        list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
  }
- #else
+ #else /* !CONFIG_FAIR_GROUP_SCHED */
  static inline void free_fair_sched_group(struct task_group *tg)
  {
  }
@@@ -8075,7 -8394,7 +8394,7 @@@ static inline void register_fair_sched_
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
  }
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static void free_rt_sched_group(struct task_group *tg)
@@@ -8146,7 -8465,7 +8465,7 @@@ static inline void unregister_rt_sched_
  {
        list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
  }
- #else
+ #else /* !CONFIG_RT_GROUP_SCHED */
  static inline void free_rt_sched_group(struct task_group *tg)
  {
  }
@@@ -8164,7 -8483,7 +8483,7 @@@ static inline void register_rt_sched_gr
  static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
  {
  }
- #endif
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_GROUP_SCHED
  static void free_sched_group(struct task_group *tg)
@@@ -8275,17 -8594,14 +8594,14 @@@ void sched_move_task(struct task_struc
  
        task_rq_unlock(rq, &flags);
  }
- #endif
+ #endif /* CONFIG_GROUP_SCHED */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
- static void set_se_shares(struct sched_entity *se, unsigned long shares)
+ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
  {
        struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
        int on_rq;
  
-       spin_lock_irq(&rq->lock);
        on_rq = se->on_rq;
        if (on_rq)
                dequeue_entity(cfs_rq, se, 0);
  
        if (on_rq)
                enqueue_entity(cfs_rq, se, 0);
+ }
  
-       spin_unlock_irq(&rq->lock);
+ static void set_se_shares(struct sched_entity *se, unsigned long shares)
+ {
+       struct cfs_rq *cfs_rq = se->cfs_rq;
+       struct rq *rq = cfs_rq->rq;
+       unsigned long flags;
+       spin_lock_irqsave(&rq->lock, flags);
+       __set_se_shares(se, shares);
+       spin_unlock_irqrestore(&rq->lock, flags);
  }
  
  static DEFINE_MUTEX(shares_mutex);
@@@ -8335,8 -8660,13 +8660,13 @@@ int sched_group_set_shares(struct task_
         * w/o tripping rebalance_share or load_balance_fair.
         */
        tg->shares = shares;
-       for_each_possible_cpu(i)
+       for_each_possible_cpu(i) {
+               /*
+                * force a rebalance
+                */
+               cfs_rq_set_shares(tg->cfs_rq[i], 0);
                set_se_shares(tg->se[i], shares);
+       }
  
        /*
         * Enable load balance activity on this group, by inserting it back on
@@@ -8375,7 -8705,7 +8705,7 @@@ static unsigned long to_ratio(u64 perio
  #ifdef CONFIG_CGROUP_SCHED
  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
-       struct task_group *tgi, *parent = tg ? tg->parent : NULL;
+       struct task_group *tgi, *parent = tg->parent;
        unsigned long total = 0;
  
        if (!parent) {
        }
        rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) <
+       return total + to_ratio(period, runtime) <=
                to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
                                parent->rt_bandwidth.rt_runtime);
  }
@@@ -8519,16 -8849,21 +8849,21 @@@ long sched_group_rt_period(struct task_
  
  static int sched_rt_global_constraints(void)
  {
+       struct task_group *tg = &root_task_group;
+       u64 rt_runtime, rt_period;
        int ret = 0;
  
+       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       rt_runtime = tg->rt_bandwidth.rt_runtime;
        mutex_lock(&rt_constraints_mutex);
-       if (!__rt_schedulable(NULL, 1, 0))
+       if (!__rt_schedulable(tg, rt_period, rt_runtime))
                ret = -EINVAL;
        mutex_unlock(&rt_constraints_mutex);
  
        return ret;
  }
- #else
+ #else /* !CONFIG_RT_GROUP_SCHED */
  static int sched_rt_global_constraints(void)
  {
        unsigned long flags;
  
        return 0;
  }
- #endif
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  int sched_rt_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
@@@ -8654,7 -8989,7 +8989,7 @@@ static u64 cpu_shares_read_u64(struct c
  
        return (u64) tg->shares;
  }
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@@ -8678,7 -9013,7 +9013,7 @@@ static u64 cpu_rt_period_read_uint(stru
  {
        return sched_group_rt_period(cgroup_tg(cgrp));
  }
- #endif
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  static struct cftype cpu_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
diff --combined kernel/sched_fair.c
@@@ -63,13 -63,13 +63,13 @@@ unsigned int __read_mostly sysctl_sched
  
  /*
   * SCHED_OTHER wake-up granularity.
-  * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+  * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * This option delays the preemption effects of decoupled workloads
   * and reduces their over-scheduling. Synchronous workloads will still
   * have immediate wakeup/sleep latencies.
   */
- unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
  
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  
@@@ -333,6 -333,34 +333,34 @@@ int sched_nr_latency_handler(struct ctl
  }
  #endif
  
+ /*
+  * delta *= w / rw
+  */
+ static inline unsigned long
+ calc_delta_weight(unsigned long delta, struct sched_entity *se)
+ {
+       for_each_sched_entity(se) {
+               delta = calc_delta_mine(delta,
+                               se->load.weight, &cfs_rq_of(se)->load);
+       }
+       return delta;
+ }
+ /*
+  * delta *= rw / w
+  */
+ static inline unsigned long
+ calc_delta_fair(unsigned long delta, struct sched_entity *se)
+ {
+       for_each_sched_entity(se) {
+               delta = calc_delta_mine(delta,
+                               cfs_rq_of(se)->load.weight, &se->load);
+       }
+       return delta;
+ }
  /*
   * The idea is to set a period in which each task runs once.
   *
@@@ -362,47 -390,80 +390,80 @@@ static u64 __sched_period(unsigned lon
   */
  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       u64 slice = __sched_period(cfs_rq->nr_running);
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               slice *= se->load.weight;
-               do_div(slice, cfs_rq->load.weight);
-       }
-       return slice;
+       return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
  }
  
  /*
   * We calculate the vruntime slice of a to be inserted task
   *
-  * vs = s/w = p/rw
+  * vs = s*rw/w = p
   */
  static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        unsigned long nr_running = cfs_rq->nr_running;
-       unsigned long weight;
-       u64 vslice;
  
        if (!se->on_rq)
                nr_running++;
  
-       vslice = __sched_period(nr_running);
+       return __sched_period(nr_running);
+ }
+ /*
+  * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
+  * that it favours >=0 over <0.
+  *
+  *   -20         |
+  *               |
+  *     0 --------+-------
+  *             .'
+  *    19     .'
+  *
+  */
+ static unsigned long
+ calc_delta_asym(unsigned long delta, struct sched_entity *se)
+ {
+       struct load_weight lw = {
+               .weight = NICE_0_LOAD,
+               .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+       };
  
        for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct load_weight *se_lw = &se->load;
+               unsigned long rw = cfs_rq_of(se)->load.weight;
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+               struct cfs_rq *cfs_rq = se->my_q;
+               struct task_group *tg = NULL;
+               if (cfs_rq)
+                       tg = cfs_rq->tg;
+               if (tg && tg->shares < NICE_0_LOAD) {
+                       /*
+                        * scale shares to what it would have been had
+                        * tg->weight been NICE_0_LOAD:
+                        *
+                        *   weight = 1024 * shares / tg->weight
+                        */
+                       lw.weight *= se->load.weight;
+                       lw.weight /= tg->shares;
+                       lw.inv_weight = 0;
+                       se_lw = &lw;
+                       rw += lw.weight - se->load.weight;
+               } else
+ #endif
  
-               weight = cfs_rq->load.weight;
-               if (!se->on_rq)
-                       weight += se->load.weight;
+               if (se->load.weight < NICE_0_LOAD) {
+                       se_lw = &lw;
+                       rw += NICE_0_LOAD - se->load.weight;
+               }
  
-               vslice *= NICE_0_LOAD;
-               do_div(vslice, weight);
+               delta = calc_delta_mine(delta, rw, se_lw);
        }
  
-       return vslice;
+       return delta;
  }
  
  /*
@@@ -419,11 -480,7 +480,7 @@@ __update_curr(struct cfs_rq *cfs_rq, st
  
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = delta_exec;
-       if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-               delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-                                                       &curr->load);
-       }
+       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
        curr->vruntime += delta_exec_weighted;
  }
  
@@@ -510,10 -567,27 +567,27 @@@ update_stats_curr_start(struct cfs_rq *
   * Scheduling class queueing methods:
   */
  
+ #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ static void
+ add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+ {
+       cfs_rq->task_weight += weight;
+ }
+ #else
+ static inline void
+ add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+ {
+ }
+ #endif
  static void
  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        update_load_add(&cfs_rq->load, se->load.weight);
+       if (!parent_entity(se))
+               inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+       if (entity_is_task(se))
+               add_cfs_task_weight(cfs_rq, se->load.weight);
        cfs_rq->nr_running++;
        se->on_rq = 1;
        list_add(&se->group_node, &cfs_rq->tasks);
@@@ -523,6 -597,10 +597,10 @@@ static voi
  account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        update_load_sub(&cfs_rq->load, se->load.weight);
+       if (!parent_entity(se))
+               dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+       if (entity_is_task(se))
+               add_cfs_task_weight(cfs_rq, -se->load.weight);
        cfs_rq->nr_running--;
        se->on_rq = 0;
        list_del_init(&se->group_node);
@@@ -609,8 -687,17 +687,17 @@@ place_entity(struct cfs_rq *cfs_rq, str
  
        if (!initial) {
                /* sleeps up to a single latency don't count. */
-               if (sched_feat(NEW_FAIR_SLEEPERS))
-                       vruntime -= sysctl_sched_latency;
+               if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                       unsigned long thresh = sysctl_sched_latency;
+                       /*
+                        * convert the sleeper threshold into virtual time
+                        */
+                       if (sched_feat(NORMALIZED_SLEEPER))
+                               thresh = calc_delta_fair(thresh, se);
+                       vruntime -= thresh;
+               }
  
                /* ensure we never gain time by being placed backwards. */
                vruntime = max_vruntime(se->vruntime, vruntime);
@@@ -639,21 -726,6 +726,6 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
                __enqueue_entity(cfs_rq, se);
  }
  
- static void update_avg(u64 *avg, u64 sample)
- {
-       s64 diff = sample - *avg;
-       *avg += diff >> 3;
- }
- static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
-       if (!se->last_wakeup)
-               return;
-       update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-       se->last_wakeup = 0;
- }
  static void
  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
  {
  
        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
-               update_avg_stats(cfs_rq, se);
  #ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);
@@@ -726,17 -797,16 +797,16 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
  
- static int
- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  static struct sched_entity *
  pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (!cfs_rq->next)
-               return se;
+       struct rq *rq = rq_of(cfs_rq);
+       u64 pair_slice = rq->clock - cfs_rq->pair_start;
  
-       if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
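+       /*
+        * cfs_rq->next is the entity picked at wakeup time; keep running it,
+        * but only until it has had a full slice since pair_start, then reset
+        * the window and fall back to the regular leftmost pick.
+        */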
+       if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+               cfs_rq->pair_start = rq->clock;
                return se;
+       }
  
        return cfs_rq->next;
  }
@@@ -835,7 -905,7 +905,7 @@@ static void hrtick_start_fair(struct r
                hrtick_start(rq, delta, requeue);
        }
  }
- #else
+ #else /* !CONFIG_SCHED_HRTICK */
  static inline void
  hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
@@@ -961,7 -1031,7 +1031,7 @@@ static int wake_idle(int cpu, struct ta
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
                        cpus_and(tmp, sd->span, p->cpus_allowed);
 -                      for_each_cpu_mask(i, tmp) {
 +                      for_each_cpu_mask_nr(i, tmp) {
                                if (idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
        }
        return cpu;
  }
- #else
+ #else /* !ARCH_HAS_SCHED_WAKE_IDLE */
  static inline int wake_idle(int cpu, struct task_struct *p)
  {
        return cpu;
  
  static const struct sched_class fair_sched_class;
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+  * effective_load() calculates the load change as seen from the root_task_group
+  *
+  * Adding load to a group doesn't make a group heavier, but can cause movement
+  * of group shares between cpus. Assuming the shares were perfectly aligned one
+  * can calculate the shift in shares.
+  *
+  * The problem is that perfectly aligning the shares is rather expensive, hence
+  * we try to avoid doing that too often - see update_shares(), which ratelimits
+  * this change.
+  *
+  * We compensate this by not only taking the current delta into account, but
+  * also considering the delta between when the shares were last adjusted and
+  * now.
+  *
+  * We still saw a performance dip; tracing showed that when balancing
+  * between cgroup:/ and cgroup:/foo the number of affine wakeups increased
+  * significantly. Therefore try to bias the error in the direction of
+  * failing the affine wakeup.
+  *
+  */
+ static long effective_load(struct task_group *tg, int cpu,
+               long wl, long wg)
+ {
+       struct sched_entity *se = tg->se[cpu];
+       long more_w;
+       if (!tg->parent)
+               return wl;
+       /*
+        * By not taking the decrease of shares on the other cpu into
+        * account, our error leans towards reducing the affine wakeups.
+        */
+       if (!wl && sched_feat(ASYM_EFF_LOAD))
+               return wl;
+       /*
+        * Instead of using this increment, also add the difference
+        * between when the shares were last updated and now.
+        */
+       more_w = se->my_q->load.weight - se->my_q->rq_weight;
+       wl += more_w;
+       wg += more_w;
+       for_each_sched_entity(se) {
+ #define D(n) (likely(n) ? (n) : 1)
+               long S, rw, s, a, b;
+               S = se->my_q->tg->shares;
+               s = se->my_q->shares;
+               rw = se->my_q->rq_weight;
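+               /*
+                * With s ~= S * rw / <group rq weight sum>, adding wl here
+                * and wg group-wide gives the new share s' = s*a/b, so the
+                * weight change seen one level up is s' - s = s*(a - b)/b.
+                */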
+               a = S*(rw + wl);
+               b = S*rw + s*wg;
+               wl = s*(a-b)/D(b);
+               /*
+                * Assume the group is already running and will
+                * thus already be accounted for in the weight.
+                *
+                * That is, moving shares between CPUs, does not
+                * alter the group weight.
+                */
+               wg = 0;
+ #undef D
+       }
+       return wl;
+ }
+ #else
+ static inline unsigned long effective_load(struct task_group *tg, int cpu,
+               unsigned long wl, unsigned long wg)
+ {
+       return wl;
+ }
+ #endif
  static int
  wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
            struct task_struct *p, int prev_cpu, int this_cpu, int sync,
            unsigned int imbalance)
  {
        struct task_struct *curr = this_rq->curr;
+       struct task_group *tg;
        unsigned long tl = this_load;
        unsigned long tl_per_task;
+       unsigned long weight;
        int balanced;
  
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
         * effect of the currently running task from the load
         * of the current CPU:
         */
-       if (sync)
-               tl -= current->se.load.weight;
+       if (sync) {
+               tg = task_group(current);
+               weight = current->se.load.weight;
+               tl += effective_load(tg, this_cpu, -weight, -weight);
+               load += effective_load(tg, prev_cpu, 0, -weight);
+       }
  
-       balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
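+       /*
+        * Consider this cpu balanced when its load with p added stays within
+        * the imbalance percentage of prev_cpu's load, with both sides
+        * translated through effective_load() into root-level weight.
+        */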
+       tg = task_group(p);
+       weight = p->se.load.weight;
+       balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+               imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
  
        /*
         * If the currently running task will sleep within
         * a reasonable amount of time then attract this newly
         * woken task:
         */
-       if (sync && balanced && curr->sched_class == &fair_sched_class) {
+       if (sync && balanced) {
                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                               p->se.avg_overlap < sysctl_sched_migration_cost)
+                   p->se.avg_overlap < sysctl_sched_migration_cost)
                        return 1;
        }
  
@@@ -1111,11 -1275,13 +1275,13 @@@ static unsigned long wakeup_gran(struc
        unsigned long gran = sysctl_sched_wakeup_granularity;
  
        /*
-        * More easily preempt - nice tasks, while not making
-        * it harder for + nice tasks.
+        * More easily preempt - nice tasks, while not making it harder for
+        * + nice tasks.
         */
-       if (unlikely(se->load.weight > NICE_0_LOAD))
-               gran = calc_delta_fair(gran, &se->load);
+       if (sched_feat(ASYM_GRAN))
+               gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+       else
+               gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
  
        return gran;
  }
@@@ -1177,7 -1343,6 +1343,6 @@@ static void check_preempt_wakeup(struc
                return;
        }
  
-       se->last_wakeup = se->sum_exec_runtime;
        if (unlikely(se == pse))
                return;
  
@@@ -1275,23 -1440,18 +1440,18 @@@ __load_balance_iterator(struct cfs_rq *
        struct task_struct *p = NULL;
        struct sched_entity *se;
  
-       if (next == &cfs_rq->tasks)
-               return NULL;
-       /* Skip over entities that are not tasks */
-       do {
+       while (next != &cfs_rq->tasks) {
                se = list_entry(next, struct sched_entity, group_node);
                next = next->next;
-       } while (next != &cfs_rq->tasks && !entity_is_task(se));
  
-       if (next == &cfs_rq->tasks)
-               return NULL;
+               /* Skip over entities that are not tasks */
+               if (entity_is_task(se)) {
+                       p = task_of(se);
+                       break;
+               }
+       }
  
        cfs_rq->balance_iterator = next;
-       if (entity_is_task(se))
-               p = task_of(se);
        return p;
  }
  
@@@ -1309,75 -1469,82 +1469,82 @@@ static struct task_struct *load_balance
        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
  }
  
- #ifdef CONFIG_FAIR_GROUP_SCHED
- static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+ static unsigned long
+ __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+               unsigned long max_load_move, struct sched_domain *sd,
+               enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+               struct cfs_rq *cfs_rq)
  {
-       struct sched_entity *curr;
-       struct task_struct *p;
-       if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-               return MAX_PRIO;
-       curr = cfs_rq->curr;
-       if (!curr)
-               curr = __pick_next_entity(cfs_rq);
+       struct rq_iterator cfs_rq_iterator;
  
-       p = task_of(curr);
+       cfs_rq_iterator.start = load_balance_start_fair;
+       cfs_rq_iterator.next = load_balance_next_fair;
+       cfs_rq_iterator.arg = cfs_rq;
  
-       return p->prio;
+       return balance_tasks(this_rq, this_cpu, busiest,
+                       max_load_move, sd, idle, all_pinned,
+                       this_best_prio, &cfs_rq_iterator);
  }
- #endif
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
  {
-       struct cfs_rq *busy_cfs_rq;
        long rem_load_move = max_load_move;
-       struct rq_iterator cfs_rq_iterator;
+       int busiest_cpu = cpu_of(busiest);
+       struct task_group *tg;
  
-       cfs_rq_iterator.start = load_balance_start_fair;
-       cfs_rq_iterator.next = load_balance_next_fair;
-       for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
- #ifdef CONFIG_FAIR_GROUP_SCHED
-               struct cfs_rq *this_cfs_rq;
-               long imbalance;
-               unsigned long maxload;
+       rcu_read_lock();
+       update_h_load(busiest_cpu);
  
-               this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+       list_for_each_entry(tg, &task_groups, list) {
+               struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+               unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+               unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+               u64 rem_load, moved_load;
  
-               imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-               /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-               if (imbalance <= 0)
+               /*
+                * empty group
+                */
+               if (!busiest_cfs_rq->task_weight)
                        continue;
  
-               /* Don't pull more than imbalance/2 */
-               imbalance /= 2;
-               maxload = min(rem_load_move, imbalance);
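+               /*
+                * rem_load_move is expressed in root (hierarchical) weight;
+                * scale it to this group's local weight via h_load before
+                * moving tasks, and scale the moved amount back afterwards.
+                */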
+               rem_load = (u64)rem_load_move * busiest_weight;
+               rem_load = div_u64(rem_load, busiest_h_load + 1);
  
-               *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
- #else
- # define maxload rem_load_move
- #endif
-               /*
-                * pass busy_cfs_rq argument into
-                * load_balance_[start|next]_fair iterators
-                */
-               cfs_rq_iterator.arg = busy_cfs_rq;
-               rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-                                              maxload, sd, idle, all_pinned,
-                                              this_best_prio,
-                                              &cfs_rq_iterator);
+               moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+                               rem_load, sd, idle, all_pinned, this_best_prio,
+                               tg->cfs_rq[busiest_cpu]);
  
-               if (rem_load_move <= 0)
+               if (!moved_load)
+                       continue;
+               moved_load *= busiest_h_load;
+               moved_load = div_u64(moved_load, busiest_weight + 1);
+               rem_load_move -= moved_load;
+               if (rem_load_move < 0)
                        break;
        }
+       rcu_read_unlock();
  
        return max_load_move - rem_load_move;
  }
+ #else
+ static unsigned long
+ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                 unsigned long max_load_move,
+                 struct sched_domain *sd, enum cpu_idle_type idle,
+                 int *all_pinned, int *this_best_prio)
+ {
+       return __load_balance_fair(this_rq, this_cpu, busiest,
+                       max_load_move, sd, idle, all_pinned,
+                       this_best_prio, &busiest->cfs);
+ }
+ #endif
  
  static int
  move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  
        return 0;
  }
- #endif
+ #endif /* CONFIG_SMP */
  
  /*
   * scheduler tick hitting a task of our scheduling class:
diff --combined kernel/sched_rt.c
@@@ -12,6 -12,9 +12,9 @@@ static inline int rt_overloaded(struct 
  
  static inline void rt_set_overload(struct rq *rq)
  {
+       if (!rq->online)
+               return;
        cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
@@@ -26,6 -29,9 +29,9 @@@
  
  static inline void rt_clear_overload(struct rq *rq)
  {
+       if (!rq->online)
+               return;
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
        cpu_clear(rq->cpu, rq->rd->rto_mask);
@@@ -155,7 -161,7 +161,7 @@@ static inline struct rt_bandwidth *sche
        return &rt_rq->tg->rt_bandwidth;
  }
  
- #else
+ #else /* !CONFIG_RT_GROUP_SCHED */
  
  static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  {
@@@ -220,49 -226,10 +226,10 @@@ static inline struct rt_bandwidth *sche
        return &def_rt_bandwidth;
  }
  
- #endif
- static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
- {
-       int i, idle = 1;
-       cpumask_t span;
-       if (rt_b->rt_runtime == RUNTIME_INF)
-               return 1;
-       span = sched_rt_period_mask();
-       for_each_cpu_mask_nr(i, span) {
-               int enqueue = 0;
-               struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
-               struct rq *rq = rq_of_rt_rq(rt_rq);
-               spin_lock(&rq->lock);
-               if (rt_rq->rt_time) {
-                       u64 runtime;
-                       spin_lock(&rt_rq->rt_runtime_lock);
-                       runtime = rt_rq->rt_runtime;
-                       rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
-                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
-                               rt_rq->rt_throttled = 0;
-                               enqueue = 1;
-                       }
-                       if (rt_rq->rt_time || rt_rq->rt_nr_running)
-                               idle = 0;
-                       spin_unlock(&rt_rq->rt_runtime_lock);
-               } else if (rt_rq->rt_nr_running)
-                       idle = 0;
-               if (enqueue)
-                       sched_rt_rq_enqueue(rt_rq);
-               spin_unlock(&rq->lock);
-       }
-       return idle;
- }
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_SMP
- static int balance_runtime(struct rt_rq *rt_rq)
+ static int do_balance_runtime(struct rt_rq *rt_rq)
  {
        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
  
        spin_lock(&rt_b->rt_runtime_lock);
        rt_period = ktime_to_ns(rt_b->rt_period);
 -      for_each_cpu_mask(i, rd->span) {
 +      for_each_cpu_mask_nr(i, rd->span) {
                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                s64 diff;
  
                        continue;
  
                spin_lock(&iter->rt_runtime_lock);
+               if (iter->rt_runtime == RUNTIME_INF)
+                       goto next;
                diff = iter->rt_runtime - iter->rt_time;
                if (diff > 0) {
                        do_div(diff, weight);
                                break;
                        }
                }
+ next:
                spin_unlock(&iter->rt_runtime_lock);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  
        return more;
  }
- #endif
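+ /*
+  * Before a runqueue goes offline, settle the rt runtime it has exchanged
+  * with sibling runqueues, then let it run with RUNTIME_INF so it cannot be
+  * throttled while offline.
+  */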
+ static void __disable_runtime(struct rq *rq)
+ {
+       struct root_domain *rd = rq->rd;
+       struct rt_rq *rt_rq;
+       if (unlikely(!scheduler_running))
+               return;
+       for_each_leaf_rt_rq(rt_rq, rq) {
+               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+               s64 want;
+               int i;
+               spin_lock(&rt_b->rt_runtime_lock);
+               spin_lock(&rt_rq->rt_runtime_lock);
+               if (rt_rq->rt_runtime == RUNTIME_INF ||
+                               rt_rq->rt_runtime == rt_b->rt_runtime)
+                       goto balanced;
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               want = rt_b->rt_runtime - rt_rq->rt_runtime;
+               for_each_cpu_mask(i, rd->span) {
+                       struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+                       s64 diff;
+                       if (iter == rt_rq)
+                               continue;
+                       spin_lock(&iter->rt_runtime_lock);
+                       if (want > 0) {
+                               diff = min_t(s64, iter->rt_runtime, want);
+                               iter->rt_runtime -= diff;
+                               want -= diff;
+                       } else {
+                               iter->rt_runtime -= want;
+                               want -= want;
+                       }
+                       spin_unlock(&iter->rt_runtime_lock);
+                       if (!want)
+                               break;
+               }
+               spin_lock(&rt_rq->rt_runtime_lock);
+               BUG_ON(want);
+ balanced:
+               rt_rq->rt_runtime = RUNTIME_INF;
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               spin_unlock(&rt_b->rt_runtime_lock);
+       }
+ }
+ static void disable_runtime(struct rq *rq)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&rq->lock, flags);
+       __disable_runtime(rq);
+       spin_unlock_irqrestore(&rq->lock, flags);
+ }
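+ /*
+  * On the way back online, restore each rt_rq's default bandwidth and clear
+  * any accumulated rt_time.
+  */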
+ static void __enable_runtime(struct rq *rq)
+ {
+       struct rt_rq *rt_rq;
+       if (unlikely(!scheduler_running))
+               return;
+       for_each_leaf_rt_rq(rt_rq, rq) {
+               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+               spin_lock(&rt_b->rt_runtime_lock);
+               spin_lock(&rt_rq->rt_runtime_lock);
+               rt_rq->rt_runtime = rt_b->rt_runtime;
+               rt_rq->rt_time = 0;
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               spin_unlock(&rt_b->rt_runtime_lock);
+       }
+ }
+ static void enable_runtime(struct rq *rq)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&rq->lock, flags);
+       __enable_runtime(rq);
+       spin_unlock_irqrestore(&rq->lock, flags);
+ }
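+ /*
+  * Only borrow when over the limit; the caller holds rt_rq->rt_runtime_lock,
+  * which is dropped across do_balance_runtime() since that takes
+  * rt_b->rt_runtime_lock first (see __disable_runtime() for the lock order).
+  */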
+ static int balance_runtime(struct rt_rq *rt_rq)
+ {
+       int more = 0;
+       if (rt_rq->rt_time > rt_rq->rt_runtime) {
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               more = do_balance_runtime(rt_rq);
+               spin_lock(&rt_rq->rt_runtime_lock);
+       }
+       return more;
+ }
+ #else /* !CONFIG_SMP */
+ static inline int balance_runtime(struct rt_rq *rt_rq)
+ {
+       return 0;
+ }
+ #endif /* CONFIG_SMP */
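+ /*
+  * Per-period replenishment: pay down each rt_rq's accumulated rt_time
+  * (borrowing from siblings first if it is throttled) and re-enqueue it
+  * once it is back under its runtime.
+  */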
+ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+ {
+       int i, idle = 1;
+       cpumask_t span;
+       if (rt_b->rt_runtime == RUNTIME_INF)
+               return 1;
+       span = sched_rt_period_mask();
+       for_each_cpu_mask(i, span) {
+               int enqueue = 0;
+               struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+               struct rq *rq = rq_of_rt_rq(rt_rq);
+               spin_lock(&rq->lock);
+               if (rt_rq->rt_time) {
+                       u64 runtime;
+                       spin_lock(&rt_rq->rt_runtime_lock);
+                       if (rt_rq->rt_throttled)
+                               balance_runtime(rt_rq);
+                       runtime = rt_rq->rt_runtime;
+                       rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+                               rt_rq->rt_throttled = 0;
+                               enqueue = 1;
+                       }
+                       if (rt_rq->rt_time || rt_rq->rt_nr_running)
+                               idle = 0;
+                       spin_unlock(&rt_rq->rt_runtime_lock);
+               } else if (rt_rq->rt_nr_running)
+                       idle = 0;
+               if (enqueue)
+                       sched_rt_rq_enqueue(rt_rq);
+               spin_unlock(&rq->lock);
+       }
+       return idle;
+ }
  
  static inline int rt_se_prio(struct sched_rt_entity *rt_se)
  {
@@@ -327,18 -447,10 +447,10 @@@ static int sched_rt_runtime_exceeded(st
        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
                return 0;
  
- #ifdef CONFIG_SMP
-       if (rt_rq->rt_time > runtime) {
-               int more;
-               spin_unlock(&rt_rq->rt_runtime_lock);
-               more = balance_runtime(rt_rq);
-               spin_lock(&rt_rq->rt_runtime_lock);
-               if (more)
-                       runtime = sched_rt_runtime(rt_rq);
-       }
- #endif
+       balance_runtime(rt_rq);
+       runtime = sched_rt_runtime(rt_rq);
+       if (runtime == RUNTIME_INF)
+               return 0;
  
        if (rt_rq->rt_time > runtime) {
                rt_rq->rt_throttled = 1;
@@@ -392,12 -504,21 +504,21 @@@ void inc_rt_tasks(struct sched_rt_entit
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        rt_rq->rt_nr_running++;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+       if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+               struct rq *rq = rq_of_rt_rq(rt_rq);
                rt_rq->highest_prio = rt_se_prio(rt_se);
+ #ifdef CONFIG_SMP
+               if (rq->online)
+                       cpupri_set(&rq->rd->cpupri, rq->cpu,
+                                  rt_se_prio(rt_se));
+ #endif
+       }
  #endif
  #ifdef CONFIG_SMP
        if (rt_se->nr_cpus_allowed > 1) {
                struct rq *rq = rq_of_rt_rq(rt_rq);
                rq->rt.rt_nr_migratory++;
        }
  
  static inline
  void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
+ #ifdef CONFIG_SMP
+       int highest_prio = rt_rq->highest_prio;
+ #endif
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running--;
                rq->rt.rt_nr_migratory--;
        }
  
+       if (rt_rq->highest_prio != highest_prio) {
+               struct rq *rq = rq_of_rt_rq(rt_rq);
+               if (rq->online)
+                       cpupri_set(&rq->rd->cpupri, rq->cpu,
+                                  rt_rq->highest_prio);
+       }
        update_rt_migration(rq_of_rt_rq(rt_rq));
  #endif /* CONFIG_SMP */
  #ifdef CONFIG_RT_GROUP_SCHED
@@@ -455,6 -588,7 +588,7 @@@ static void __enqueue_rt_entity(struct 
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
        struct rt_rq *group_rq = group_rt_rq(rt_se);
+       struct list_head *queue = array->queue + rt_se_prio(rt_se);
  
        /*
         * Don't enqueue the group if it's throttled, or when empty.
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
  
-       list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
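+       /*
+        * Queue entities that cannot migrate at the head of their priority
+        * list, ahead of migratable peers which the push logic can move to
+        * another cpu.
+        */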
+       if (rt_se->nr_cpus_allowed == 1)
+               list_add(&rt_se->run_list, queue);
+       else
+               list_add_tail(&rt_se->run_list, queue);
        __set_bit(rt_se_prio(rt_se), array->bitmap);
  
        inc_rt_tasks(rt_se, rt_rq);
@@@ -532,6 -670,8 +670,8 @@@ static void enqueue_task_rt(struct rq *
                rt_se->timeout = 0;
  
        enqueue_rt_entity(rt_se);
+       inc_cpu_load(rq, p->se.load.weight);
  }
  
  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
  
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se);
+       dec_cpu_load(rq, p->se.load.weight);
  }
  
  /*
@@@ -550,10 -692,12 +692,12 @@@ stati
  void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
  {
        struct rt_prio_array *array = &rt_rq->active;
-       struct list_head *queue = array->queue + rt_se_prio(rt_se);
  
-       if (on_rt_rq(rt_se))
-               list_move_tail(&rt_se->run_list, queue);
+       if (on_rt_rq(rt_se)) {
+               list_del_init(&rt_se->run_list);
+               list_add_tail(&rt_se->run_list,
+                             array->queue + rt_se_prio(rt_se));
+       }
  }
  
  static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@@ -616,8 -760,37 +760,37 @@@ static int select_task_rq_rt(struct tas
   */
  static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
  {
-       if (p->prio < rq->curr->prio)
+       if (p->prio < rq->curr->prio) {
                resched_task(rq->curr);
+               return;
+       }
+ #ifdef CONFIG_SMP
+       /*
+        * If:
+        *
+        * - the newly woken task is of equal priority to the current task
+        * - the newly woken task is non-migratable while current is migratable
+        * - current will be preempted on the next reschedule
+        *
+        * we should check to see if current can readily move to a different
+        * cpu.  If so, we will reschedule to allow the push logic to try
+        * to move current somewhere else, making room for our non-migratable
+        * task.
+        */
+       if ((p->prio == rq->curr->prio)
+          && p->rt.nr_cpus_allowed == 1
+          && rq->curr->rt.nr_cpus_allowed != 1) {
+               cpumask_t mask;
+               if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+                       /*
+                        * There appear to be other cpus that can accept
+                        * current, so let's reschedule to try and push it away
+                        */
+                       resched_task(rq->curr);
+       }
+ #endif
  }
  
  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
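
The comment block above spells out the new wake-up rule: besides the usual strict-priority preemption, an equal-priority but pinned waker triggers a reschedule when the currently running task could migrate elsewhere. A self-contained userspace sketch of that decision, with cpupri_find() replaced by a stub and all names below being illustrative rather than kernel code:

/*
 * Userspace sketch of the wakeup-preemption decision: preempt on strictly
 * higher priority, and also reschedule when an equal-priority, pinned task
 * wakes while the running task could move to another CPU.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_task {
	int prio;		/* lower value == higher priority */
	int nr_cpus_allowed;
};

/* Stand-in for cpupri_find(): could some other CPU take 'curr'? */
static bool other_cpu_can_take(const struct fake_task *curr)
{
	return curr->nr_cpus_allowed > 1;	/* simplified assumption */
}

static bool should_resched(const struct fake_task *waking,
			   const struct fake_task *curr)
{
	if (waking->prio < curr->prio)
		return true;
	if (waking->prio == curr->prio &&
	    waking->nr_cpus_allowed == 1 &&
	    curr->nr_cpus_allowed != 1)
		return other_cpu_can_take(curr);
	return false;
}

int main(void)
{
	struct fake_task pinned = { .prio = 50, .nr_cpus_allowed = 1 };
	struct fake_task curr   = { .prio = 50, .nr_cpus_allowed = 4 };

	printf("resched: %s\n", should_resched(&pinned, &curr) ? "yes" : "no");
	return 0;
}
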
@@@ -720,73 -893,6 +893,6 @@@ static struct task_struct *pick_next_hi
  
  static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
  
- static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
- {
-       int       lowest_prio = -1;
-       int       lowest_cpu  = -1;
-       int       count       = 0;
-       int       cpu;
-       cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-       /*
-        * Scan each rq for the lowest prio.
-        */
-       for_each_cpu_mask(cpu, *lowest_mask) {
-               struct rq *rq = cpu_rq(cpu);
-               /* We look for lowest RT prio or non-rt CPU */
-               if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                       /*
-                        * if we already found a low RT queue
-                        * and now we found this non-rt queue
-                        * clear the mask and set our bit.
-                        * Otherwise just return the queue as is
-                        * and the count==1 will cause the algorithm
-                        * to use the first bit found.
-                        */
-                       if (lowest_cpu != -1) {
-                               cpus_clear(*lowest_mask);
-                               cpu_set(rq->cpu, *lowest_mask);
-                       }
-                       return 1;
-               }
-               /* no locking for now */
-               if ((rq->rt.highest_prio > task->prio)
-                   && (rq->rt.highest_prio >= lowest_prio)) {
-                       if (rq->rt.highest_prio > lowest_prio) {
-                               /* new low - clear old data */
-                               lowest_prio = rq->rt.highest_prio;
-                               lowest_cpu = cpu;
-                               count = 0;
-                       }
-                       count++;
-               } else
-                       cpu_clear(cpu, *lowest_mask);
-       }
-       /*
-        * Clear out all the set bits that represent
-        * runqueues that were of higher prio than
-        * the lowest_prio.
-        */
-       if (lowest_cpu > 0) {
-               /*
-                * Perhaps we could add another cpumask op to
-                * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-                * Then that could be optimized to use memset and such.
-                */
-               for_each_cpu_mask(cpu, *lowest_mask) {
-                       if (cpu >= lowest_cpu)
-                               break;
-                       cpu_clear(cpu, *lowest_mask);
-               }
-       }
-       return count;
- }
  static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
  {
        int first;
@@@ -808,17 -914,12 +914,12 @@@ static int find_lowest_rq(struct task_s
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
-       int count    = find_lowest_cpus(task, lowest_mask);
  
-       if (!count)
-               return -1; /* No targets found */
+       if (task->rt.nr_cpus_allowed == 1)
+               return -1; /* No other targets possible */
  
-       /*
-        * There is no sense in performing an optimal search if only one
-        * target is found.
-        */
-       if (count == 1)
-               return first_cpu(*lowest_mask);
+       if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+               return -1; /* No targets found */
  
        /*
         * At this point we have built a mask of cpus representing the
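
find_lowest_rq() now delegates the per-runqueue scan to cpupri_find(), which fills lowest_mask with candidate CPUs. The userspace model below only illustrates the general idea of one CPU mask per priority class, searched from the lowest class upward; the real kernel/sched_cpupri.c implementation may differ in detail, and all names here are made up:

/*
 * Userspace model of a cpupri-style lookup: keep a CPU mask per priority
 * class and return the first class strictly lower than the task's class
 * that intersects the task's allowed CPUs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_PRIO_CLASSES 102	/* e.g. idle, normal, 100 rt levels */

struct model_cpupri {
	uint64_t cpu_mask[NR_PRIO_CLASSES];	/* CPUs running at each class */
};

/* Record that 'cpu' now runs at 'newclass' (real cpupri does this in O(1)). */
static void model_set(struct model_cpupri *cp, int cpu, int newclass)
{
	for (int class = 0; class < NR_PRIO_CLASSES; class++)
		cp->cpu_mask[class] &= ~(1ULL << cpu);	/* drop any old class */
	cp->cpu_mask[newclass] |= 1ULL << cpu;
}

/* Find CPUs in a class lower than 'task_class' that intersect 'allowed'. */
static bool model_find(const struct model_cpupri *cp, int task_class,
		       uint64_t allowed, uint64_t *lowest_mask)
{
	for (int class = 0; class < task_class; class++) {
		uint64_t m = cp->cpu_mask[class] & allowed;
		if (m) {
			*lowest_mask = m;
			return true;
		}
	}
	return false;
}

int main(void)
{
	struct model_cpupri cp = { { 0 } };
	uint64_t mask;

	model_set(&cp, 0, 10);	/* cpu0 busy at class 10 */
	model_set(&cp, 1, 3);	/* cpu1 busy at class 3  */

	if (model_find(&cp, 7, 0x3, &mask))	/* class-7 task, allowed on cpu0+1 */
		printf("candidate mask: %#llx\n", (unsigned long long)mask);
	return 0;
}
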
@@@ -1006,7 -1107,7 +1107,7 @@@ static int pull_rt_task(struct rq *this
  
        next = pick_next_task_rt(this_rq);
  
 -      for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
 +      for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
  
@@@ -1163,17 -1264,25 +1264,25 @@@ static void set_cpus_allowed_rt(struct 
  }
  
  /* Assumes rq->lock is held */
- static void join_domain_rt(struct rq *rq)
+ static void rq_online_rt(struct rq *rq)
  {
        if (rq->rt.overloaded)
                rt_set_overload(rq);
+       __enable_runtime(rq);
+       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
  }
  
  /* Assumes rq->lock is held */
- static void leave_domain_rt(struct rq *rq)
+ static void rq_offline_rt(struct rq *rq)
  {
        if (rq->rt.overloaded)
                rt_clear_overload(rq);
+       __disable_runtime(rq);
+       cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
  }
  
  /*
@@@ -1336,8 -1445,8 +1445,8 @@@ static const struct sched_class rt_sche
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
-       .join_domain            = join_domain_rt,
-       .leave_domain           = leave_domain_rt,
+       .rq_online              = rq_online_rt,
+       .rq_offline             = rq_offline_rt,
        .pre_schedule           = pre_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_wake_up           = task_wake_up_rt,
        .prio_changed           = prio_changed_rt,
        .switched_to            = switched_to_rt,
  };
+ #ifdef CONFIG_SCHED_DEBUG
+ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+ static void print_rt_stats(struct seq_file *m, int cpu)
+ {
+       struct rt_rq *rt_rq;
+       rcu_read_lock();
+       for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+               print_rt_rq(m, cpu, rt_rq);
+       rcu_read_unlock();
+ }
+ #endif /* CONFIG_SCHED_DEBUG */
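
The rq_online_rt()/rq_offline_rt() pair above publishes a runqueue's highest rt priority to cpupri when the CPU joins its root domain and retracts it (CPUPRI_INVALID) when it leaves. A small userspace sketch of that online/offline callback pattern, with illustrative names only and no claim to match the kernel's sched_class layout:

/*
 * Userspace sketch: a per-class ops table is notified when a runqueue
 * joins or leaves the active set, so class-specific state (here just a
 * "published priority") is set up and torn down in one place.
 */
#include <stdio.h>

#define PRIO_INVALID (-1)

struct model_rq {
	int cpu;
	int highest_prio;
	int published_prio;	/* what peer CPUs are allowed to see */
};

struct model_class_ops {
	void (*rq_online)(struct model_rq *rq);
	void (*rq_offline)(struct model_rq *rq);
};

static void rt_online(struct model_rq *rq)
{
	/* publish the current priority so peers may push work here */
	rq->published_prio = rq->highest_prio;
	printf("cpu%d online, prio %d visible\n", rq->cpu, rq->published_prio);
}

static void rt_offline(struct model_rq *rq)
{
	/* retract it so an offline CPU is never selected as a target */
	rq->published_prio = PRIO_INVALID;
	printf("cpu%d offline, hidden from peers\n", rq->cpu);
}

static const struct model_class_ops rt_ops = {
	.rq_online  = rt_online,
	.rq_offline = rt_offline,
};

int main(void)
{
	struct model_rq rq = { .cpu = 2, .highest_prio = 40,
			       .published_prio = PRIO_INVALID };

	rt_ops.rq_online(&rq);
	rt_ops.rq_offline(&rq);
	return 0;
}
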
diff --combined kernel/time/tick-broadcast.c
@@@ -30,6 -30,7 +30,7 @@@
  struct tick_device tick_broadcast_device;
  static cpumask_t tick_broadcast_mask;
  static DEFINE_SPINLOCK(tick_broadcast_lock);
+ static int tick_broadcast_force;
  
  #ifdef CONFIG_TICK_ONESHOT
  static void tick_broadcast_clear_oneshot(int cpu);
@@@ -232,10 -233,11 +233,11 @@@ static void tick_do_broadcast_on_off(vo
                                                     CLOCK_EVT_MODE_SHUTDOWN);
                }
                if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
-                       dev->features |= CLOCK_EVT_FEAT_DUMMY;
+                       tick_broadcast_force = 1;
                break;
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-               if (cpu_isset(cpu, tick_broadcast_mask)) {
+               if (!tick_broadcast_force &&
+                   cpu_isset(cpu, tick_broadcast_mask)) {
                        cpu_clear(cpu, tick_broadcast_mask);
                        if (td->mode == TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
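
The new tick_broadcast_force flag makes a forced broadcast sticky: after a BROADCAST_FORCE request, a plain BROADCAST_OFF no longer removes the CPU from the broadcast mask. A userspace model of that state handling (enum values and names are illustrative, not the kernel's):

/*
 * Userspace model of the "sticky force" handling: once broadcast has been
 * forced on, an ordinary OFF request is ignored.
 */
#include <stdbool.h>
#include <stdio.h>

enum { BCAST_ON, BCAST_OFF, BCAST_FORCE };

static bool in_broadcast_mask;
static bool broadcast_force;

static void broadcast_on_off(int reason)
{
	switch (reason) {
	case BCAST_ON:
	case BCAST_FORCE:
		in_broadcast_mask = true;
		if (reason == BCAST_FORCE)
			broadcast_force = true;	/* remember the force */
		break;
	case BCAST_OFF:
		if (!broadcast_force)		/* forced state is sticky */
			in_broadcast_mask = false;
		break;
	}
}

int main(void)
{
	broadcast_on_off(BCAST_FORCE);
	broadcast_on_off(BCAST_OFF);
	printf("still broadcasting: %s\n", in_broadcast_mask ? "yes" : "no");
	return 0;
}
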
@@@ -266,7 -268,7 +268,7 @@@ void tick_broadcast_on_off(unsigned lon
                       "offline CPU #%d\n", *oncpu);
        else
                smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
-                                        &reason, 1, 1);
+                                        &reason, 1);
  }
  
  /*
@@@ -397,7 -399,8 +399,7 @@@ again
        mask = CPU_MASK_NONE;
        now = ktime_get();
        /* Find all expired events */
 -      for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
 -           cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
 +      for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
                td = &per_cpu(tick_cpu_device, cpu);
                if (td->evtdev->next_event.tv64 <= now.tv64)
                        cpu_set(cpu, mask);
diff --combined net/core/dev.c
@@@ -2261,7 -2261,7 +2261,7 @@@ out
         */
        if (!cpus_empty(net_dma.channel_mask)) {
                int chan_idx;
 -              for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
 +              for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
                        struct dma_chan *chan = net_dma.channels[chan_idx];
                        if (chan)
                                dma_async_memcpy_issue_pending(chan);
@@@ -4322,7 -4322,7 +4322,7 @@@ static void net_dma_rebalance(struct ne
        i = 0;
        cpu = first_cpu(cpu_online_map);
  
 -      for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
 +      for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
                chan = net_dma->channels[chan_idx];
  
                n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
@@@ -4585,8 -4585,8 +4585,8 @@@ static int __init net_dev_init(void
  
        dev_boot_phase = 0;
  
-       open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
-       open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
+       open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+       open_softirq(NET_RX_SOFTIRQ, net_rx_action);
  
        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
diff --combined net/iucv/iucv.c
@@@ -480,7 -480,7 +480,7 @@@ static void iucv_setmask_mp(void
                if (cpu_isset(cpu, iucv_buffer_cpumask) &&
                    !cpu_isset(cpu, iucv_irq_cpumask))
                        smp_call_function_single(cpu, iucv_allow_cpu,
-                                                NULL, 0, 1);
+                                                NULL, 1);
        preempt_enable();
  }
  
@@@ -497,8 -497,8 +497,8 @@@ static void iucv_setmask_up(void
        /* Disable all cpus but the first in iucv_irq_cpumask. */
        cpumask = iucv_irq_cpumask;
        cpu_clear(first_cpu(iucv_irq_cpumask), cpumask);
 -      for_each_cpu_mask(cpu, cpumask)
 +      for_each_cpu_mask_nr(cpu, cpumask)
-               smp_call_function_single(cpu, iucv_block_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_block_cpu, NULL, 1);
  }
  
  /**
@@@ -523,7 -523,7 +523,7 @@@ static int iucv_enable(void
        rc = -EIO;
        preempt_disable();
        for_each_online_cpu(cpu)
-               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
        preempt_enable();
        if (cpus_empty(iucv_buffer_cpumask))
                /* No cpu could declare an iucv buffer. */
@@@ -545,7 -545,7 +545,7 @@@ out
   */
  static void iucv_disable(void)
  {
-       on_each_cpu(iucv_retrieve_cpu, NULL, 0, 1);
+       on_each_cpu(iucv_retrieve_cpu, NULL, 1);
        kfree(iucv_path_table);
  }
  
@@@ -580,7 -580,7 +580,7 @@@ static int __cpuinit iucv_cpu_notify(st
        case CPU_ONLINE_FROZEN:
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
-               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                if (cpus_empty(cpumask))
                        /* Can't offline last IUCV enabled cpu. */
                        return NOTIFY_BAD;
-               smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
                if (cpus_empty(iucv_irq_cpumask))
                        smp_call_function_single(first_cpu(iucv_buffer_cpumask),
-                                                iucv_allow_cpu, NULL, 0, 1);
+                                                iucv_allow_cpu, NULL, 1);
                break;
        }
        return NOTIFY_OK;
@@@ -652,7 -652,7 +652,7 @@@ static void iucv_cleanup_queue(void
         * pending interrupts force them to the work queue by calling
         * an empty function on all cpus.
         */
-       smp_call_function(__iucv_cleanup_queue, NULL, 0, 1);
+       smp_call_function(__iucv_cleanup_queue, NULL, 1);
        spin_lock_irq(&iucv_queue_lock);
        list_for_each_entry_safe(p, n, &iucv_task_queue, list) {
                /* Remove stale work items from the task queue. */
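
The comment above relies on synchronously running an empty function on all CPUs as a flush barrier: because the call only returns once every CPU has executed it, any interrupt work queued before it must already have been handed over. A userspace analogue using per-worker FIFO queues and pthreads (illustrative only, compile with -lpthread); none of these names are from the kernel:

/*
 * Userspace analogue of the flush trick: queue a no-op marker to every
 * worker and wait for it.  Each queue is FIFO, so once the marker has run
 * everywhere, everything queued earlier has been processed.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NR_WORKERS 2
#define QUEUE_LEN  16

struct work { void (*fn)(void *); void *arg; };

struct worker {
	pthread_t thread;
	pthread_mutex_t lock;
	pthread_cond_t cond;
	struct work queue[QUEUE_LEN];
	int head, tail;
};

static struct worker workers[NR_WORKERS];

static void queue_work(struct worker *w, void (*fn)(void *), void *arg)
{
	pthread_mutex_lock(&w->lock);
	w->queue[w->tail++ % QUEUE_LEN] = (struct work){ fn, arg };
	pthread_cond_signal(&w->cond);
	pthread_mutex_unlock(&w->lock);
}

static void *worker_fn(void *arg)
{
	struct worker *w = arg;

	for (;;) {
		struct work item;

		pthread_mutex_lock(&w->lock);
		while (w->head == w->tail)
			pthread_cond_wait(&w->cond, &w->lock);
		item = w->queue[w->head++ % QUEUE_LEN];
		pthread_mutex_unlock(&w->lock);
		item.fn(item.arg);
	}
	return NULL;
}

static void real_work(void *arg)
{
	printf("processed item %ld\n", (long)arg);
}

/* The "empty" function: does nothing except signal that it ran. */
static void flush_marker(void *arg)
{
	sem_post(arg);
}

/* Queue the marker everywhere and wait: all earlier work is now done. */
static void flush_all(void)
{
	sem_t done;
	int i;

	sem_init(&done, 0, 0);
	for (i = 0; i < NR_WORKERS; i++)
		queue_work(&workers[i], flush_marker, &done);
	for (i = 0; i < NR_WORKERS; i++)
		sem_wait(&done);
	sem_destroy(&done);
}

int main(void)
{
	int i;

	for (i = 0; i < NR_WORKERS; i++) {
		pthread_mutex_init(&workers[i].lock, NULL);
		pthread_cond_init(&workers[i].cond, NULL);
		pthread_create(&workers[i].thread, NULL, worker_fn, &workers[i]);
	}
	queue_work(&workers[0], real_work, (void *)1L);
	queue_work(&workers[1], real_work, (void *)2L);
	flush_all();
	printf("queues drained\n");	/* both items were processed first */
	return 0;
}
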
@@@ -1559,16 -1559,11 +1559,11 @@@ static void iucv_external_interrupt(u1
  
        p = iucv_irq_data[smp_processor_id()];
        if (p->ippathid >= iucv_max_pathid) {
-               printk(KERN_WARNING "iucv_do_int: Got interrupt with "
-                      "pathid %d > max_connections (%ld)\n",
-                      p->ippathid, iucv_max_pathid - 1);
+               WARN_ON(p->ippathid >= iucv_max_pathid);
                iucv_sever_pathid(p->ippathid, iucv_error_no_listener);
                return;
        }
-       if (p->iptype  < 0x01 || p->iptype > 0x09) {
-               printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n");
-               return;
-       }
+       BUG_ON(p->iptype  < 0x01 || p->iptype > 0x09);
        work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC);
        if (!work) {
                printk(KERN_WARNING "iucv_external_interrupt: out of memory\n");