Merge branch 'linus' into cpus4096
author    Ingo Molnar <mingo@elte.hu>
Tue, 15 Jul 2008 22:29:07 +0000 (00:29 +0200)
committer Ingo Molnar <mingo@elte.hu>
Tue, 15 Jul 2008 22:29:07 +0000 (00:29 +0200)
Conflicts:

arch/x86/xen/smp.c
kernel/sched_rt.c
net/iucv/iucv.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
15 files changed:
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/io_apic_64.c
arch/x86/kernel/smpboot.c
arch/x86/xen/smp.c
drivers/infiniband/hw/ehca/ehca_irq.c
include/asm-x86/ipi.h
kernel/cpu.c
kernel/rcuclassic.c
kernel/rcupreempt.c
kernel/sched.c
kernel/sched_fair.c
kernel/sched_rt.c
kernel/time/tick-broadcast.c
net/core/dev.c
net/iucv/iucv.c

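The conflicts above come mostly from the cpus4096 work, which converts for_each_cpu_mask() users to for_each_cpu_mask_nr() so that iteration is bounded by nr_cpu_ids (the number of possible CPUs) rather than scanning all NR_CPUS bit positions. The toy program below is an illustration only, not the kernel's cpumask implementation; the identifiers toy_for_each_cpu_mask and toy_for_each_cpu_mask_nr are made up for this sketch.

/*
 * Toy illustration only -- not the kernel's cpumask code.  A plain
 * byte array stands in for cpumask_t, and the two macros mimic the
 * shape of for_each_cpu_mask() vs. for_each_cpu_mask_nr().
 */
#include <stdio.h>

#define NR_CPUS 4096                 /* compile-time maximum */
static int nr_cpu_ids = 8;           /* CPUs actually possible on this box */
static unsigned char cpu_mask[NR_CPUS];

/* old style: always walks all NR_CPUS positions */
#define toy_for_each_cpu_mask(cpu, mask) \
	for ((cpu) = 0; (cpu) < NR_CPUS; (cpu)++) if ((mask)[(cpu)])

/* new style: bounded by nr_cpu_ids, cheap even when NR_CPUS is 4096 */
#define toy_for_each_cpu_mask_nr(cpu, mask) \
	for ((cpu) = 0; (cpu) < nr_cpu_ids; (cpu)++) if ((mask)[(cpu)])

int main(void)
{
	int cpu;

	cpu_mask[1] = cpu_mask[3] = 1;

	/* Both loops visit the same set CPUs (1 and 3); only the
	 * number of bit positions examined differs. */
	toy_for_each_cpu_mask(cpu, cpu_mask)
		printf("unbounded iterator visits cpu %d\n", cpu);

	toy_for_each_cpu_mask_nr(cpu, cpu_mask)
		printf("nr-bounded iterator visits cpu %d\n", cpu);

	return 0;
}

In the hunks below the conversion is mechanical: for_each_cpu_mask(x, mask) becomes for_each_cpu_mask_nr(x, mask), and open-coded NR_CPUS bounds (such as the next_cpu() check in ehca_irq.c) become nr_cpu_ids checks.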
@@@ -62,6 -62,7 +62,7 @@@ static struct _cache_table cache_table[
        { 0x4b, LVL_3,      8192 },     /* 16-way set assoc, 64 byte line size */
        { 0x4c, LVL_3,     12288 },     /* 12-way set assoc, 64 byte line size */
        { 0x4d, LVL_3,     16384 },     /* 16-way set assoc, 64 byte line size */
+       { 0x4e, LVL_2,      6144 },     /* 24-way set assoc, 64 byte line size */
        { 0x60, LVL_1_DATA, 16 },       /* 8-way set assoc, sectored cache, 64 byte line size */
        { 0x66, LVL_1_DATA, 8 },        /* 4-way set assoc, sectored cache, 64 byte line size */
        { 0x67, LVL_1_DATA, 16 },       /* 4-way set assoc, sectored cache, 64 byte line size */
@@@ -488,7 -489,7 +489,7 @@@ static void __cpuinit cache_remove_shar
        int sibling;
  
        this_leaf = CPUID4_INFO_IDX(cpu, index);
 -      for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) {
 +      for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
                sibling_leaf = CPUID4_INFO_IDX(sibling, index); 
                cpu_clear(cpu, sibling_leaf->shared_cpu_map);
        }
@@@ -61,7 -61,7 +61,7 @@@ struct irq_cfg 
  };
  
  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
- struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+ static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
        [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
        [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
        [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
  
  static int assign_irq_vector(int irq, cpumask_t mask);
  
+ int first_system_vector = 0xfe;
+ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
  #define __apicdebuginit  __init
  
  int sis_apic_bug; /* not actually supported, dummy for compile */
@@@ -90,7 -94,7 +94,7 @@@ static int no_timer_check
  
  static int disable_timer_pin_1 __initdata;
  
- int timer_over_8254 __initdata = 1;
+ int timer_through_8259 __initdata;
  
  /* Where if anywhere is the i8259 connect in external int mode */
  static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@@ -104,15 -108,17 +108,17 @@@ DEFINE_SPINLOCK(vector_lock)
  int nr_ioapic_registers[MAX_IO_APICS];
  
  /* I/O APIC entries */
- struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
  int nr_ioapics;
  
  /* MP IRQ source entries */
- struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
  
  /* # of MP IRQ source entries */
  int mp_irq_entries;
  
+ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
  /*
   * Rough estimation of how many shared IRQs there are, can
   * be changed anytime.
@@@ -140,7 -146,7 +146,7 @@@ struct io_apic 
  static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
  {
        return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+               + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
  }
  
  static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@@ -183,7 -189,7 +189,7 @@@ static bool io_apic_level_ack_pending(u
                        break;
                reg = io_apic_read(entry->apic, 0x10 + pin*2);
                /* Is the remote IRR bit set? */
-               if ((reg >> 14) & 1) {
+               if (reg & IO_APIC_REDIR_REMOTE_IRR) {
                        spin_unlock_irqrestore(&ioapic_lock, flags);
                        return true;
                }
@@@ -298,7 -304,7 +304,7 @@@ static void __target_IO_APIC_irq(unsign
                        break;
                io_apic_write(apic, 0x11 + pin*2, dest);
                reg = io_apic_read(apic, 0x10 + pin*2);
-               reg &= ~0x000000ff;
+               reg &= ~IO_APIC_REDIR_VECTOR_MASK;
                reg |= vector;
                io_apic_modify(apic, reg);
                if (!entry->next)
@@@ -360,16 -366,37 +366,37 @@@ static void add_pin_to_irq(unsigned in
        entry->pin = pin;
  }
  
+ /*
+  * Reroute an IRQ to a different pin.
+  */
+ static void __init replace_pin_at_irq(unsigned int irq,
+                                     int oldapic, int oldpin,
+                                     int newapic, int newpin)
+ {
+       struct irq_pin_list *entry = irq_2_pin + irq;
+       while (1) {
+               if (entry->apic == oldapic && entry->pin == oldpin) {
+                       entry->apic = newapic;
+                       entry->pin = newpin;
+               }
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+ }
  
  #define DO_ACTION(name,R,ACTION, FINAL)                                       \
                                                                        \
        static void name##_IO_APIC_irq (unsigned int irq)               \
        __DO_ACTION(R, ACTION, FINAL)
  
- DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-                                               /* mask = 1 */
- DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-                                               /* mask = 0 */
+ /* mask = 1 */
+ DO_ACTION(__mask,     0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+ /* mask = 0 */
+ DO_ACTION(__unmask,   0, &= ~IO_APIC_REDIR_MASKED, )
  
  static void mask_IO_APIC_irq (unsigned int irq)
  {
@@@ -430,20 -457,6 +457,6 @@@ static int __init disable_timer_pin_set
  }
  __setup("disable_timer_pin_1", disable_timer_pin_setup);
  
- static int __init setup_disable_8254_timer(char *s)
- {
-       timer_over_8254 = -1;
-       return 1;
- }
- static int __init setup_enable_8254_timer(char *s)
- {
-       timer_over_8254 = 2;
-       return 1;
- }
- __setup("disable_8254_timer", setup_disable_8254_timer);
- __setup("enable_8254_timer", setup_enable_8254_timer);
  
  /*
   * Find the IRQ entry number of a certain pin.
@@@ -453,10 -466,10 +466,10 @@@ static int find_irq_entry(int apic, in
        int i;
  
        for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mpc_irqtype == type &&
-                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-                   mp_irqs[i].mpc_dstirq == pin)
+               if (mp_irqs[i].mp_irqtype == type &&
+                   (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+                    mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+                   mp_irqs[i].mp_dstirq == pin)
                        return i;
  
        return -1;
@@@ -470,13 -483,13 +483,13 @@@ static int __init find_isa_irq_pin(int 
        int i;
  
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
+               int lbus = mp_irqs[i].mp_srcbus;
  
                if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
+                   (mp_irqs[i].mp_irqtype == type) &&
+                   (mp_irqs[i].mp_srcbusirq == irq))
  
-                       return mp_irqs[i].mpc_dstirq;
+                       return mp_irqs[i].mp_dstirq;
        }
        return -1;
  }
@@@ -486,17 -499,17 +499,17 @@@ static int __init find_isa_irq_apic(in
        int i;
  
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
+               int lbus = mp_irqs[i].mp_srcbus;
  
                if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
+                   (mp_irqs[i].mp_irqtype == type) &&
+                   (mp_irqs[i].mp_srcbusirq == irq))
                        break;
        }
        if (i < mp_irq_entries) {
                int apic;
                for(apic = 0; apic < nr_ioapics; apic++) {
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+                       if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
                                return apic;
                }
        }
@@@ -516,28 -529,28 +529,28 @@@ int IO_APIC_get_PCI_irq_vector(int bus
  
        apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
                bus, slot, pin);
-       if (mp_bus_id_to_pci_bus[bus] == -1) {
+       if (test_bit(bus, mp_bus_not_pci)) {
                apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
                return -1;
        }
        for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
+               int lbus = mp_irqs[i].mp_srcbus;
  
                for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+                       if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+                           mp_irqs[i].mp_dstapic == MP_APIC_ALL)
                                break;
  
                if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].mpc_irqtype &&
+                   !mp_irqs[i].mp_irqtype &&
                    (bus == lbus) &&
-                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+                   (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
  
                        if (!(apic || IO_APIC_IRQ(irq)))
                                continue;
  
-                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+                       if (pin == (mp_irqs[i].mp_srcbusirq & 3))
                                return irq;
                        /*
                         * Use the first all-but-pin matching entry as a
  
  static int MPBIOS_polarity(int idx)
  {
-       int bus = mp_irqs[idx].mpc_srcbus;
+       int bus = mp_irqs[idx].mp_srcbus;
        int polarity;
  
        /*
         * Determine IRQ line polarity (high active or low active):
         */
-       switch (mp_irqs[idx].mpc_irqflag & 3)
+       switch (mp_irqs[idx].mp_irqflag & 3)
        {
                case 0: /* conforms, ie. bus-type dependent polarity */
                        if (test_bit(bus, mp_bus_not_pci))
  
  static int MPBIOS_trigger(int idx)
  {
-       int bus = mp_irqs[idx].mpc_srcbus;
+       int bus = mp_irqs[idx].mp_srcbus;
        int trigger;
  
        /*
         * Determine IRQ trigger mode (edge or level sensitive):
         */
-       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+       switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
        {
                case 0: /* conforms, ie. bus-type dependent */
                        if (test_bit(bus, mp_bus_not_pci))
@@@ -660,16 -673,16 +673,16 @@@ static inline int irq_trigger(int idx
  static int pin_2_irq(int idx, int apic, int pin)
  {
        int irq, i;
-       int bus = mp_irqs[idx].mpc_srcbus;
+       int bus = mp_irqs[idx].mp_srcbus;
  
        /*
         * Debugging check, we are in big trouble if this message pops up!
         */
-       if (mp_irqs[idx].mpc_dstirq != pin)
+       if (mp_irqs[idx].mp_dstirq != pin)
                printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
  
        if (test_bit(bus, mp_bus_not_pci)) {
-               irq = mp_irqs[idx].mpc_srcbusirq;
+               irq = mp_irqs[idx].mp_srcbusirq;
        } else {
                /*
                 * PCI IRQs are mapped in order
@@@ -718,7 -731,7 +731,7 @@@ static int __assign_irq_vector(int irq
                        return 0;
        }
  
 -      for_each_cpu_mask(cpu, mask) {
 +      for_each_cpu_mask_nr(cpu, mask) {
                cpumask_t domain, new_mask;
                int new_cpu;
                int vector, offset;
                offset = current_offset;
  next:
                vector += 8;
-               if (vector >= FIRST_SYSTEM_VECTOR) {
+               if (vector >= first_system_vector) {
                        /* If we run out of vectors on large boxen, must share them. */
                        offset = (offset + 1) % 8;
                        vector = FIRST_DEVICE_VECTOR + offset;
                        continue;
                if (vector == IA32_SYSCALL_VECTOR)
                        goto next;
 -              for_each_cpu_mask(new_cpu, new_mask)
 +              for_each_cpu_mask_nr(new_cpu, new_mask)
                        if (per_cpu(vector_irq, new_cpu)[vector] != -1)
                                goto next;
                /* Found one! */
                        cfg->move_in_progress = 1;
                        cfg->old_domain = cfg->domain;
                }
 -              for_each_cpu_mask(new_cpu, new_mask)
 +              for_each_cpu_mask_nr(new_cpu, new_mask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq;
                cfg->vector = vector;
                cfg->domain = domain;
@@@ -781,14 -794,14 +794,14 @@@ static void __clear_irq_vector(int irq
  
        vector = cfg->vector;
        cpus_and(mask, cfg->domain, cpu_online_map);
 -      for_each_cpu_mask(cpu, mask)
 +      for_each_cpu_mask_nr(cpu, mask)
                per_cpu(vector_irq, cpu)[vector] = -1;
  
        cfg->vector = 0;
        cpus_clear(cfg->domain);
  }
  
- void __setup_vector_irq(int cpu)
static void __setup_vector_irq(int cpu)
  {
        /* Initialize vector_irq on a new cpu */
        /* This function must be called with vector_lock held */
        }
  }
  
+ void setup_vector_irq(int cpu)
+ {
+       spin_lock(&vector_lock);
+       __setup_vector_irq(smp_processor_id());
+       spin_unlock(&vector_lock);
+ }
  
  static struct irq_chip ioapic_chip;
  
@@@ -846,7 -866,7 +866,7 @@@ static void setup_IO_APIC_irq(int apic
        apic_printk(APIC_VERBOSE,KERN_DEBUG
                    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
                    "IRQ %d Mode:%i Active:%i)\n",
-                   apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+                   apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
                    irq, trigger, polarity);
  
        /*
@@@ -887,10 -907,10 +907,10 @@@ static void __init setup_IO_APIC_irqs(v
                idx = find_irq_entry(apic,pin,mp_INT);
                if (idx == -1) {
                        if (first_notcon) {
-                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
                                first_notcon = 0;
                        } else
-                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
                        continue;
                }
                if (!first_notcon) {
  }
  
  /*
-  * Set up the 8259A-master output pin as broadcast to all
-  * CPUs.
+  * Set up the timer pin, possibly with the 8259A-master behind.
   */
- static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+                                       int vector)
  {
        struct IO_APIC_route_entry entry;
  
        memset(&entry, 0, sizeof(entry));
  
-       disable_8259A_irq(0);
-       /* mask LVT0 */
-       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        /*
         * We use logical delivery to get the timer IRQ
         * to the first CPU.
         */
        entry.dest_mode = INT_DEST_MODE;
-       entry.mask = 0;                                 /* unmask IRQ now */
+       entry.mask = 1;                                 /* mask IRQ now */
        entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
        entry.delivery_mode = INT_DELIVERY_MODE;
        entry.polarity = 0;
  
        /*
         * The timer IRQ doesn't have to know that behind the
-        * scene we have a 8259A-master in AEOI mode ...
+        * scene we may have a 8259A-master in AEOI mode ...
         */
        set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
  
         * Add it to the IO-APIC irq-routing table:
         */
        ioapic_write_entry(apic, pin, entry);
-       enable_8259A_irq(0);
  }
  
  void __apicdebuginit print_IO_APIC(void)
        printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
        for (i = 0; i < nr_ioapics; i++)
                printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+                      mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
  
        /*
         * We are a bit conservative about what we expect.  We have to
        spin_unlock_irqrestore(&ioapic_lock, flags);
  
        printk("\n");
-       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
        printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
        printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
  
@@@ -1077,6 -1090,7 +1090,7 @@@ void __apicdebuginit print_local_APIC(v
  
        printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
                smp_processor_id(), hard_smp_processor_id());
+       v = apic_read(APIC_ID);
        printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
        v = apic_read(APIC_LVR);
        printk(KERN_INFO "... APIC VERSION: %08x\n", v);
  
  void print_all_local_APICs (void)
  {
-       on_each_cpu(print_local_APIC, NULL, 1, 1);
+       on_each_cpu(print_local_APIC, NULL, 1);
  }
  
  void __apicdebuginit print_PIC(void)
@@@ -1540,7 -1554,7 +1554,7 @@@ static inline void init_IO_APIC_traps(v
        }
  }
  
- static void enable_lapic_irq (unsigned int irq)
+ static void unmask_lapic_irq(unsigned int irq)
  {
        unsigned long v;
  
        apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
  }
  
- static void disable_lapic_irq (unsigned int irq)
+ static void mask_lapic_irq(unsigned int irq)
  {
        unsigned long v;
  
@@@ -1561,19 -1575,20 +1575,20 @@@ static void ack_lapic_irq (unsigned in
        ack_APIC_irq();
  }
  
- static void end_lapic_irq (unsigned int i) { /* nothing */ }
- static struct hw_interrupt_type lapic_irq_type __read_mostly = {
-       .name = "local-APIC",
-       .typename = "local-APIC-edge",
-       .startup = NULL, /* startup_irq() not used for IRQ0 */
-       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
-       .enable = enable_lapic_irq,
-       .disable = disable_lapic_irq,
-       .ack = ack_lapic_irq,
-       .end = end_lapic_irq,
+ static struct irq_chip lapic_chip __read_mostly = {
+       .name           = "local-APIC",
+       .mask           = mask_lapic_irq,
+       .unmask         = unmask_lapic_irq,
+       .ack            = ack_lapic_irq,
  };
  
+ static void lapic_register_intr(int irq)
+ {
+       irq_desc[irq].status &= ~IRQ_LEVEL;
+       set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+                                     "edge");
+ }
  static void __init setup_nmi(void)
  {
        /*
@@@ -1659,6 -1674,7 +1674,7 @@@ static inline void __init check_timer(v
        struct irq_cfg *cfg = irq_cfg + 0;
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
+       int no_pin1 = 0;
  
        local_irq_save(flags);
  
        assign_irq_vector(0, TARGET_CPUS);
  
        /*
-        * Subtle, code in do_timer_interrupt() expects an AEOI
-        * mode for the 8259A whenever interrupts are routed
-        * through I/O APICs.  Also IRQ0 has to be enabled in
-        * the 8259A which implies the virtual wire has to be
-        * disabled in the local APIC.
+        * As IRQ0 is to be enabled in the 8259A, the virtual
+        * wire has to be disabled in the local APIC.
         */
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        init_8259A(1);
-       if (timer_over_8254 > 0)
-               enable_8259A_irq(0);
  
        pin1  = find_isa_irq_pin(0, mp_INT);
        apic1 = find_isa_irq_apic(0, mp_INT);
        apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
                cfg->vector, apic1, pin1, apic2, pin2);
  
+       /*
+        * Some BIOS writers are clueless and report the ExtINTA
+        * I/O APIC input from the cascaded 8259A as the timer
+        * interrupt input.  So just in case, if only one pin
+        * was found above, try it both directly and through the
+        * 8259A.
+        */
+       if (pin1 == -1) {
+               pin1 = pin2;
+               apic1 = apic2;
+               no_pin1 = 1;
+       } else if (pin2 == -1) {
+               pin2 = pin1;
+               apic2 = apic1;
+       }
        if (pin1 != -1) {
                /*
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
+               if (no_pin1) {
+                       add_pin_to_irq(0, apic1, pin1);
+                       setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+               }
                unmask_IO_APIC_irq(0);
                if (!no_timer_check && timer_irq_works()) {
-                       nmi_watchdog_default();
                        if (nmi_watchdog == NMI_IO_APIC) {
-                               disable_8259A_irq(0);
                                setup_nmi();
                                enable_8259A_irq(0);
                        }
                        goto out;
                }
                clear_IO_APIC_pin(apic1, pin1);
-               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-                               "connected to IO-APIC\n");
-       }
+               if (!no_pin1)
+                       apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+                                   "8254 timer not connected to IO-APIC\n");
  
-       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-                               "through the 8259A ... ");
-       if (pin2 != -1) {
+               apic_printk(APIC_VERBOSE,KERN_INFO
+                       "...trying to set up timer (IRQ0) "
+                       "through the 8259A ... ");
                apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
                        apic2, pin2);
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-               setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+               setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+               unmask_IO_APIC_irq(0);
+               enable_8259A_irq(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_VERBOSE," works.\n");
-                       nmi_watchdog_default();
+                       timer_through_8259 = 1;
                        if (nmi_watchdog == NMI_IO_APIC) {
+                               disable_8259A_irq(0);
                                setup_nmi();
+                               enable_8259A_irq(0);
                        }
                        goto out;
                }
                /*
                 * Cleanup, just in case ...
                 */
+               disable_8259A_irq(0);
                clear_IO_APIC_pin(apic2, pin2);
+               apic_printk(APIC_VERBOSE," failed.\n");
        }
-       apic_printk(APIC_VERBOSE," failed.\n");
  
        if (nmi_watchdog == NMI_IO_APIC) {
                printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-               nmi_watchdog = 0;
+               nmi_watchdog = NMI_NONE;
        }
  
        apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
  
-       disable_8259A_irq(0);
-       irq_desc[0].chip = &lapic_irq_type;
+       lapic_register_intr(0);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
  
                apic_printk(APIC_VERBOSE," works.\n");
                goto out;
        }
+       disable_8259A_irq(0);
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
        apic_printk(APIC_VERBOSE," failed.\n");
  
@@@ -1778,11 -1813,21 +1813,21 @@@ static int __init notimercheck(char *s
  __setup("no_timer_check", notimercheck);
  
  /*
-  *
-  * IRQs that are handled by the PIC in the MPS IOAPIC case.
-  * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
-  *   Linux doesn't really care, as it's not actually used
-  *   for any interrupt handling anyway.
+  * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+  * to devices.  However there may be an I/O APIC pin available for
+  * this interrupt regardless.  The pin may be left unconnected, but
+  * typically it will be reused as an ExtINT cascade interrupt for
+  * the master 8259A.  In the MPS case such a pin will normally be
+  * reported as an ExtINT interrupt in the MP table.  With ACPI
+  * there is no provision for ExtINT interrupts, and in the absence
+  * of an override it would be treated as an ordinary ISA I/O APIC
+  * interrupt, that is edge-triggered and unmasked by default.  We
+  * used to do this, but it caused problems on some systems because
+  * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+  * the same ExtINT cascade interrupt to drive the local APIC of the
+  * bootstrap processor.  Therefore we refrain from routing IRQ2 to
+  * the I/O APIC in all cases now.  No actual device should request
+  * it anyway.  --macro
   */
  #define PIC_IRQS      (1<<2)
  
@@@ -1793,10 -1838,7 +1838,7 @@@ void __init setup_IO_APIC(void
         * calling enable_IO_APIC() is moved to setup_local_APIC for BP
         */
  
-       if (acpi_ioapic)
-               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
-       else
-               io_apic_irqs = ~PIC_IRQS;
+       io_apic_irqs = ~PIC_IRQS;
  
        apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
  
@@@ -1841,8 -1883,8 +1883,8 @@@ static int ioapic_resume(struct sys_dev
  
        spin_lock_irqsave(&ioapic_lock, flags);
        reg_00.raw = io_apic_read(dev->id, 0);
-       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+       if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+               reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
                io_apic_write(dev->id, 0, reg_00.raw);
        }
        spin_unlock_irqrestore(&ioapic_lock, flags);
@@@ -2242,8 -2284,8 +2284,8 @@@ int acpi_get_override_irq(int bus_irq, 
                return -1;
  
        for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mpc_irqtype == mp_INT &&
-                   mp_irqs[i].mpc_srcbusirq == bus_irq)
+               if (mp_irqs[i].mp_irqtype == mp_INT &&
+                   mp_irqs[i].mp_srcbusirq == bus_irq)
                        break;
        if (i >= mp_irq_entries)
                return -1;
@@@ -2336,7 -2378,7 +2378,7 @@@ void __init ioapic_init_mappings(void
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
-                       ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+                       ioapic_phys = mp_ioapics[i].mp_apicaddr;
                } else {
                        ioapic_phys = (unsigned long)
                                alloc_bootmem_pages(PAGE_SIZE);
@@@ -59,7 -59,6 +59,6 @@@
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
  #include <asm/mtrr.h>
- #include <asm/nmi.h>
  #include <asm/vmi.h>
  #include <asm/genapic.h>
  #include <linux/mc146818rtc.h>
  #include <mach_wakecpu.h>
  #include <smpboot_hooks.h>
  
- /*
-  * FIXME: For x86_64, those are defined in other files. But moving them here,
-  * would make the setup areas dependent on smp, which is a loss. When we
-  * integrate apic between arches, we can probably do a better job, but
-  * right now, they'll stay here -- glommer
-  */
- /* which logical CPU number maps to which CPU (physical APIC ID) */
- u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
-                       { [0 ... NR_CPUS-1] = BAD_APICID };
- void *x86_cpu_to_apicid_early_ptr;
- u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
-                               = { [0 ... NR_CPUS-1] = BAD_APICID };
- void *x86_bios_cpu_apicid_early_ptr;
  #ifdef CONFIG_X86_32
  u8 apicid_2_node[MAX_APICID];
  static int low_mappings;
@@@ -198,13 -181,12 +181,12 @@@ static void map_cpu_to_logical_apicid(v
        map_cpu_to_node(cpu, node);
  }
  
static void unmap_cpu_to_logical_apicid(int cpu)
void numa_remove_cpu(int cpu)
  {
        cpu_2_logical_apicid[cpu] = BAD_APICID;
        unmap_cpu_to_node(cpu);
  }
  #else
- #define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
  #define map_cpu_to_logical_apicid()  do {} while (0)
  #endif
  
@@@ -345,19 -327,12 +327,12 @@@ static void __cpuinit start_secondary(v
         * lock helps us to not include this cpu in a currently in progress
         * smp_call_function().
         */
-       lock_ipi_call_lock();
- #ifdef CONFIG_X86_64
-       spin_lock(&vector_lock);
-       /* Setup the per cpu irq handling data structures */
-       __setup_vector_irq(smp_processor_id());
-       /*
-        * Allow the master to continue.
-        */
-       spin_unlock(&vector_lock);
+       ipi_call_lock_irq();
+ #ifdef CONFIG_X86_IO_APIC
+       setup_vector_irq(smp_processor_id());
  #endif
        cpu_set(smp_processor_id(), cpu_online_map);
-       unlock_ipi_call_lock();
+       ipi_call_unlock_irq();
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
  
        setup_secondary_clock();
        cpu_idle();
  }
  
- #ifdef CONFIG_X86_32
- /*
-  * Everything has been set up for the secondary
-  * CPUs - they just need to reload everything
-  * from the task structure
-  * This function must not return.
-  */
- void __devinit initialize_secondary(void)
- {
-       /*
-        * We don't actually need to load the full TSS,
-        * basically just the stack pointer and the ip.
-        */
-       asm volatile(
-               "movl %0,%%esp\n\t"
-               "jmp *%1"
-               :
-               :"m" (current->thread.sp), "m" (current->thread.ip));
- }
- #endif
  static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
  {
- #ifdef CONFIG_X86_32
        /*
         * Mask B, Pentium, but not Pentium MMX
         */
  
  valid_k7:
        ;
- #endif
  }
  
  static void __cpuinit smp_checks(void)
@@@ -487,7 -438,7 +438,7 @@@ void __cpuinit set_cpu_sibling_map(int 
        cpu_set(cpu, cpu_sibling_setup_map);
  
        if (smp_num_siblings > 1) {
 -              for_each_cpu_mask(i, cpu_sibling_setup_map) {
 +              for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
                        if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
                            c->cpu_core_id == cpu_data(i).cpu_core_id) {
                                cpu_set(i, per_cpu(cpu_sibling_map, cpu));
                return;
        }
  
 -      for_each_cpu_mask(i, cpu_sibling_setup_map) {
 +      for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
                if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
                    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
                        cpu_set(i, c->llc_shared_map);
@@@ -555,23 -506,6 +506,6 @@@ cpumask_t cpu_coregroup_map(int cpu
                return c->llc_shared_map;
  }
  
- #ifdef CONFIG_X86_32
- /*
-  * We are called very early to get the low memory for the
-  * SMP bootup trampoline page.
-  */
- void __init smp_alloc_memory(void)
- {
-       trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-       /*
-        * Has to be in very low memory so we can execute
-        * real-mode AP code.
-        */
-       if (__pa(trampoline_base) >= 0x9F000)
-               BUG();
- }
- #endif
  static void impress_friends(void)
  {
        int cpu;
@@@ -748,11 -682,7 +682,7 @@@ wakeup_secondary_cpu(int phys_apicid, u
         * target processor state.
         */
        startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- #ifdef CONFIG_X86_64
-                        (unsigned long)init_rsp);
- #else
                         (unsigned long)stack_start.sp);
- #endif
  
        /*
         * Run STARTUP IPI loop.
@@@ -832,6 -762,45 +762,45 @@@ static void __cpuinit do_fork_idle(stru
        complete(&c_idle->done);
  }
  
+ #ifdef CONFIG_X86_64
+ /*
+  * Allocate node local memory for the AP pda.
+  *
+  * Must be called after the _cpu_pda pointer table is initialized.
+  */
+ static int __cpuinit get_local_pda(int cpu)
+ {
+       struct x8664_pda *oldpda, *newpda;
+       unsigned long size = sizeof(struct x8664_pda);
+       int node = cpu_to_node(cpu);
+       if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+               return 0;
+       oldpda = cpu_pda(cpu);
+       newpda = kmalloc_node(size, GFP_ATOMIC, node);
+       if (!newpda) {
+               printk(KERN_ERR "Could not allocate node local PDA "
+                       "for CPU %d on node %d\n", cpu, node);
+               if (oldpda)
+                       return 0;       /* have a usable pda */
+               else
+                       return -1;
+       }
+       if (oldpda) {
+               memcpy(newpda, oldpda, size);
+               if (!after_bootmem)
+                       free_bootmem((unsigned long)oldpda, size);
+       }
+       newpda->in_bootmem = 0;
+       cpu_pda(cpu) = newpda;
+       return 0;
+ }
+ #endif /* CONFIG_X86_64 */
  static int __cpuinit do_boot_cpu(int apicid, int cpu)
  /*
   * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
                .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
        };
        INIT_WORK(&c_idle.work, do_fork_idle);
- #ifdef CONFIG_X86_64
-       /* allocate memory for gdts of secondary cpus. Hotplug is considered */
-       if (!cpu_gdt_descr[cpu].address &&
-               !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-               printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
-               return -1;
-       }
  
+ #ifdef CONFIG_X86_64
        /* Allocate node local memory for AP pdas */
-       if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-               struct x8664_pda *newpda, *pda;
-               int node = cpu_to_node(cpu);
-               pda = cpu_pda(cpu);
-               newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
-                                     node);
-               if (newpda) {
-                       memcpy(newpda, pda, sizeof(struct x8664_pda));
-                       cpu_pda(cpu) = newpda;
-               } else
-                       printk(KERN_ERR
-               "Could not allocate node local PDA for CPU %d on node %d\n",
-                               cpu, node);
+       if (cpu > 0) {
+               boot_error = get_local_pda(cpu);
+               if (boot_error)
+                       goto restore_state;
+                       /* if can't get pda memory, can't start cpu */
        }
  #endif
  
@@@ -905,18 -860,15 +860,15 @@@ do_rest
  #ifdef CONFIG_X86_32
        per_cpu(current_task, cpu) = c_idle.idle;
        init_gdt(cpu);
-       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
-       c_idle.idle->thread.ip = (unsigned long) start_secondary;
        /* Stack for startup_32 can be just as for start_secondary onwards */
-       stack_start.sp = (void *) c_idle.idle->thread.sp;
        irq_ctx_init(cpu);
  #else
        cpu_pda(cpu)->pcurrent = c_idle.idle;
-       init_rsp = c_idle.idle->thread.sp;
-       load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
-       initial_code = (unsigned long)start_secondary;
        clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
  #endif
+       early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+       initial_code = (unsigned long)start_secondary;
+       stack_start.sp = (void *) c_idle.idle->thread.sp;
  
        /* start_ip had better be page-aligned! */
        start_ip = setup_trampoline();
                                inquire_remote_apic(apicid);
                }
        }
-       if (boot_error) {
-               /* Try to put things back the way they were before ... */
-               unmap_cpu_to_logical_apicid(cpu);
  #ifdef CONFIG_X86_64
-               clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+ restore_state:
  #endif
+       if (boot_error) {
+               /* Try to put things back the way they were before ... */
+               numa_remove_cpu(cpu); /* was set by numa_add_cpu */
                cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
                cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
                cpu_clear(cpu, cpu_present_map);
@@@ -1087,14 -1038,12 +1038,12 @@@ static __init void disable_smp(void
  {
        cpu_present_map = cpumask_of_cpu(0);
        cpu_possible_map = cpumask_of_cpu(0);
- #ifdef CONFIG_X86_32
        smpboot_clear_io_apic_irqs();
- #endif
        if (smp_found_config)
-               phys_cpu_present_map =
-                               physid_mask_of_physid(boot_cpu_physical_apicid);
+               physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
        else
-               phys_cpu_present_map = physid_mask_of_physid(0);
+               physid_set_mask_of_physid(0, &phys_cpu_present_map);
        map_cpu_to_logical_apicid();
        cpu_set(0, per_cpu(cpu_sibling_map, 0));
        cpu_set(0, per_cpu(cpu_core_map, 0));
@@@ -1157,12 -1106,12 +1106,12 @@@ static int __init smp_sanity_check(unsi
         * If SMP should be disabled, then really disable it!
         */
        if (!max_cpus) {
-               printk(KERN_INFO "SMP mode deactivated,"
-                                "forcing use of dummy APIC emulation.\n");
+               printk(KERN_INFO "SMP mode deactivated.\n");
                smpboot_clear_io_apic();
- #ifdef CONFIG_X86_32
+               localise_nmi_watchdog();
                connect_bsp_APIC();
- #endif
                setup_local_APIC();
                end_local_APIC_setup();
                return -1;
@@@ -1190,7 -1139,6 +1139,6 @@@ static void __init smp_cpu_index_defaul
  void __init native_smp_prepare_cpus(unsigned int max_cpus)
  {
        preempt_disable();
-       nmi_watchdog_default();
        smp_cpu_index_default();
        current_cpu_data = boot_cpu_data;
        cpu_callin_map = cpumask_of_cpu(0);
        }
        preempt_enable();
  
- #ifdef CONFIG_X86_32
        connect_bsp_APIC();
- #endif
        /*
         * Switch from PIC to APIC mode.
         */
@@@ -1257,8 -1204,8 +1204,8 @@@ void __init native_smp_prepare_boot_cpu
        int me = smp_processor_id();
  #ifdef CONFIG_X86_32
        init_gdt(me);
-       switch_to_new_gdt();
  #endif
+       switch_to_new_gdt();
        /* already set me in cpu_online_map in boot_cpu_init() */
        cpu_set(me, cpu_callout_map);
        per_cpu(cpu_state, me) = CPU_ONLINE;
@@@ -1278,29 -1225,12 +1225,12 @@@ void __init native_smp_cpus_done(unsign
  
  #ifdef CONFIG_HOTPLUG_CPU
  
- #  ifdef CONFIG_X86_32
- void cpu_exit_clear(void)
- {
-       int cpu = raw_smp_processor_id();
-       idle_task_exit();
-       cpu_uninit();
-       irq_ctx_exit(cpu);
-       cpu_clear(cpu, cpu_callout_map);
-       cpu_clear(cpu, cpu_callin_map);
-       unmap_cpu_to_logical_apicid(cpu);
- }
- #  endif /* CONFIG_X86_32 */
  static void remove_siblinginfo(int cpu)
  {
        int sibling;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
  
 -      for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
 +      for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
                cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
                /*/
                 * last thread sibling in this cpu core going down
                        cpu_data(sibling).booted_cores--;
        }
  
 -      for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
 +      for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
                cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
        cpus_clear(per_cpu(cpu_sibling_map, cpu));
        cpus_clear(per_cpu(cpu_core_map, cpu));
@@@ -1348,12 -1278,20 +1278,20 @@@ __init void prefill_possible_map(void
        int i;
        int possible;
  
+       /* no processor from mptable or madt */
+       if (!num_processors)
+               num_processors = 1;
+ #ifdef CONFIG_HOTPLUG_CPU
        if (additional_cpus == -1) {
                if (disabled_cpus > 0)
                        additional_cpus = disabled_cpus;
                else
                        additional_cpus = 0;
        }
+ #else
+       additional_cpus = 0;
+ #endif
        possible = num_processors + additional_cpus;
        if (possible > NR_CPUS)
                possible = NR_CPUS;
  
        for (i = 0; i < possible; i++)
                cpu_set(i, cpu_possible_map);
+       nr_cpu_ids = possible;
  }
  
  static void __ref remove_cpu_from_maps(int cpu)
  {
        cpu_clear(cpu, cpu_online_map);
- #ifdef CONFIG_X86_64
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
        /* was set by cpu_init() */
        clear_bit(cpu, (unsigned long *)&cpu_initialized);
-       clear_node_cpumask(cpu);
- #endif
+       numa_remove_cpu(cpu);
  }
  
  int __cpu_disable(void)
diff --combined arch/x86/xen/smp.c
  #include "xen-ops.h"
  #include "mmu.h"
  
- static cpumask_t xen_cpu_initialized_map;
- static DEFINE_PER_CPU(int, resched_irq) = -1;
- static DEFINE_PER_CPU(int, callfunc_irq) = -1;
- static DEFINE_PER_CPU(int, debug_irq) = -1;
- /*
-  * Structure and data for smp_call_function(). This is designed to minimise
-  * static memory requirements. It also looks cleaner.
-  */
- static DEFINE_SPINLOCK(call_lock);
+ cpumask_t xen_cpu_initialized_map;
  
- struct call_data_struct {
-       void (*func) (void *info);
-       void *info;
-       atomic_t started;
-       atomic_t finished;
-       int wait;
- };
+ static DEFINE_PER_CPU(int, resched_irq);
+ static DEFINE_PER_CPU(int, callfunc_irq);
+ static DEFINE_PER_CPU(int, callfuncsingle_irq);
+ static DEFINE_PER_CPU(int, debug_irq) = -1;
  
  static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
- static struct call_data_struct *call_data;
+ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
  
  /*
   * Reschedule call back. Nothing to do,
   */
  static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
  {
+ #ifdef CONFIG_X86_32
+       __get_cpu_var(irq_stat).irq_resched_count++;
+ #else
+       add_pda(irq_resched_count, 1);
+ #endif
        return IRQ_HANDLED;
  }
  
@@@ -122,6 -115,17 +115,17 @@@ static int xen_smp_intr_init(unsigned i
                goto fail;
        per_cpu(debug_irq, cpu) = rc;
  
+       callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+       rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
+                                   cpu,
+                                   xen_call_function_single_interrupt,
+                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   callfunc_name,
+                                   NULL);
+       if (rc < 0)
+               goto fail;
+       per_cpu(callfuncsingle_irq, cpu) = rc;
        return 0;
  
   fail:
                unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
        if (per_cpu(debug_irq, cpu) >= 0)
                unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+       if (per_cpu(callfuncsingle_irq, cpu) >= 0)
+               unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
        return rc;
  }
  
@@@ -330,7 -337,7 +337,7 @@@ static void stop_self(void *v
  
  void xen_smp_send_stop(void)
  {
-       smp_call_function(stop_self, NULL, 0, 0);
+       smp_call_function(stop_self, NULL, 0);
  }
  
  void xen_smp_send_reschedule(int cpu)
        xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
  }
  
  static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
  {
        unsigned cpu;
  
        cpus_and(mask, mask, cpu_online_map);
  
 -      for_each_cpu_mask(cpu, mask)
 +      for_each_cpu_mask_nr(cpu, mask)
                xen_send_IPI_one(cpu, vector);
  }
  
 -      for_each_cpu_mask(cpu, mask) {
+ void xen_smp_send_call_function_ipi(cpumask_t mask)
+ {
+       int cpu;
+       xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+       /* Make sure other vcpus get a chance to run if they need to. */
++      for_each_cpu_mask_nr(cpu, mask) {
+               if (xen_vcpu_stolen(cpu)) {
+                       HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+                       break;
+               }
+       }
+ }
+ void xen_smp_send_call_function_single_ipi(int cpu)
+ {
+       xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
+ }
  static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
  {
-       void (*func) (void *info) = call_data->func;
-       void *info = call_data->info;
-       int wait = call_data->wait;
-       /*
-        * Notify initiating CPU that I've grabbed the data and am
-        * about to execute the function
-        */
-       mb();
-       atomic_inc(&call_data->started);
-       /*
-        * At this point the info structure may be out of scope unless wait==1
-        */
        irq_enter();
-       (*func)(info);
+       generic_smp_call_function_interrupt();
        __get_cpu_var(irq_stat).irq_call_count++;
        irq_exit();
  
-       if (wait) {
-               mb();           /* commit everything before setting finished */
-               atomic_inc(&call_data->finished);
-       }
        return IRQ_HANDLED;
  }
  
- int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
-                              void *info, int wait)
+ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
  {
-       struct call_data_struct data;
-       int cpus, cpu;
-       bool yield;
-       /* Holding any lock stops cpus from going down. */
-       spin_lock(&call_lock);
-       cpu_clear(smp_processor_id(), mask);
-       cpus = cpus_weight(mask);
-       if (!cpus) {
-               spin_unlock(&call_lock);
-               return 0;
-       }
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-       call_data = &data;
-       mb();                   /* write everything before IPI */
-       /* Send a message to other CPUs and wait for them to respond */
-       xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
-       /* Make sure other vcpus get a chance to run if they need to. */
-       yield = false;
-       for_each_cpu_mask_nr(cpu, mask)
-               if (xen_vcpu_stolen(cpu))
-                       yield = true;
-       if (yield)
-               HYPERVISOR_sched_op(SCHEDOP_yield, 0);
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus ||
-              (wait && atomic_read(&data.finished) != cpus))
-               cpu_relax();
-       spin_unlock(&call_lock);
+       irq_enter();
+       generic_smp_call_function_single_interrupt();
+       __get_cpu_var(irq_stat).irq_call_count++;
+       irq_exit();
  
-       return 0;
+       return IRQ_HANDLED;
  }
@@@ -531,7 -531,7 +531,7 @@@ void ehca_process_eq(struct ehca_shca *
  {
        struct ehca_eq *eq = &shca->eq;
        struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
-       u64 eqe_value;
+       u64 eqe_value, ret;
        unsigned long flags;
        int eqe_cnt, i;
        int eq_empty = 0;
                        ehca_dbg(&shca->ib_device,
                                 "No eqe found for irq event");
                goto unlock_irq_spinlock;
-       } else if (!is_irq)
+       } else if (!is_irq) {
+               ret = hipz_h_eoi(eq->ist);
+               if (ret != H_SUCCESS)
+                       ehca_err(&shca->ib_device,
+                                "bad return code EOI -rc = %ld\n", ret);
                ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+       }
        if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
                ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
        /* enable irq for new packets */
@@@ -641,8 -646,8 +646,8 @@@ static inline int find_next_online_cpu(
                ehca_dmp(&cpu_online_map, sizeof(cpumask_t), "");
  
        spin_lock_irqsave(&pool->last_cpu_lock, flags);
 -      cpu = next_cpu(pool->last_cpu, cpu_online_map);
 -      if (cpu == NR_CPUS)
 +      cpu = next_cpu_nr(pool->last_cpu, cpu_online_map);
 +      if (cpu >= nr_cpu_ids)
                cpu = first_cpu(cpu_online_map);
        pool->last_cpu = cpu;
        spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
diff --combined include/asm-x86/ipi.h
@@@ -20,6 -20,7 +20,7 @@@
  
  #include <asm/hw_irq.h>
  #include <asm/apic.h>
+ #include <asm/smp.h>
  
  /*
   * the following functions deal with sending IPIs between CPUs.
@@@ -121,7 -122,7 +122,7 @@@ static inline void send_IPI_mask_sequen
         * - mbligh
         */
        local_irq_save(flags);
 -      for_each_cpu_mask(query_cpu, mask) {
 +      for_each_cpu_mask_nr(query_cpu, mask) {
                __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
                                      vector, APIC_DEST_PHYSICAL);
        }
diff --combined kernel/cpu.c
  #include <linux/stop_machine.h>
  #include <linux/mutex.h>
  
+ /*
+  * Represents all cpu's present in the system
+  * In systems capable of hotplug, this map could dynamically grow
+  * as new cpu's are detected in the system via any platform specific
+  * method, such as ACPI for e.g.
+  */
+ cpumask_t cpu_present_map __read_mostly;
+ EXPORT_SYMBOL(cpu_present_map);
+ #ifndef CONFIG_SMP
+ /*
+  * Represents all cpu's that are currently online.
+  */
+ cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+ EXPORT_SYMBOL(cpu_online_map);
+ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+ EXPORT_SYMBOL(cpu_possible_map);
+ #else /* CONFIG_SMP */
  /* Serializes the updates to cpu_online_map, cpu_present_map */
  static DEFINE_MUTEX(cpu_add_remove_lock);
  
@@@ -390,7 -412,7 +412,7 @@@ void __ref enable_nonboot_cpus(void
                goto out;
  
        printk("Enabling non-boot CPUs ...\n");
 -      for_each_cpu_mask(cpu, frozen_cpus) {
 +      for_each_cpu_mask_nr(cpu, frozen_cpus) {
                error = _cpu_up(cpu, 1);
                if (!error) {
                        printk("CPU%d is up\n", cpu);
@@@ -403,3 -425,5 +425,5 @@@ out
        cpu_maps_update_done();
  }
  #endif /* CONFIG_PM_SLEEP_SMP */
+ #endif /* CONFIG_SMP */
diff --combined kernel/rcuclassic.c
@@@ -106,7 -106,7 +106,7 @@@ static void force_quiescent_state(struc
                 */
                cpus_and(cpumask, rcp->cpumask, cpu_online_map);
                cpu_clear(rdp->cpu, cpumask);
 -              for_each_cpu_mask(cpu, cpumask)
 +              for_each_cpu_mask_nr(cpu, cpumask)
                        smp_send_reschedule(cpu);
        }
  }
@@@ -387,6 -387,10 +387,10 @@@ static void __rcu_offline_cpu(struct rc
        rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
        rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
        rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+       local_irq_disable();
+       this_rdp->qlen += rdp->qlen;
+       local_irq_enable();
  }
  
  static void rcu_offline_cpu(int cpu)
@@@ -516,10 -520,38 +520,38 @@@ void rcu_check_callbacks(int cpu, int u
        if (user ||
            (idle_cpu(cpu) && !in_softirq() &&
                                hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+               /*
+                * Get here if this CPU took its interrupt from user
+                * mode or from the idle loop, and if this is not a
+                * nested interrupt.  In this case, the CPU is in
+                * a quiescent state, so count it.
+                *
+                * Also do a memory barrier.  This is needed to handle
+                * the case where writes from a preempt-disable section
+                * of code get reordered into schedule() by this CPU's
+                * write buffer.  The memory barrier makes sure that
+                * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+                * by other CPUs to happen after any such write.
+                */
+               smp_mb();  /* See above block comment. */
                rcu_qsctr_inc(cpu);
                rcu_bh_qsctr_inc(cpu);
-       } else if (!in_softirq())
+       } else if (!in_softirq()) {
+               /*
+                * Get here if this CPU did not take its interrupt from
+                * softirq, in other words, if it is not interrupting
+                * a rcu_bh read-side critical section.  This is an _bh
+                * critical section, so count it.  The memory barrier
+                * is needed for the same reason as is the above one.
+                */
+               smp_mb();  /* See above block comment. */
                rcu_bh_qsctr_inc(cpu);
+       }
        raise_rcu_softirq();
  }
  
@@@ -543,7 -575,7 +575,7 @@@ static void __cpuinit rcu_online_cpu(in
  
        rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
        rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  }
  
  static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
diff --combined kernel/rcupreempt.c
  #include <asm/atomic.h>
  #include <linux/bitops.h>
  #include <linux/module.h>
+ #include <linux/kthread.h>
  #include <linux/completion.h>
  #include <linux/moduleparam.h>
  #include <linux/percpu.h>
  #include <linux/notifier.h>
- #include <linux/rcupdate.h>
  #include <linux/cpu.h>
  #include <linux/random.h>
  #include <linux/delay.h>
@@@ -82,14 -82,18 +82,18 @@@ struct rcu_data 
        spinlock_t      lock;           /* Protect rcu_data fields. */
        long            completed;      /* Number of last completed batch. */
        int             waitlistcount;
-       struct tasklet_struct rcu_tasklet;
        struct rcu_head *nextlist;
        struct rcu_head **nexttail;
        struct rcu_head *waitlist[GP_STAGES];
        struct rcu_head **waittail[GP_STAGES];
-       struct rcu_head *donelist;
+       struct rcu_head *donelist;      /* from waitlist & waitschedlist */
        struct rcu_head **donetail;
        long rcu_flipctr[2];
+       struct rcu_head *nextschedlist;
+       struct rcu_head **nextschedtail;
+       struct rcu_head *waitschedlist;
+       struct rcu_head **waitschedtail;
+       int rcu_sched_sleeping;
  #ifdef CONFIG_RCU_TRACE
        struct rcupreempt_trace trace;
  #endif /* #ifdef CONFIG_RCU_TRACE */
@@@ -131,11 -135,24 +135,24 @@@ enum rcu_try_flip_states 
        rcu_try_flip_waitmb_state,
  };
  
+ /*
+  * States for rcu_ctrlblk.rcu_sched_sleep.
+  */
+ enum rcu_sched_sleep_states {
+       rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP.  */
+       rcu_sched_sleep_prep,   /* Thinking of sleeping, rechecking. */
+       rcu_sched_sleeping,     /* Sleeping, awaken if GP needed. */
+ };
  struct rcu_ctrlblk {
        spinlock_t      fliplock;       /* Protect state-machine transitions. */
        long            completed;      /* Number of last completed batch. */
        enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
                                                        the rcu state machine */
+       spinlock_t      schedlock;      /* Protect rcu_sched sleep state. */
+       enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+       wait_queue_head_t sched_wq;     /* Place for rcu_sched to sleep. */
  };
  
  static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@@ -143,8 -160,12 +160,12 @@@ static struct rcu_ctrlblk rcu_ctrlblk 
        .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
        .completed = 0,
        .rcu_try_flip_state = rcu_try_flip_idle_state,
+       .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+       .sched_sleep = rcu_sched_not_sleeping,
+       .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
  };
  
+ static struct task_struct *rcu_sched_grace_period_task;
  
  #ifdef CONFIG_RCU_TRACE
  static char *rcu_try_flip_state_names[] =
@@@ -207,6 -228,8 +228,8 @@@ static DEFINE_PER_CPU_SHARED_ALIGNED(en
   */
  #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
  
+ #define RCU_SCHED_BATCH_TIME (HZ / 50)
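For reference, HZ/50 jiffies is one fiftieth of a second, so the rcu_sched grace-period thread batches its work in roughly 20 ms chunks regardless of the configured HZ (20 jiffies at HZ=1000, 5 at HZ=250, 2 at HZ=100).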
  /*
   * Return the number of RCU batches processed thus far.  Useful
   * for debug and statistics.
@@@ -411,32 -434,34 +434,34 @@@ static void __rcu_advance_callbacks(str
        }
  }
  
- #ifdef CONFIG_NO_HZ
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+       .dynticks = 1,
+ };
  
- DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
- static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+ #ifdef CONFIG_NO_HZ
  static DEFINE_PER_CPU(int, rcu_update_flag);
  
  /**
   * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
   *
   * If the CPU was idle with dynamic ticks active, this updates the
-  * dynticks_progress_counter to let the RCU handling know that the
+  * rcu_dyntick_sched.dynticks to let the RCU handling know that the
   * CPU is active.
   */
  void rcu_irq_enter(void)
  {
        int cpu = smp_processor_id();
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
        if (per_cpu(rcu_update_flag, cpu))
                per_cpu(rcu_update_flag, cpu)++;
  
        /*
         * Only update if we are coming from a stopped ticks mode
-        * (dynticks_progress_counter is even).
+        * (rcu_dyntick_sched.dynticks is even).
         */
        if (!in_interrupt() &&
-           (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+           (rdssp->dynticks & 0x1) == 0) {
                /*
                 * The following might seem like we could have a race
                 * with NMI/SMIs. But this really isn't a problem.
                 * RCU read-side critical sections on this CPU would
                 * have already completed.
                 */
-               per_cpu(dynticks_progress_counter, cpu)++;
+               rdssp->dynticks++;
                /*
                 * The following memory barrier ensures that any
                 * rcu_read_lock() primitives in the irq handler
                 * are seen by other CPUs to follow the above
-                * increment to dynticks_progress_counter. This is
+                * increment to rcu_dyntick_sched.dynticks. This is
                 * required in order for other CPUs to correctly
                 * determine when it is safe to advance the RCU
                 * grace-period state machine.
                smp_mb(); /* see above block comment. */
                /*
                 * Since we can't determine the dynamic tick mode from
-                * the dynticks_progress_counter after this routine,
+                * the rcu_dyntick_sched.dynticks after this routine,
                 * we use a second flag to acknowledge that we came
                 * from an idle state with ticks stopped.
                 */
                /*
                 * If we take an NMI/SMI now, they will also increment
                 * the rcu_update_flag, and will not update the
-                * dynticks_progress_counter on exit. That is for
+                * rcu_dyntick_sched.dynticks on exit. That is for
                 * this IRQ to do.
                 */
        }
   * rcu_irq_exit - Called from exiting Hard irq context.
   *
   * If the CPU was idle with dynamic ticks active, update the
-  * dynticks_progress_counter to put let the RCU handling be
+  * rcu_dyntick_sched.dynticks to let the RCU handling be
   * aware that the CPU is going back to idle with no ticks.
   */
  void rcu_irq_exit(void)
  {
        int cpu = smp_processor_id();
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
        /*
         * rcu_update_flag is set if we interrupted the CPU
         * Once this occurs, we keep track of interrupt nesting
         * because a NMI/SMI could also come in, and we still
         * only want the IRQ that started the increment of the
-        * dynticks_progress_counter to be the one that modifies
+        * rcu_dyntick_sched.dynticks to be the one that modifies
         * it on exit.
         */
        if (per_cpu(rcu_update_flag, cpu)) {
  
                /*
                 * If an NMI/SMI happens now we are still
-                * protected by the dynticks_progress_counter being odd.
+                * protected by the rcu_dyntick_sched.dynticks being odd.
                 */
  
                /*
                 * The following memory barrier ensures that any
                 * rcu_read_unlock() primitives in the irq handler
                 * are seen by other CPUs to precede the following
-                * increment to dynticks_progress_counter. This
+                * increment to rcu_dyntick_sched.dynticks. This
                 * is required in order for other CPUs to determine
                 * when it is safe to advance the RCU grace-period
                 * state machine.
                 */
                smp_mb(); /* see above block comment. */
-               per_cpu(dynticks_progress_counter, cpu)++;
-               WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+               rdssp->dynticks++;
+               WARN_ON(rdssp->dynticks & 0x1);
        }
  }
  
  static void dyntick_save_progress_counter(int cpu)
  {
-       per_cpu(rcu_dyntick_snapshot, cpu) =
-               per_cpu(dynticks_progress_counter, cpu);
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       rdssp->dynticks_snap = rdssp->dynticks;
  }
  
  static inline int
@@@ -544,9 -571,10 +571,10 @@@ rcu_try_flip_waitack_needed(int cpu
  {
        long curr;
        long snap;
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
-       curr = per_cpu(dynticks_progress_counter, cpu);
-       snap = per_cpu(rcu_dyntick_snapshot, cpu);
+       curr = rdssp->dynticks;
+       snap = rdssp->dynticks_snap;
        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  
        /*
         * that this CPU already acknowledged the counter.
         */
  
-       if ((curr - snap) > 2 || (snap & 0x1) == 0)
+       if ((curr - snap) > 2 || (curr & 0x1) == 0)
                return 0;
  
        /* We need this CPU to explicitly acknowledge the counter flip. */
@@@ -580,9 -608,10 +608,10 @@@ rcu_try_flip_waitmb_needed(int cpu
  {
        long curr;
        long snap;
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
  
-       curr = per_cpu(dynticks_progress_counter, cpu);
-       snap = per_cpu(rcu_dyntick_snapshot, cpu);
+       curr = rdssp->dynticks;
+       snap = rdssp->dynticks_snap;
        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
  
        /*
        return 1;
  }
  
+ static void dyntick_save_progress_counter_sched(int cpu)
+ {
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       rdssp->sched_dynticks_snap = rdssp->dynticks;
+ }
+ static int rcu_qsctr_inc_needed_dyntick(int cpu)
+ {
+       long curr;
+       long snap;
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       curr = rdssp->dynticks;
+       snap = rdssp->sched_dynticks_snap;
+       smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+       /*
+        * If the CPU remained in dynticks mode for the entire time
+        * and didn't take any interrupts, NMIs, SMIs, or whatever,
+        * then it cannot be in the middle of an rcu_read_lock(), so
+        * the next rcu_read_lock() it executes must use the new value
+        * of the counter.  Therefore, this CPU has been in a quiescent
+        * state the entire time, and we don't need to wait for it.
+        */
+       if ((curr == snap) && ((curr & 0x1) == 0))
+               return 0;
+       /*
+        * If the CPU passed through or entered a dynticks idle phase with
+        * no active irq handlers, then, as above, this CPU has already
+        * passed through a quiescent state.
+        */
+       if ((curr - snap) > 2 || (snap & 0x1) == 0)
+               return 0;
+       /* We need this CPU to go through a quiescent state. */
+       return 1;
+ }
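As a rough illustration of the two tests above, a minimal user-space model follows; the function name and the example values are made up for illustration, and it ignores the smp_mb() ordering that the kernel version depends on:

/* Returns nonzero if the grace period must still wait on this CPU. */
static int needs_quiescent_state(long curr, long snap)
{
	if (curr == snap && (curr & 0x1) == 0)
		return 0;	/* even and unchanged: dynticks-idle throughout */
	if ((curr - snap) > 2 || (snap & 0x1) == 0)
		return 0;	/* passed through (or started in) dynticks idle */
	return 1;		/* busy the whole time: wait for a quiescent state */
}

/*
 * Example values:  curr=8,  snap=8  -> 0 (idle the whole time)
 *                  curr=11, snap=7  -> 0 (counter advanced past an idle period)
 *                  curr=9,  snap=9  -> 1 (stayed busy, no idle period seen)
 */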
  #else /* !CONFIG_NO_HZ */
  
- # define dyntick_save_progress_counter(cpu)   do { } while (0)
- # define rcu_try_flip_waitack_needed(cpu)     (1)
- # define rcu_try_flip_waitmb_needed(cpu)      (1)
+ # define dyntick_save_progress_counter(cpu)           do { } while (0)
+ # define rcu_try_flip_waitack_needed(cpu)             (1)
+ # define rcu_try_flip_waitmb_needed(cpu)              (1)
+ # define dyntick_save_progress_counter_sched(cpu)     do { } while (0)
+ # define rcu_qsctr_inc_needed_dyntick(cpu)            (1)
  
  #endif /* CONFIG_NO_HZ */
  
+ static void save_qsctr_sched(int cpu)
+ {
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       rdssp->sched_qs_snap = rdssp->sched_qs;
+ }
+ static inline int rcu_qsctr_inc_needed(int cpu)
+ {
+       struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+       /*
+        * If there has been a quiescent state, no more need to wait
+        * on this CPU.
+        */
+       if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+               smp_mb(); /* force ordering with cpu entering schedule(). */
+               return 0;
+       }
+       /* We need this CPU to go through a quiescent state. */
+       return 1;
+ }
  /*
   * Get here when RCU is idle.  Decide whether we need to
   * move out of idle state, and return non-zero if so.
@@@ -655,7 -756,7 +756,7 @@@ rcu_try_flip_idle(void
  
        /* Now ask each CPU for acknowledgement of the flip. */
  
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
                dyntick_save_progress_counter(cpu);
        }
@@@ -673,7 -774,7 +774,7 @@@ rcu_try_flip_waitack(void
        int cpu;
  
        RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map)
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitack_needed(cpu) &&
                    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@@ -705,7 -806,7 +806,7 @@@ rcu_try_flip_waitzero(void
        /* Check to see if the sum of the "last" counters is zero. */
  
        RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map)
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
        if (sum != 0) {
                RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
        smp_mb();  /*  ^^^^^^^^^^^^ */
  
        /* Call for a memory barrier from each CPU. */
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
                dyntick_save_progress_counter(cpu);
        }
@@@ -740,7 -841,7 +841,7 @@@ rcu_try_flip_waitmb(void
        int cpu;
  
        RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 -      for_each_cpu_mask(cpu, rcu_cpu_online_map)
 +      for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitmb_needed(cpu) &&
                    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@@ -819,6 -920,26 +920,26 @@@ void rcu_check_callbacks(int cpu, int u
        unsigned long flags;
        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
  
+       /*
+        * If this CPU took its interrupt from user mode or from the
+        * idle loop, and this is not a nested interrupt, then
+        * this CPU has to have exited all prior preempt-disable
+        * sections of code.  So increment the counter to note this.
+        *
+        * The memory barrier is needed to handle the case where
+        * writes from a preempt-disable section of code get reordered
+        * into schedule() by this CPU's write buffer.  So the memory
+        * barrier makes sure that the rcu_qsctr_inc() is seen by other
+        * CPUs to happen after any such write.
+        */
+       if (user ||
+           (idle_cpu(cpu) && !in_softirq() &&
+            hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+               smp_mb();       /* Guard against aggressive schedule(). */
+               rcu_qsctr_inc(cpu);
+       }
        rcu_check_mb(cpu);
        if (rcu_ctrlblk.completed == rdp->completed)
                rcu_try_flip();
@@@ -869,6 -990,8 +990,8 @@@ void rcu_offline_cpu(int cpu
        struct rcu_head *list = NULL;
        unsigned long flags;
        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+       struct rcu_head *schedlist = NULL;
+       struct rcu_head **schedtail = &schedlist;
        struct rcu_head **tail = &list;
  
        /*
                rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
                                                list, tail);
        rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+       rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+                               schedlist, schedtail);
+       rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+                               schedlist, schedtail);
+       rdp->rcu_sched_sleeping = 0;
        spin_unlock_irqrestore(&rdp->lock, flags);
        rdp->waitlistcount = 0;
  
         * fix.
         */
  
-       local_irq_save(flags);
+       local_irq_save(flags);  /* disable preempt till we know which lock. */
        rdp = RCU_DATA_ME();
        spin_lock(&rdp->lock);
        *rdp->nexttail = list;
        if (list)
                rdp->nexttail = tail;
+       *rdp->nextschedtail = schedlist;
+       if (schedlist)
+               rdp->nextschedtail = schedtail;
        spin_unlock_irqrestore(&rdp->lock, flags);
  }
  
- void __devinit rcu_online_cpu(int cpu)
+ #else /* #ifdef CONFIG_HOTPLUG_CPU */
+ void rcu_offline_cpu(int cpu)
+ {
+ }
+ #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+ void __cpuinit rcu_online_cpu(int cpu)
  {
        unsigned long flags;
+       struct rcu_data *rdp;
  
        spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
        cpu_set(cpu, rcu_cpu_online_map);
        spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
- }
  
- #else /* #ifdef CONFIG_HOTPLUG_CPU */
- void rcu_offline_cpu(int cpu)
- {
- }
+       /*
+        * The rcu_sched grace-period processing might have bypassed
+        * this CPU, given that it was not in the rcu_cpu_online_map
+        * when the grace-period scan started.  This means that the
+        * grace-period task might sleep.  So make sure that if this
+        * should happen, the first callback posted to this CPU will
+        * wake up the grace-period task if need be.
+        */
  
- void __devinit rcu_online_cpu(int cpu)
- {
+       rdp = RCU_DATA_CPU(cpu);
+       spin_lock_irqsave(&rdp->lock, flags);
+       rdp->rcu_sched_sleeping = 1;
+       spin_unlock_irqrestore(&rdp->lock, flags);
  }
  
- #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
  static void rcu_process_callbacks(struct softirq_action *unused)
  {
        unsigned long flags;
@@@ -986,31 -1128,196 +1128,196 @@@ void call_rcu(struct rcu_head *head, vo
        *rdp->nexttail = head;
        rdp->nexttail = &head->next;
        RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
-       spin_unlock(&rdp->lock);
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&rdp->lock, flags);
  }
  EXPORT_SYMBOL_GPL(call_rcu);
  
+ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+ {
+       unsigned long flags;
+       struct rcu_data *rdp;
+       int wake_gp = 0;
+       head->func = func;
+       head->next = NULL;
+       local_irq_save(flags);
+       rdp = RCU_DATA_ME();
+       spin_lock(&rdp->lock);
+       *rdp->nextschedtail = head;
+       rdp->nextschedtail = &head->next;
+       if (rdp->rcu_sched_sleeping) {
+               /* Grace-period processing might be sleeping... */
+               rdp->rcu_sched_sleeping = 0;
+               wake_gp = 1;
+       }
+       spin_unlock_irqrestore(&rdp->lock, flags);
+       if (wake_gp) {
+               /* Wake up grace-period processing, unless someone beat us. */
+               spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+               if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+                       wake_gp = 0;
+               rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+               spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+               if (wake_gp)
+                       wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+       }
+ }
+ EXPORT_SYMBOL_GPL(call_rcu_sched);
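For context, a minimal sketch of how a caller would use the new interface; struct my_obj, my_obj_free() and my_obj_release() are hypothetical names, not part of this patch, and a real user would also need <linux/rcupdate.h> and <linux/slab.h>:

struct my_obj {
	int data;
	struct rcu_head rcu;
};

static void my_obj_free(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

/*
 * Called after obj has been unlinked from all reader-visible structures;
 * the actual kfree() is deferred until every preempt-disabled (and
 * irq-disabled) region that might still reference obj has completed.
 */
static void my_obj_release(struct my_obj *obj)
{
	call_rcu_sched(&obj->rcu, my_obj_free);
}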
  /*
   * Wait until all currently running preempt_disable() code segments
   * (including hardware-irq-disable segments) complete.  Note that
   * in -rt this does -not- necessarily result in all currently executing
   * interrupt -handlers- having completed.
   */
- void __synchronize_sched(void)
+ synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+ EXPORT_SYMBOL_GPL(__synchronize_sched);
+ /*
+  * kthread function that manages call_rcu_sched grace periods.
+  */
+ static int rcu_sched_grace_period(void *arg)
  {
-       cpumask_t oldmask;
+       int couldsleep;         /* might sleep after current pass. */
+       int couldsleepnext = 0; /* might sleep after next pass. */
        int cpu;
+       unsigned long flags;
+       struct rcu_data *rdp;
+       int ret;
  
-       if (sched_getaffinity(0, &oldmask) < 0)
-               oldmask = cpu_possible_map;
-       for_each_online_cpu(cpu) {
-               sched_setaffinity(0, &cpumask_of_cpu(cpu));
-               schedule();
-       }
-       sched_setaffinity(0, &oldmask);
+       /*
+        * Each pass through the following loop handles one
+        * rcu_sched grace period cycle.
+        */
+       do {
+               /* Save each CPU's current state. */
+               for_each_online_cpu(cpu) {
+                       dyntick_save_progress_counter_sched(cpu);
+                       save_qsctr_sched(cpu);
+               }
+               /*
+                * Sleep for about an RCU grace-period's worth to
+                * allow better batching and to consume less CPU.
+                */
+               schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+               /*
+                * If there was nothing to do last time, prepare to
+                * sleep at the end of the current grace period cycle.
+                */
+               couldsleep = couldsleepnext;
+               couldsleepnext = 1;
+               if (couldsleep) {
+                       spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+                       rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+                       spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+               }
+               /*
+                * Wait on each CPU in turn to have either visited
+                * a quiescent state or been in dynticks-idle mode.
+                */
+               for_each_online_cpu(cpu) {
+                       while (rcu_qsctr_inc_needed(cpu) &&
+                              rcu_qsctr_inc_needed_dyntick(cpu)) {
+                               /* resched_cpu(cpu); @@@ */
+                               schedule_timeout_interruptible(1);
+                       }
+               }
+               /* Advance callbacks for each CPU.  */
+               for_each_online_cpu(cpu) {
+                       rdp = RCU_DATA_CPU(cpu);
+                       spin_lock_irqsave(&rdp->lock, flags);
+                       /*
+                        * We are running on this CPU irq-disabled, so no
+                        * CPU can go offline until we re-enable irqs.
+                        * The current CPU might have already gone
+                        * offline (between the for_each_online_cpu and
+                        * the spin_lock_irqsave), but in that case all its
+                        * callback lists will be empty, so no harm done.
+                        *
+                        * Advance the callbacks!  We share normal RCU's
+                        * donelist, since callbacks are invoked the
+                        * same way in either case.
+                        */
+                       if (rdp->waitschedlist != NULL) {
+                               *rdp->donetail = rdp->waitschedlist;
+                               rdp->donetail = rdp->waitschedtail;
+                               /*
+                                * Next rcu_check_callbacks() will
+                                * do the required raise_softirq().
+                                */
+                       }
+                       if (rdp->nextschedlist != NULL) {
+                               rdp->waitschedlist = rdp->nextschedlist;
+                               rdp->waitschedtail = rdp->nextschedtail;
+                               couldsleep = 0;
+                               couldsleepnext = 0;
+                       } else {
+                               rdp->waitschedlist = NULL;
+                               rdp->waitschedtail = &rdp->waitschedlist;
+                       }
+                       rdp->nextschedlist = NULL;
+                       rdp->nextschedtail = &rdp->nextschedlist;
+                       /* Mark sleep intention. */
+                       rdp->rcu_sched_sleeping = couldsleep;
+                       spin_unlock_irqrestore(&rdp->lock, flags);
+               }
+               /* If we saw callbacks on the last scan, go deal with them. */
+               if (!couldsleep)
+                       continue;
+               /* Attempt to block... */
+               spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+               if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+                       /*
+                        * Someone posted a callback after we scanned.
+                        * Go take care of it.
+                        */
+                       spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+                       couldsleepnext = 0;
+                       continue;
+               }
+               /* Block until the next person posts a callback. */
+               rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+               spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+               ret = 0;
+               __wait_event_interruptible(rcu_ctrlblk.sched_wq,
+                       rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+                       ret);
+               /*
+                * Signals would prevent us from sleeping, and we cannot
+                * do much with them in any case.  So flush them.
+                */
+               if (ret)
+                       flush_signals(current);
+               couldsleepnext = 0;
+       } while (!kthread_should_stop());
+       return (0);
  }
- EXPORT_SYMBOL_GPL(__synchronize_sched);
  
  /*
   * Check to see if any future RCU-related work will need to be done
@@@ -1027,7 -1334,9 +1334,9 @@@ int rcu_needs_cpu(int cpu
  
        return (rdp->donelist != NULL ||
                !!rdp->waitlistcount ||
-               rdp->nextlist != NULL);
+               rdp->nextlist != NULL ||
+               rdp->nextschedlist != NULL ||
+               rdp->waitschedlist != NULL);
  }
  
  int rcu_pending(int cpu)
  
        if (rdp->donelist != NULL ||
            !!rdp->waitlistcount ||
-           rdp->nextlist != NULL)
+           rdp->nextlist != NULL ||
+           rdp->nextschedlist != NULL ||
+           rdp->waitschedlist != NULL)
                return 1;
  
        /* The RCU core needs an acknowledgement from this CPU. */
@@@ -1105,6 -1416,11 +1416,11 @@@ void __init __rcu_init(void
                rdp->donetail = &rdp->donelist;
                rdp->rcu_flipctr[0] = 0;
                rdp->rcu_flipctr[1] = 0;
+               rdp->nextschedlist = NULL;
+               rdp->nextschedtail = &rdp->nextschedlist;
+               rdp->waitschedlist = NULL;
+               rdp->waitschedtail = &rdp->waitschedlist;
+               rdp->rcu_sched_sleeping = 0;
        }
        register_cpu_notifier(&rcu_nb);
  
        for_each_online_cpu(cpu)
                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
  
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  }
  
  /*
-  * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+  * Late-boot-time RCU initialization that must wait until after the
+  * scheduler has been initialized.
   */
- void synchronize_kernel(void)
+ void __init rcu_init_sched(void)
  {
-       synchronize_rcu();
+       rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+                                                 NULL,
+                                                 "rcu_sched_grace_period");
+       WARN_ON(IS_ERR(rcu_sched_grace_period_task));
  }
  
  #ifdef CONFIG_RCU_TRACE
diff --combined kernel/sched.c
  #include <linux/bootmem.h>
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
+ #include <linux/ftrace.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
  
+ #include "sched_cpupri.h"
  /*
   * Convert user-nice values [ -20 ... 0 ... 19 ]
   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@@ -289,15 -292,15 +292,15 @@@ struct task_group root_task_group
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
  static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
- #endif
- #else
+ #endif /* CONFIG_RT_GROUP_SCHED */
+ #else /* !CONFIG_FAIR_GROUP_SCHED */
  #define root_task_group init_task_group
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@@ -307,9 -310,9 +310,9 @@@ static DEFINE_SPINLOCK(task_group_lock)
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
- #else
+ #else /* !CONFIG_USER_SCHED */
  # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
- #endif
+ #endif /* CONFIG_USER_SCHED */
  
  /*
   * A weight of 0 or 1 can cause arithmetics problems.
@@@ -363,6 -366,10 +366,10 @@@ static inline void set_task_rq(struct t
  #else
  
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+ static inline struct task_group *task_group(struct task_struct *p)
+ {
+       return NULL;
+ }
  
  #endif        /* CONFIG_GROUP_SCHED */
  
@@@ -373,6 -380,7 +380,7 @@@ struct cfs_rq 
  
        u64 exec_clock;
        u64 min_vruntime;
+       u64 pair_start;
  
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
+ #ifdef CONFIG_SMP
+       /*
+        * the part of load.weight contributed by tasks
+        */
+       unsigned long task_weight;
+       /*
+        *   h_load = weight * f(tg)
+        *
+        * Where f(tg) is the recursive weight fraction assigned to
+        * this group.
+        */
+       unsigned long h_load;
+       /*
+        * this cpu's part of tg->shares
+        */
+       unsigned long shares;
+       /*
+        * load.weight at the time we set shares
+        */
+       unsigned long rq_weight;
+ #endif
  #endif
  };
  
@@@ -452,6 -485,9 +485,9 @@@ struct root_domain 
         */
        cpumask_t rto_mask;
        atomic_t rto_count;
+ #ifdef CONFIG_SMP
+       struct cpupri cpupri;
+ #endif
  };
  
  /*
@@@ -526,6 -562,9 +562,9 @@@ struct rq 
        int push_cpu;
        /* cpu of this runqueue: */
        int cpu;
+       int online;
+       unsigned long avg_load_per_task;
  
        struct task_struct *migration_thread;
        struct list_head migration_queue;
@@@ -607,6 -646,24 +646,24 @@@ static inline void update_rq_clock(stru
  # define const_debug static const
  #endif
  
+ /**
+  * runqueue_is_locked
+  *
+  * Returns true if the current cpu runqueue is locked.
+  * This interface allows printk to be called with the runqueue lock
+  * held and know whether or not it is OK to wake up the klogd.
+  */
+ int runqueue_is_locked(void)
+ {
+       int cpu = get_cpu();
+       struct rq *rq = cpu_rq(cpu);
+       int ret;
+       ret = spin_is_locked(&rq->lock);
+       put_cpu();
+       return ret;
+ }
  /*
   * Debugging: various feature bits
   */
@@@ -748,6 -805,12 +805,12 @@@ late_initcall(sched_init_debug)
   */
  const_debug unsigned int sysctl_sched_nr_migrate = 32;
  
+ /*
+  * ratelimit for updating the group shares.
+  * default: 0.5ms
+  */
+ const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
  /*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
@@@ -775,82 -838,6 +838,6 @@@ static inline u64 global_rt_runtime(voi
        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
  }
  
- unsigned long long time_sync_thresh = 100000;
- static DEFINE_PER_CPU(unsigned long long, time_offset);
- static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
- /*
-  * Global lock which we take every now and then to synchronize
-  * the CPUs time. This method is not warp-safe, but it's good
-  * enough to synchronize slowly diverging time sources and thus
-  * it's good enough for tracing:
-  */
- static DEFINE_SPINLOCK(time_sync_lock);
- static unsigned long long prev_global_time;
- static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
- {
-       /*
-        * We want this inlined, to not get tracer function calls
-        * in this critical section:
-        */
-       spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
-       __raw_spin_lock(&time_sync_lock.raw_lock);
-       if (time < prev_global_time) {
-               per_cpu(time_offset, cpu) += prev_global_time - time;
-               time = prev_global_time;
-       } else {
-               prev_global_time = time;
-       }
-       __raw_spin_unlock(&time_sync_lock.raw_lock);
-       spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
-       return time;
- }
- static unsigned long long __cpu_clock(int cpu)
- {
-       unsigned long long now;
-       /*
-        * Only call sched_clock() if the scheduler has already been
-        * initialized (some code might call cpu_clock() very early):
-        */
-       if (unlikely(!scheduler_running))
-               return 0;
-       now = sched_clock_cpu(cpu);
-       return now;
- }
- /*
-  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
-  * clock constructed from sched_clock():
-  */
- unsigned long long cpu_clock(int cpu)
- {
-       unsigned long long prev_cpu_time, time, delta_time;
-       unsigned long flags;
-       local_irq_save(flags);
-       prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-       time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
-       delta_time = time-prev_cpu_time;
-       if (unlikely(delta_time > time_sync_thresh)) {
-               time = __sync_cpu_clock(time, cpu);
-               per_cpu(prev_cpu_time, cpu) = time;
-       }
-       local_irq_restore(flags);
-       return time;
- }
- EXPORT_SYMBOL_GPL(cpu_clock);
  #ifndef prepare_arch_switch
  # define prepare_arch_switch(next)    do { } while (0)
  #endif
@@@ -1313,15 -1300,15 +1300,15 @@@ void wake_up_idle_cpu(int cpu
        if (!tsk_is_polling(rq->idle))
                smp_send_reschedule(cpu);
  }
- #endif
+ #endif /* CONFIG_NO_HZ */
  
- #else
+ #else /* !CONFIG_SMP */
  static void __resched_task(struct task_struct *p, int tif_bit)
  {
        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_thread_flag(p, tif_bit);
  }
- #endif
+ #endif /* CONFIG_SMP */
  
  #if BITS_PER_LONG == 32
  # define WMULT_CONST  (~0UL)
   */
  #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
  
+ /*
+  * delta *= weight / lw
+  */
  static unsigned long
  calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
  }
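As a quick sanity check of the formula above (illustrative numbers only): with weight = NICE_0_LOAD = 1024 and lw->weight = 2048 (say, two nice-0 entities), a delta_exec of 4000000 ns scales to roughly 4000000 * 1024 / 2048 = 2000000 ns; the precomputed lw->inv_weight and the rounding shift are just a way to avoid the 64-bit division.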
  
- static inline unsigned long
- calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
- {
-       return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
- }
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
        lw->weight += inc;
@@@ -1479,17 -1463,211 +1463,211 @@@ static inline void dec_cpu_load(struct 
  #ifdef CONFIG_SMP
  static unsigned long source_load(int cpu, int type);
  static unsigned long target_load(int cpu, int type);
- static unsigned long cpu_avg_load_per_task(int cpu);
  static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- #else /* CONFIG_SMP */
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       if (rq->nr_running)
+               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       return rq->avg_load_per_task;
+ }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
- static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ /*
+  * Iterate the full tree, calling @down when first entering a node and @up when
+  * leaving it for the final time.
+  */
+ static void
+ walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
  {
+       struct task_group *parent, *child;
+       rcu_read_lock();
+       parent = &root_task_group;
+ down:
+       (*down)(parent, cpu, sd);
+       list_for_each_entry_rcu(child, &parent->children, siblings) {
+               parent = child;
+               goto down;
+ up:
+               continue;
+       }
+       (*up)(parent, cpu, sd);
+       child = parent;
+       parent = parent->parent;
+       if (parent)
+               goto up;
+       rcu_read_unlock();
  }
+ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+ /*
+  * Calculate and set the cpu's group shares.
+  */
+ static void
+ __update_group_shares_cpu(struct task_group *tg, int cpu,
+                         unsigned long sd_shares, unsigned long sd_rq_weight)
+ {
+       int boost = 0;
+       unsigned long shares;
+       unsigned long rq_weight;
+       if (!tg->se[cpu])
+               return;
+       rq_weight = tg->cfs_rq[cpu]->load.weight;
+       /*
+        * If there are currently no tasks on the cpu, pretend there is one of
+        * average load so that when a new task gets to run here it will not
+        * get delayed by group starvation.
+        */
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
+       if (unlikely(rq_weight > sd_rq_weight))
+               rq_weight = sd_rq_weight;
+       /*
+        *           \Sum shares * rq_weight
+        * shares =  -----------------------
+        *               \Sum rq_weight
+        *
+        */
+       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       /*
+        * record the actual number of shares, not the boosted amount.
+        */
+       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+       tg->cfs_rq[cpu]->rq_weight = rq_weight;
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       else if (shares > MAX_SHARES)
+               shares = MAX_SHARES;
+       __set_se_shares(tg->se[cpu], shares);
+ }
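A worked example for the formula above (numbers are purely illustrative): with sd_shares = 1024, this cpu's rq_weight = 1024 and sd_rq_weight = 4096, shares = 1024 * 1024 / (4096 + 1), roughly 255 -- this cpu gets about a quarter of the group's shares because it carries about a quarter of the group's load in this domain; the result is then clamped to [MIN_SHARES, MAX_SHARES] before being applied.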
+ /*
+  * Re-compute the task group's per-cpu shares over the given domain.
+  * This needs to be done in a bottom-up fashion because the rq weight of a
+  * parent group depends on the shares of its child groups.
+  */
+ static void
+ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ {
+       unsigned long rq_weight = 0;
+       unsigned long shares = 0;
+       int i;
+       for_each_cpu_mask(i, sd->span) {
+               rq_weight += tg->cfs_rq[i]->load.weight;
+               shares += tg->cfs_rq[i]->shares;
+       }
+       if ((!shares && rq_weight) || shares > tg->shares)
+               shares = tg->shares;
+       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+               shares = tg->shares;
+       if (!rq_weight)
+               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+       for_each_cpu_mask(i, sd->span) {
+               struct rq *rq = cpu_rq(i);
+               unsigned long flags;
+               spin_lock_irqsave(&rq->lock, flags);
+               __update_group_shares_cpu(tg, i, shares, rq_weight);
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
+ }
+ /*
+  * Compute the cpu's hierarchical load factor for each task group.
+  * This needs to be done in a top-down fashion because the load of a child
+  * group is a fraction of its parent's load.
+  */
+ static void
+ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ {
+       unsigned long load;
+       if (!tg->parent) {
+               load = cpu_rq(cpu)->load.weight;
+       } else {
+               load = tg->parent->cfs_rq[cpu]->h_load;
+               load *= tg->cfs_rq[cpu]->shares;
+               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+       }
+       tg->cfs_rq[cpu]->h_load = load;
+ }
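For a non-root group the recursion above works out as, for example (illustrative numbers): parent h_load = 2048, this group's cfs_rq[cpu]->shares = 512, parent cfs_rq[cpu]->load.weight = 2048, giving h_load = 2048 * 512 / (2048 + 1), roughly 511 -- about a quarter of the parent's hierarchical load, matching the quarter of the parent's queue weight that this group contributes.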
+ static void
+ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+ {
+ }
+ static void update_shares(struct sched_domain *sd)
+ {
+       u64 now = cpu_clock(raw_smp_processor_id());
+       s64 elapsed = now - sd->last_update;
+       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+               sd->last_update = now;
+               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+       }
+ }
+ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+ {
+       spin_unlock(&rq->lock);
+       update_shares(sd);
+       spin_lock(&rq->lock);
+ }
+ static void update_h_load(int cpu)
+ {
+       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+ }
+ #else
+ static inline void update_shares(struct sched_domain *sd)
+ {
+ }
+ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+ {
+ }
  #endif
  
- #endif /* CONFIG_SMP */
+ #endif
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ {
+ #ifdef CONFIG_SMP
+       cfs_rq->shares = shares;
+ #endif
+ }
+ #endif
  
  #include "sched_stats.h"
  #include "sched_idletask.c"
  #endif
  
  #define sched_class_highest (&rt_sched_class)
+ #define for_each_class(class) \
+    for (class = sched_class_highest; class; class = class->next)
  
- static inline void inc_load(struct rq *rq, const struct task_struct *p)
- {
-       update_load_add(&rq->load, p->se.load.weight);
- }
- static inline void dec_load(struct rq *rq, const struct task_struct *p)
- {
-       update_load_sub(&rq->load, p->se.load.weight);
- }
- static void inc_nr_running(struct task_struct *p, struct rq *rq)
+ static void inc_nr_running(struct rq *rq)
  {
        rq->nr_running++;
-       inc_load(rq, p);
  }
  
- static void dec_nr_running(struct task_struct *p, struct rq *rq)
+ static void dec_nr_running(struct rq *rq)
  {
        rq->nr_running--;
-       dec_load(rq, p);
  }
  
  static void set_load_weight(struct task_struct *p)
        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
  }
  
+ static void update_avg(u64 *avg, u64 sample)
+ {
+       s64 diff = sample - *avg;
+       *avg += diff >> 3;
+ }
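update_avg() is an exponential moving average with weight 1/8: for example, with *avg = 1000 and sample = 2000, diff = 1000 and *avg becomes 1000 + (1000 >> 3) = 1125, so each new sample pulls the average an eighth of the way toward itself.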
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
        sched_info_queued(p);
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
+       if (sleep && p->se.last_wakeup) {
+               update_avg(&p->se.avg_overlap,
+                          p->se.sum_exec_runtime - p->se.last_wakeup);
+               p->se.last_wakeup = 0;
+       }
+       sched_info_dequeued(p);
        p->sched_class->dequeue_task(rq, p, sleep);
        p->se.on_rq = 0;
  }
@@@ -1612,7 -1793,7 +1793,7 @@@ static void activate_task(struct rq *rq
                rq->nr_uninterruptible--;
  
        enqueue_task(rq, p, wakeup);
-       inc_nr_running(p, rq);
+       inc_nr_running(rq);
  }
  
  /*
@@@ -1624,7 -1805,7 +1805,7 @@@ static void deactivate_task(struct rq *
                rq->nr_uninterruptible++;
  
        dequeue_task(rq, p, sleep);
-       dec_nr_running(p, rq);
+       dec_nr_running(rq);
  }
  
  /**
@@@ -1636,12 -1817,6 +1817,6 @@@ inline int task_curr(const struct task_
        return cpu_curr(task_cpu(p)) == p;
  }
  
- /* Used instead of source_load when we know the type == 0 */
- unsigned long weighted_cpuload(const int cpu)
- {
-       return cpu_rq(cpu)->load.weight;
- }
  static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  {
        set_task_rq(p, cpu);
@@@ -1670,6 -1845,12 +1845,12 @@@ static inline void check_class_changed(
  
  #ifdef CONFIG_SMP
  
+ /* Used instead of source_load when we know the type == 0 */
+ static unsigned long weighted_cpuload(const int cpu)
+ {
+       return cpu_rq(cpu)->load.weight;
+ }
  /*
   * Is this task likely cache-hot:
   */
@@@ -1880,7 -2061,7 +2061,7 @@@ static unsigned long source_load(int cp
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
  
-       if (type == 0)
+       if (type == 0 || !sched_feat(LB_BIAS))
                return total;
  
        return min(rq->cpu_load[type-1], total);
@@@ -1895,24 -2076,12 +2076,12 @@@ static unsigned long target_load(int cp
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
  
-       if (type == 0)
+       if (type == 0 || !sched_feat(LB_BIAS))
                return total;
  
        return max(rq->cpu_load[type-1], total);
  }
  
- /*
-  * Return the average load per task on the cpu's run queue
-  */
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
-       unsigned long n = rq->nr_running;
-       return n ? total / n : SCHED_LOAD_SCALE;
- }
  /*
   * find_idlest_group finds and returns the least busy CPU group within the
   * domain.
@@@ -1939,7 -2108,7 +2108,7 @@@ find_idlest_group(struct sched_domain *
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
  
 -              for_each_cpu_mask(i, group->cpumask) {
 +              for_each_cpu_mask_nr(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@@ -1981,7 -2150,7 +2150,7 @@@ find_idlest_cpu(struct sched_group *gro
        /* Traverse only the allowed CPUs */
        cpus_and(*tmp, group->cpumask, p->cpus_allowed);
  
 -      for_each_cpu_mask(i, *tmp) {
 +      for_each_cpu_mask_nr(i, *tmp) {
                load = weighted_cpuload(i);
  
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2019,6 -2188,9 +2188,9 @@@ static int sched_balance_self(int cpu, 
                        sd = tmp;
        }
  
+       if (sd)
+               update_shares(sd);
        while (sd) {
                cpumask_t span, tmpmask;
                struct sched_group *group;
@@@ -2085,6 -2257,22 +2257,22 @@@ static int try_to_wake_up(struct task_s
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
  
+ #ifdef CONFIG_SMP
+       if (sched_feat(LB_WAKEUP_UPDATE)) {
+               struct sched_domain *sd;
+               this_cpu = raw_smp_processor_id();
+               cpu = task_cpu(p);
+               for_each_domain(this_cpu, sd) {
+                       if (cpu_isset(cpu, sd->span)) {
+                               update_shares(sd);
+                               break;
+                       }
+               }
+       }
+ #endif
        smp_wmb();
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
                        }
                }
        }
- #endif
+ #endif /* CONFIG_SCHEDSTATS */
  
  out_activate:
  #endif /* CONFIG_SMP */
        success = 1;
  
  out_running:
+       trace_mark(kernel_sched_wakeup,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
  
        p->state = TASK_RUNNING;
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
+       current->se.last_wakeup = current->se.sum_exec_runtime;
        task_rq_unlock(rq, &flags);
  
        return success;
@@@ -2277,8 -2470,11 +2470,11 @@@ void wake_up_new_task(struct task_struc
                 * management (if any):
                 */
                p->sched_class->task_new(rq, p);
-               inc_nr_running(p, rq);
+               inc_nr_running(rq);
        }
+       trace_mark(kernel_sched_wakeup_new,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2331,7 -2527,7 +2527,7 @@@ fire_sched_out_preempt_notifiers(struc
                notifier->ops->sched_out(notifier, next);
  }
  
- #else
+ #else /* !CONFIG_PREEMPT_NOTIFIERS */
  
  static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
@@@ -2343,7 -2539,7 +2539,7 @@@ fire_sched_out_preempt_notifiers(struc
  {
  }
  
- #endif
+ #endif /* CONFIG_PREEMPT_NOTIFIERS */
  
  /**
   * prepare_task_switch - prepare to switch tasks
@@@ -2451,6 -2647,11 +2647,11 @@@ context_switch(struct rq *rq, struct ta
        struct mm_struct *mm, *oldmm;
  
        prepare_task_switch(rq, prev, next);
+       trace_mark(kernel_sched_schedule,
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               prev->pid, next->pid, prev->state,
+               rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@@ -2785,7 -2986,7 +2986,7 @@@ balance_tasks(struct rq *this_rq, int t
              enum cpu_idle_type idle, int *all_pinned,
              int *this_best_prio, struct rq_iterator *iterator)
  {
-       int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+       int loops = 0, pulled = 0, pinned = 0;
        struct task_struct *p;
        long rem_load_move = max_load_move;
  
  next:
        if (!p || loops++ > sysctl_sched_nr_migrate)
                goto out;
-       /*
-        * To help distribute high priority tasks across CPUs we don't
-        * skip a task if it will be the highest priority task (i.e. smallest
-        * prio value) on its new queue regardless of its load weight
-        */
-       skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
-                                                        SCHED_LOAD_SCALE_FUZZ;
-       if ((skip_for_load && p->prio >= *this_best_prio) ||
+       if ((p->se.load.weight >> 1) > rem_load_move ||
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
                p = iterator->next(iterator->arg);
                goto next;
@@@ -2863,6 -3058,10 +3058,10 @@@ static int move_tasks(struct rq *this_r
                                max_load_move - total_load_moved,
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
+               if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+                       break;
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -2939,6 -3138,7 +3138,7 @@@ find_busiest_group(struct sched_domain 
        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;
        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
                int __group_imb = 0;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
+               unsigned long sum_avg_load_per_task;
+               unsigned long avg_load_per_task;
  
                local_group = cpu_isset(this_cpu, group->cpumask);
  
  
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
+               sum_avg_load_per_task = avg_load_per_task = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
  
 -              for_each_cpu_mask(i, group->cpumask) {
 +              for_each_cpu_mask_nr(i, group->cpumask) {
                        struct rq *rq;
  
                        if (!cpu_isset(i, *cpus))
                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);
+                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
                }
  
                /*
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);
  
-               if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+               /*
+                * Consider the group unbalanced when the imbalance is larger
+                * than the average weight of two tasks.
+                *
+                * APZ: with cgroup the avg task weight can vary wildly and
+                *      might not be a suitable number - should we keep a
+                *      normalized nr_running number somewhere that negates
+                *      the hierarchy?
+                */
+               avg_load_per_task = sg_div_cpu_power(group,
+                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
+               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                        __group_imb = 1;
  
                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@@ -3156,9 -3375,9 +3375,9 @@@ small_imbalance
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
-                       this_load_per_task = SCHED_LOAD_SCALE;
+                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
  
-               if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+               if (max_load - this_load + 2*busiest_load_per_task >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;
@@@ -3228,7 -3447,7 +3447,7 @@@ find_busiest_queue(struct sched_group *
        unsigned long max_load = 0;
        int i;
  
 -      for_each_cpu_mask(i, group->cpumask) {
 +      for_each_cpu_mask_nr(i, group->cpumask) {
                unsigned long wl;
  
                if (!cpu_isset(i, *cpus))
@@@ -3284,6 -3503,7 +3503,7 @@@ static int load_balance(int this_cpu, s
        schedstat_inc(sd, lb_count[idle]);
  
  redo:
+       update_shares(sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   cpus, balance);
  
  
        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               return -1;
-       return ld_moved;
+               ld_moved = -1;
+       goto out;
  
  out_balanced:
        schedstat_inc(sd, lb_balanced[idle]);
@@@ -3402,8 -3623,13 +3623,13 @@@ out_one_pinned
  
        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               return -1;
-       return 0;
+               ld_moved = -1;
+       else
+               ld_moved = 0;
+ out:
+       if (ld_moved)
+               update_shares(sd);
+       return ld_moved;
  }
  
  /*
@@@ -3438,6 -3664,7 +3664,7 @@@ load_balance_newidle(int this_cpu, stru
  
        schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
  redo:
+       update_shares_locked(this_rq, sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
                                   &sd_idle, cpus, NULL);
        if (!group) {
        } else
                sd->nr_balance_failed = 0;
  
+       update_shares_locked(this_rq, sd);
        return ld_moved;
  
  out_balanced:
@@@ -3672,6 -3900,7 +3900,7 @@@ static void rebalance_domains(int cpu, 
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
+       int need_serialize;
        cpumask_t tmp;
  
        for_each_domain(cpu, sd) {
                if (interval > HZ*NR_CPUS/10)
                        interval = HZ*NR_CPUS/10;
  
+               need_serialize = sd->flags & SD_SERIALIZE;
  
-               if (sd->flags & SD_SERIALIZE) {
+               if (need_serialize) {
                        if (!spin_trylock(&balancing))
                                goto out;
                }
                        }
                        sd->last_balance = jiffies;
                }
-               if (sd->flags & SD_SERIALIZE)
+               if (need_serialize)
                        spin_unlock(&balancing);
  out:
                if (time_after(next_balance, sd->last_balance + interval)) {
@@@ -3759,7 -3989,7 +3989,7 @@@ static void run_rebalance_domains(struc
                int balance_cpu;
  
                cpu_clear(this_cpu, cpus);
 -              for_each_cpu_mask(balance_cpu, cpus) {
 +              for_each_cpu_mask_nr(balance_cpu, cpus) {
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@@ -4021,26 -4251,44 +4251,44 @@@ void scheduler_tick(void
  #endif
  }
  
- #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+ static inline unsigned long get_parent_ip(unsigned long addr)
+ {
+       if (in_lock_functions(addr)) {
+               addr = CALLER_ADDR2;
+               if (in_lock_functions(addr))
+                       addr = CALLER_ADDR3;
+       }
+       return addr;
+ }
  
  void __kprobes add_preempt_count(int val)
  {
+ #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
+ #endif
        preempt_count() += val;
+ #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
         */
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
+ #endif
+       if (preempt_count() == val)
+               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  }
  EXPORT_SYMBOL(add_preempt_count);
  
  void __kprobes sub_preempt_count(int val)
  {
+ #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                        !(preempt_count() & PREEMPT_MASK)))
                return;
+ #endif
  
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
        preempt_count() -= val;
  }
  EXPORT_SYMBOL(sub_preempt_count);
@@@ -4070,6 -4321,7 +4321,7 @@@ static noinline void __schedule_bug(str
                prev->comm, prev->pid, preempt_count());
  
        debug_show_held_locks(prev);
+       print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
  
@@@ -4143,7 -4395,7 +4395,7 @@@ asmlinkage void __sched schedule(void
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
-       int cpu;
+       int cpu, hrtick = sched_feat(HRTICK);
  
  need_resched:
        preempt_disable();
@@@ -4158,7 -4410,8 +4410,8 @@@ need_resched_nonpreemptible
  
        schedule_debug(prev);
  
-       hrtick_clear(rq);
+       if (hrtick)
+               hrtick_clear(rq);
  
        /*
         * Do the rq-clock update outside the rq lock:
        } else
                spin_unlock_irq(&rq->lock);
  
-       hrtick_set(rq);
+       if (hrtick)
+               hrtick_set(rq);
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
@@@ -4586,10 -4840,8 +4840,8 @@@ void set_user_nice(struct task_struct *
                goto out_unlock;
        }
        on_rq = p->se.on_rq;
-       if (on_rq) {
+       if (on_rq)
                dequeue_task(rq, p, 0);
-               dec_load(rq, p);
-       }
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
  
        if (on_rq) {
                enqueue_task(rq, p, 0);
-               inc_load(rq, p);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -4744,16 -4995,8 +4995,8 @@@ __setscheduler(struct rq *rq, struct ta
        set_load_weight(p);
  }
  
- /**
-  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
-  * @p: the task in question.
-  * @policy: new policy.
-  * @param: structure containing the new RT priority.
-  *
-  * NOTE that the task may be already dead.
-  */
- int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+ static int __sched_setscheduler(struct task_struct *p, int policy,
+                               struct sched_param *param, bool user)
  {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
@@@ -4785,7 -5028,7 +5028,7 @@@ recheck
        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
-       if (!capable(CAP_SYS_NICE)) {
+       if (user && !capable(CAP_SYS_NICE)) {
                if (rt_policy(policy)) {
                        unsigned long rlim_rtprio;
  
         * Do not allow realtime tasks into groups that have no runtime
         * assigned.
         */
-       if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+       if (user
+           && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
                return -EPERM;
  #endif
  
  
        return 0;
  }
+ /**
+  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+  * @p: the task in question.
+  * @policy: new policy.
+  * @param: structure containing the new RT priority.
+  *
+  * NOTE that the task may be already dead.
+  */
+ int sched_setscheduler(struct task_struct *p, int policy,
+                      struct sched_param *param)
+ {
+       return __sched_setscheduler(p, policy, param, true);
+ }
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
+ /**
+  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+  * @p: the task in question.
+  * @policy: new policy.
+  * @param: structure containing the new RT priority.
+  *
+  * Just like sched_setscheduler, only don't bother checking if the
+  * current context has permission.  For example, this is needed in
+  * stop_machine(): we create temporary high priority worker threads,
+  * but our caller might not have that capability.
+  */
+ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+                              struct sched_param *param)
+ {
+       return __sched_setscheduler(p, policy, param, false);
+ }
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  {
@@@ -5070,24 -5345,6 +5345,6 @@@ asmlinkage long sys_sched_setaffinity(p
        return sched_setaffinity(pid, &new_mask);
  }
  
- /*
-  * Represents all cpu's present in the system
-  * In systems capable of hotplug, this map could dynamically grow
-  * as new cpu's are detected in the system via any platform specific
-  * method, such as ACPI for e.g.
-  */
- cpumask_t cpu_present_map __read_mostly;
- EXPORT_SYMBOL(cpu_present_map);
- #ifndef CONFIG_SMP
- cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
- EXPORT_SYMBOL(cpu_online_map);
- cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
- EXPORT_SYMBOL(cpu_possible_map);
- #endif
  long sched_getaffinity(pid_t pid, cpumask_t *mask)
  {
        struct task_struct *p;
@@@ -5384,7 -5641,7 +5641,7 @@@ out_unlock
        return retval;
  }
  
- static const char stat_nam[] = "RSDTtZX";
+ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
  
  void sched_show_task(struct task_struct *p)
  {
@@@ -5571,6 -5828,12 +5828,12 @@@ int set_cpus_allowed_ptr(struct task_st
                goto out;
        }
  
+       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+               ret = -EINVAL;
+               goto out;
+       }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
@@@ -5622,10 -5885,10 +5885,10 @@@ static int __migrate_task(struct task_s
        double_rq_lock(rq_src, rq_dest);
        /* Already moved. */
        if (task_cpu(p) != src_cpu)
-               goto out;
+               goto done;
        /* Affinity changed (again). */
        if (!cpu_isset(dest_cpu, p->cpus_allowed))
-               goto out;
+               goto fail;
  
        on_rq = p->se.on_rq;
        if (on_rq)
                activate_task(rq_dest, p, 0);
                check_preempt_curr(rq_dest, p);
        }
+ done:
        ret = 1;
out:
fail:
        double_rq_unlock(rq_src, rq_dest);
        return ret;
  }
@@@ -6059,6 -6323,36 +6323,36 @@@ static void unregister_sched_domain_sys
  }
  #endif
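+ /*
+  * set_rq_online()/set_rq_offline() below keep rq->rd->online in sync with
+  * the runqueue's state and give every scheduling class a chance to react
+  * via its rq_online/rq_offline callbacks.
+  */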
  
+ static void set_rq_online(struct rq *rq)
+ {
+       if (!rq->online) {
+               const struct sched_class *class;
+               cpu_set(rq->cpu, rq->rd->online);
+               rq->online = 1;
+               for_each_class(class) {
+                       if (class->rq_online)
+                               class->rq_online(rq);
+               }
+       }
+ }
+ static void set_rq_offline(struct rq *rq)
+ {
+       if (rq->online) {
+               const struct sched_class *class;
+               for_each_class(class) {
+                       if (class->rq_offline)
+                               class->rq_offline(rq);
+               }
+               cpu_clear(rq->cpu, rq->rd->online);
+               rq->online = 0;
+       }
+ }
  /*
   * migration_call - callback that gets triggered when a CPU is added.
   * Here we can start up the necessary migration thread for the new CPU.
@@@ -6096,7 -6390,8 +6390,8 @@@ migration_call(struct notifier_block *n
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpu_isset(cpu, rq->rd->span));
-                       cpu_set(cpu, rq->rd->online);
+                       set_rq_online(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
                break;
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpu_isset(cpu, rq->rd->span));
-                       cpu_clear(cpu, rq->rd->online);
+                       set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
                break;
@@@ -6191,6 -6486,28 +6486,28 @@@ void __init migration_init(void
  
  #ifdef CONFIG_SCHED_DEBUG
  
+ static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+ {
+       switch (lvl) {
+       case SD_LV_NONE:
+                       return "NONE";
+       case SD_LV_SIBLING:
+                       return "SIBLING";
+       case SD_LV_MC:
+                       return "MC";
+       case SD_LV_CPU:
+                       return "CPU";
+       case SD_LV_NODE:
+                       return "NODE";
+       case SD_LV_ALLNODES:
+                       return "ALLNODES";
+       case SD_LV_MAX:
+                       return "MAX";
+       }
+       return "MAX";
+ }
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
  {
                return -1;
        }
  
-       printk(KERN_CONT "span %s\n", str);
+       printk(KERN_CONT "span %s level %s\n",
+               str, sd_level_to_string(sd->level));
  
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@@ -6294,9 -6612,9 +6612,9 @@@ static void sched_domain_debug(struct s
        }
        kfree(groupmask);
  }
- #else
+ #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
- #endif
+ #endif /* CONFIG_SCHED_DEBUG */
  
  static int sd_degenerate(struct sched_domain *sd)
  {
@@@ -6356,20 -6674,16 +6674,16 @@@ sd_parent_degenerate(struct sched_domai
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
        unsigned long flags;
-       const struct sched_class *class;
  
        spin_lock_irqsave(&rq->lock, flags);
  
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
  
-               for (class = sched_class_highest; class; class = class->next) {
-                       if (class->leave_domain)
-                               class->leave_domain(rq);
-               }
+               if (cpu_isset(rq->cpu, old_rd->online))
+                       set_rq_offline(rq);
  
                cpu_clear(rq->cpu, old_rd->span);
-               cpu_clear(rq->cpu, old_rd->online);
  
                if (atomic_dec_and_test(&old_rd->refcount))
                        kfree(old_rd);
  
        cpu_set(rq->cpu, rd->span);
        if (cpu_isset(rq->cpu, cpu_online_map))
-               cpu_set(rq->cpu, rd->online);
-       for (class = sched_class_highest; class; class = class->next) {
-               if (class->join_domain)
-                       class->join_domain(rq);
-       }
+               set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
  }
@@@ -6396,6 -6705,8 +6705,8 @@@ static void init_rootdomain(struct root
  
        cpus_clear(rd->span);
        cpus_clear(rd->online);
+       cpupri_init(&rd->cpupri);
  }
  
  static void init_defrootdomain(void)
@@@ -6491,7 -6802,7 +6802,7 @@@ init_sched_build_groups(const cpumask_
  
        cpus_clear(*covered);
  
 -      for_each_cpu_mask(i, *span) {
 +      for_each_cpu_mask_nr(i, *span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
                cpus_clear(sg->cpumask);
                sg->__cpu_power = 0;
  
 -              for_each_cpu_mask(j, *span) {
 +              for_each_cpu_mask_nr(j, *span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
  
@@@ -6590,7 -6901,7 +6901,7 @@@ static void sched_domain_node_span(int 
                cpus_or(*span, *span, *nodemask);
        }
  }
- #endif
+ #endif /* CONFIG_NUMA */
  
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
@@@ -6609,7 -6920,7 +6920,7 @@@ cpu_to_cpu_group(int cpu, const cpumask
                *sg = &per_cpu(sched_group_cpus, cpu);
        return cpu;
  }
- #endif
+ #endif /* CONFIG_SCHED_SMT */
  
  /*
   * multi-core sched-domains:
  #ifdef CONFIG_SCHED_MC
  static DEFINE_PER_CPU(struct sched_domain, core_domains);
  static DEFINE_PER_CPU(struct sched_group, sched_group_core);
- #endif
+ #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
@@@ -6702,7 -7013,7 +7013,7 @@@ static void init_numa_sched_groups_powe
        if (!sg)
                return;
        do {
 -              for_each_cpu_mask(j, sg->cpumask) {
 +              for_each_cpu_mask_nr(j, sg->cpumask) {
                        struct sched_domain *sd;
  
                        sd = &per_cpu(phys_domains, j);
                sg = sg->next;
        } while (sg != group_head);
  }
- #endif
+ #endif /* CONFIG_NUMA */
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
@@@ -6727,7 -7038,7 +7038,7 @@@ static void free_sched_groups(const cpu
  {
        int cpu, i;
  
 -      for_each_cpu_mask(cpu, *cpu_map) {
 +      for_each_cpu_mask_nr(cpu, *cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
  
@@@ -6756,11 -7067,11 +7067,11 @@@ next_sg
                sched_group_nodes_bycpu[cpu] = NULL;
        }
  }
- #else
+ #else /* !CONFIG_NUMA */
  static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
  {
  }
- #endif
+ #endif /* CONFIG_NUMA */
  
  /*
   * Initialize sched groups cpu_power.
@@@ -6966,7 -7277,7 +7277,7 @@@ static int __build_sched_domains(const 
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = NULL, *p;
                SCHED_CPUMASK_VAR(nodemask, allmasks);
  
  
  #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
  
  
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_core_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
  
                        goto error;
                }
                sched_group_nodes[i] = sg;
 -              for_each_cpu_mask(j, *nodemask) {
 +              for_each_cpu_mask_nr(j, *nodemask) {
                        struct sched_domain *sd;
  
                        sd = &per_cpu(node_domains, j);
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(cpu_domains, i);
  
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(core_domains, i);
  
                init_sched_groups_power(i, sd);
        }
  #endif
  
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(phys_domains, i);
  
                init_sched_groups_power(i, sd);
  #endif
  
        /* Attach the domains */
 -      for_each_cpu_mask(i, *cpu_map) {
 +      for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
@@@ -7292,7 -7603,7 +7603,7 @@@ static void detach_destroy_domains(cons
  
        unregister_sched_domain_sysctl();
  
 -      for_each_cpu_mask(i, *cpu_map)
 +      for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
        arch_destroy_sched_domains(cpu_map, &tmpmask);
@@@ -7469,7 -7780,7 +7780,7 @@@ int sched_create_sysfs_power_savings_en
  #endif
        return err;
  }
- #endif
+ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
  /*
   * Force a reinitialization of the sched domains hierarchy. The domains
  static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
  {
+       int cpu = (int)(long)hcpu;
        switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
+               disable_runtime(cpu_rq(cpu));
+               /* fall-through */
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
                detach_destroy_domains(&cpu_online_map);
                free_sched_domains();
                return NOTIFY_OK;
  
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
+               enable_runtime(cpu_rq(cpu));
+               /* fall-through */
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                /*
@@@ -7694,8 -8012,8 +8012,8 @@@ void __init sched_init(void
  
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
  
                root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_RT_GROUP_SCHED */
        }
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_USER_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), RUNTIME_INF);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_GROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
        INIT_LIST_HEAD(&root_task_group.children);
        init_task_group.parent = &root_task_group;
        list_add(&init_task_group.siblings, &root_task_group.children);
- #endif
- #endif
+ #endif /* CONFIG_USER_SCHED */
+ #endif /* CONFIG_GROUP_SCHED */
  
        for_each_possible_cpu(i) {
                struct rq *rq;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
                rq->cpu = i;
+               rq->online = 0;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
  #endif
  
  #ifdef CONFIG_SMP
-       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  #endif
  
  #ifdef CONFIG_RT_MUTEXES
@@@ -8057,7 -8376,7 +8376,7 @@@ static inline void unregister_fair_sche
  {
        list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
  }
- #else
+ #else /* !CONFIG_FAIR_GROUP_SCHED */
  static inline void free_fair_sched_group(struct task_group *tg)
  {
  }
@@@ -8075,7 -8394,7 +8394,7 @@@ static inline void register_fair_sched_
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
  }
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static void free_rt_sched_group(struct task_group *tg)
@@@ -8146,7 -8465,7 +8465,7 @@@ static inline void unregister_rt_sched_
  {
        list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
  }
- #else
+ #else /* !CONFIG_RT_GROUP_SCHED */
  static inline void free_rt_sched_group(struct task_group *tg)
  {
  }
@@@ -8164,7 -8483,7 +8483,7 @@@ static inline void register_rt_sched_gr
  static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
  {
  }
- #endif
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_GROUP_SCHED
  static void free_sched_group(struct task_group *tg)
@@@ -8275,17 -8594,14 +8594,14 @@@ void sched_move_task(struct task_struc
  
        task_rq_unlock(rq, &flags);
  }
- #endif
+ #endif /* CONFIG_GROUP_SCHED */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
- static void set_se_shares(struct sched_entity *se, unsigned long shares)
+ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
  {
        struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
        int on_rq;
  
-       spin_lock_irq(&rq->lock);
        on_rq = se->on_rq;
        if (on_rq)
                dequeue_entity(cfs_rq, se, 0);
  
        if (on_rq)
                enqueue_entity(cfs_rq, se, 0);
+ }
  
-       spin_unlock_irq(&rq->lock);
+ static void set_se_shares(struct sched_entity *se, unsigned long shares)
+ {
+       struct cfs_rq *cfs_rq = se->cfs_rq;
+       struct rq *rq = cfs_rq->rq;
+       unsigned long flags;
+       spin_lock_irqsave(&rq->lock, flags);
+       __set_se_shares(se, shares);
+       spin_unlock_irqrestore(&rq->lock, flags);
  }
  
  static DEFINE_MUTEX(shares_mutex);
@@@ -8335,8 -8660,13 +8660,13 @@@ int sched_group_set_shares(struct task_
         * w/o tripping rebalance_share or load_balance_fair.
         */
        tg->shares = shares;
-       for_each_possible_cpu(i)
+       for_each_possible_cpu(i) {
+               /*
+                * force a rebalance
+                */
+               cfs_rq_set_shares(tg->cfs_rq[i], 0);
                set_se_shares(tg->se[i], shares);
+       }
  
        /*
         * Enable load balance activity on this group, by inserting it back on
@@@ -8375,7 -8705,7 +8705,7 @@@ static unsigned long to_ratio(u64 perio
  #ifdef CONFIG_CGROUP_SCHED
  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
-       struct task_group *tgi, *parent = tg ? tg->parent : NULL;
+       struct task_group *tgi, *parent = tg->parent;
        unsigned long total = 0;
  
        if (!parent) {
        }
        rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) <
+       return total + to_ratio(period, runtime) <=
                to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
                                parent->rt_bandwidth.rt_runtime);
  }
@@@ -8519,16 -8849,21 +8849,21 @@@ long sched_group_rt_period(struct task_
  
  static int sched_rt_global_constraints(void)
  {
+       struct task_group *tg = &root_task_group;
+       u64 rt_runtime, rt_period;
        int ret = 0;
  
+       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       rt_runtime = tg->rt_bandwidth.rt_runtime;
        mutex_lock(&rt_constraints_mutex);
-       if (!__rt_schedulable(NULL, 1, 0))
+       if (!__rt_schedulable(tg, rt_period, rt_runtime))
                ret = -EINVAL;
        mutex_unlock(&rt_constraints_mutex);
  
        return ret;
  }
- #else
+ #else /* !CONFIG_RT_GROUP_SCHED */
  static int sched_rt_global_constraints(void)
  {
        unsigned long flags;
  
        return 0;
  }
- #endif
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  int sched_rt_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
@@@ -8654,7 -8989,7 +8989,7 @@@ static u64 cpu_shares_read_u64(struct c
  
        return (u64) tg->shares;
  }
- #endif
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@@ -8678,7 -9013,7 +9013,7 @@@ static u64 cpu_rt_period_read_uint(stru
  {
        return sched_group_rt_period(cgroup_tg(cgrp));
  }
- #endif
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  static struct cftype cpu_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
diff --combined kernel/sched_fair.c
@@@ -63,13 -63,13 +63,13 @@@ unsigned int __read_mostly sysctl_sched
  
  /*
   * SCHED_OTHER wake-up granularity.
-  * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+  * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * This option delays the preemption effects of decoupled workloads
   * and reduces their over-scheduling. Synchronous workloads will still
   * have immediate wakeup/sleep latencies.
   */
- unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
  
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  
@@@ -333,6 -333,34 +333,34 @@@ int sched_nr_latency_handler(struct ctl
  }
  #endif
  
+ /*
+  * delta *= w / rw
+  */
+ static inline unsigned long
+ calc_delta_weight(unsigned long delta, struct sched_entity *se)
+ {
+       for_each_sched_entity(se) {
+               delta = calc_delta_mine(delta,
+                               se->load.weight, &cfs_rq_of(se)->load);
+       }
+       return delta;
+ }
+ /*
+  * delta *= rw / w
+  */
+ static inline unsigned long
+ calc_delta_fair(unsigned long delta, struct sched_entity *se)
+ {
+       for_each_sched_entity(se) {
+               delta = calc_delta_mine(delta,
+                               cfs_rq_of(se)->load.weight, &se->load);
+       }
+       return delta;
+ }
  /*
   * The idea is to set a period in which each task runs once.
   *
@@@ -362,47 -390,80 +390,80 @@@ static u64 __sched_period(unsigned lon
   */
  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       u64 slice = __sched_period(cfs_rq->nr_running);
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               slice *= se->load.weight;
-               do_div(slice, cfs_rq->load.weight);
-       }
-       return slice;
+       return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
  }
  
  /*
   * We calculate the vruntime slice of a to be inserted task
   *
-  * vs = s/w = p/rw
+  * vs = s*rw/w = p
   */
  static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        unsigned long nr_running = cfs_rq->nr_running;
-       unsigned long weight;
-       u64 vslice;
  
        if (!se->on_rq)
                nr_running++;
  
-       vslice = __sched_period(nr_running);
+       return __sched_period(nr_running);
+ }
+ /*
+  * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
+  * that it favours >=0 over <0.
+  *
+  *   -20         |
+  *               |
+  *     0 --------+-------
+  *             .'
+  *    19     .'
+  *
+  */
+ static unsigned long
+ calc_delta_asym(unsigned long delta, struct sched_entity *se)
+ {
+       struct load_weight lw = {
+               .weight = NICE_0_LOAD,
+               .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+       };
  
        for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct load_weight *se_lw = &se->load;
+               unsigned long rw = cfs_rq_of(se)->load.weight;
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+               struct cfs_rq *cfs_rq = se->my_q;
+               struct task_group *tg = NULL;
+               if (cfs_rq)
+                       tg = cfs_rq->tg;
+               if (tg && tg->shares < NICE_0_LOAD) {
+                       /*
+                        * scale shares to what it would have been had
+                        * tg->weight been NICE_0_LOAD:
+                        *
+                        *   weight = 1024 * shares / tg->weight
+                        */
+                       lw.weight *= se->load.weight;
+                       lw.weight /= tg->shares;
+                       lw.inv_weight = 0;
+                       se_lw = &lw;
+                       rw += lw.weight - se->load.weight;
+               } else
+ #endif
  
-               weight = cfs_rq->load.weight;
-               if (!se->on_rq)
-                       weight += se->load.weight;
+               if (se->load.weight < NICE_0_LOAD) {
+                       se_lw = &lw;
+                       rw += NICE_0_LOAD - se->load.weight;
+               }
  
-               vslice *= NICE_0_LOAD;
-               do_div(vslice, weight);
+               delta = calc_delta_mine(delta, rw, se_lw);
        }
  
-       return vslice;
+       return delta;
  }
  
  /*
@@@ -419,11 -480,7 +480,7 @@@ __update_curr(struct cfs_rq *cfs_rq, st
  
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = delta_exec;
-       if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-               delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-                                                       &curr->load);
-       }
+       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
        curr->vruntime += delta_exec_weighted;
  }
  
@@@ -510,10 -567,27 +567,27 @@@ update_stats_curr_start(struct cfs_rq *
   * Scheduling class queueing methods:
   */
  
+ #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ static void
+ add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+ {
+       cfs_rq->task_weight += weight;
+ }
+ #else
+ static inline void
+ add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+ {
+ }
+ #endif
  static void
  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        update_load_add(&cfs_rq->load, se->load.weight);
+       if (!parent_entity(se))
+               inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+       if (entity_is_task(se))
+               add_cfs_task_weight(cfs_rq, se->load.weight);
        cfs_rq->nr_running++;
        se->on_rq = 1;
        list_add(&se->group_node, &cfs_rq->tasks);
@@@ -523,6 -597,10 +597,10 @@@ static voi
  account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        update_load_sub(&cfs_rq->load, se->load.weight);
+       if (!parent_entity(se))
+               dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+       if (entity_is_task(se))
+               add_cfs_task_weight(cfs_rq, -se->load.weight);
        cfs_rq->nr_running--;
        se->on_rq = 0;
        list_del_init(&se->group_node);
@@@ -609,8 -687,17 +687,17 @@@ place_entity(struct cfs_rq *cfs_rq, str
  
        if (!initial) {
                /* sleeps up to a single latency don't count. */
-               if (sched_feat(NEW_FAIR_SLEEPERS))
-                       vruntime -= sysctl_sched_latency;
+               if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                       unsigned long thresh = sysctl_sched_latency;
+                       /*
+                        * convert the sleeper threshold into virtual time
+                        */
+                       if (sched_feat(NORMALIZED_SLEEPER))
+                               thresh = calc_delta_fair(thresh, se);
+                       vruntime -= thresh;
+               }
  
                /* ensure we never gain time by being placed backwards. */
                vruntime = max_vruntime(se->vruntime, vruntime);
@@@ -639,21 -726,6 +726,6 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
                __enqueue_entity(cfs_rq, se);
  }
  
- static void update_avg(u64 *avg, u64 sample)
- {
-       s64 diff = sample - *avg;
-       *avg += diff >> 3;
- }
- static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
-       if (!se->last_wakeup)
-               return;
-       update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-       se->last_wakeup = 0;
- }
  static void
  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
  {
  
        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
-               update_avg_stats(cfs_rq, se);
  #ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);
@@@ -726,17 -797,16 +797,16 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
  
- static int
- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  static struct sched_entity *
  pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (!cfs_rq->next)
-               return se;
+       struct rq *rq = rq_of(cfs_rq);
+       u64 pair_slice = rq->clock - cfs_rq->pair_start;
  
-       if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
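+       /*
+        * cfs_rq->next is the entity picked at wakeup time; keep running it,
+        * but only until it has had a full slice since pair_start, then reset
+        * the window and fall back to the regular leftmost pick.
+        */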
+       if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+               cfs_rq->pair_start = rq->clock;
                return se;
+       }
  
        return cfs_rq->next;
  }
@@@ -835,7 -905,7 +905,7 @@@ static void hrtick_start_fair(struct r
                hrtick_start(rq, delta, requeue);
        }
  }
- #else
+ #else /* !CONFIG_SCHED_HRTICK */
  static inline void
  hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
@@@ -961,7 -1031,7 +1031,7 @@@ static int wake_idle(int cpu, struct ta
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
                        cpus_and(tmp, sd->span, p->cpus_allowed);
 -                      for_each_cpu_mask(i, tmp) {
 +                      for_each_cpu_mask_nr(i, tmp) {
                                if (idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
        }
        return cpu;
  }
- #else
+ #else /* !ARCH_HAS_SCHED_WAKE_IDLE */
  static inline int wake_idle(int cpu, struct task_struct *p)
  {
        return cpu;
  
  static const struct sched_class fair_sched_class;
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+  * effective_load() calculates the load change as seen from the root_task_group
+  *
+  * Adding load to a group doesn't make a group heavier, but can cause movement
+  * of group shares between cpus. Assuming the shares were perfectly aligned one
+  * can calculate the shift in shares.
+  *
+  * The problem is that perfectly aligning the shares is rather expensive, hence
+  * we try to avoid doing that too often - see update_shares(), which ratelimits
+  * this change.
+  *
+  * We compensate this by not only taking the current delta into account, but
+  * also considering the delta between when the shares were last adjusted and
+  * now.
+  *
+  * We still saw a performance dip; tracing showed that when balancing
+  * between cgroup:/ and cgroup:/foo the number of affine wakeups increased
+  * significantly. Therefore try to bias the error in the direction of
+  * failing the affine wakeup.
+  *
+  */
+ static long effective_load(struct task_group *tg, int cpu,
+               long wl, long wg)
+ {
+       struct sched_entity *se = tg->se[cpu];
+       long more_w;
+       if (!tg->parent)
+               return wl;
+       /*
+        * By not taking the decrease of shares on the other cpu into
+        * account, our error leans towards reducing the affine wakeups.
+        */
+       if (!wl && sched_feat(ASYM_EFF_LOAD))
+               return wl;
+       /*
+        * Instead of using this increment, also add the difference
+        * between when the shares were last updated and now.
+        */
+       more_w = se->my_q->load.weight - se->my_q->rq_weight;
+       wl += more_w;
+       wg += more_w;
+       for_each_sched_entity(se) {
+ #define D(n) (likely(n) ? (n) : 1)
+               long S, rw, s, a, b;
+               S = se->my_q->tg->shares;
+               s = se->my_q->shares;
+               rw = se->my_q->rq_weight;
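+               /*
+                * With s ~= S * rw / <group rq weight sum>, adding wl here
+                * and wg group-wide gives the new share s' = s*a/b, so the
+                * weight change seen one level up is s' - s = s*(a - b)/b.
+                */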
+               a = S*(rw + wl);
+               b = S*rw + s*wg;
+               wl = s*(a-b)/D(b);
+               /*
+                * Assume the group is already running and will
+                * thus already be accounted for in the weight.
+                *
+                * That is, moving shares between CPUs, does not
+                * alter the group weight.
+                */
+               wg = 0;
+ #undef D
+       }
+       return wl;
+ }
+ #else
+ static inline unsigned long effective_load(struct task_group *tg, int cpu,
+               unsigned long wl, unsigned long wg)
+ {
+       return wl;
+ }
+ #endif
  static int
  wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
            struct task_struct *p, int prev_cpu, int this_cpu, int sync,
            unsigned int imbalance)
  {
        struct task_struct *curr = this_rq->curr;
+       struct task_group *tg;
        unsigned long tl = this_load;
        unsigned long tl_per_task;
+       unsigned long weight;
        int balanced;
  
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
         * effect of the currently running task from the load
         * of the current CPU:
         */
-       if (sync)
-               tl -= current->se.load.weight;
+       if (sync) {
+               tg = task_group(current);
+               weight = current->se.load.weight;
+               tl += effective_load(tg, this_cpu, -weight, -weight);
+               load += effective_load(tg, prev_cpu, 0, -weight);
+       }
  
-       balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
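+       /*
+        * Consider this cpu balanced when its load with p added stays within
+        * the imbalance percentage of prev_cpu's load, with both sides
+        * translated through effective_load() into root-level weight.
+        */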
+       tg = task_group(p);
+       weight = p->se.load.weight;
+       balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+               imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
  
        /*
         * If the currently running task will sleep within
         * a reasonable amount of time then attract this newly
         * woken task:
         */
-       if (sync && balanced && curr->sched_class == &fair_sched_class) {
+       if (sync && balanced) {
                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                               p->se.avg_overlap < sysctl_sched_migration_cost)
+                   p->se.avg_overlap < sysctl_sched_migration_cost)
                        return 1;
        }
  
@@@ -1111,11 -1275,13 +1275,13 @@@ static unsigned long wakeup_gran(struc
        unsigned long gran = sysctl_sched_wakeup_granularity;
  
        /*
-        * More easily preempt - nice tasks, while not making
-        * it harder for + nice tasks.
+        * More easily preempt - nice tasks, while not making it harder for
+        * + nice tasks.
         */
-       if (unlikely(se->load.weight > NICE_0_LOAD))
-               gran = calc_delta_fair(gran, &se->load);
+       if (sched_feat(ASYM_GRAN))
+               gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+       else
+               gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
  
        return gran;
  }
@@@ -1177,7 -1343,6 +1343,6 @@@ static void check_preempt_wakeup(struc
                return;
        }
  
-       se->last_wakeup = se->sum_exec_runtime;
        if (unlikely(se == pse))
                return;
  
@@@ -1275,23 -1440,18 +1440,18 @@@ __load_balance_iterator(struct cfs_rq *
        struct task_struct *p = NULL;
        struct sched_entity *se;
  
-       if (next == &cfs_rq->tasks)
-               return NULL;
-       /* Skip over entities that are not tasks */
-       do {
+       while (next != &cfs_rq->tasks) {
                se = list_entry(next, struct sched_entity, group_node);
                next = next->next;
-       } while (next != &cfs_rq->tasks && !entity_is_task(se));
  
-       if (next == &cfs_rq->tasks)
-               return NULL;
+               /* Skip over entities that are not tasks */
+               if (entity_is_task(se)) {
+                       p = task_of(se);
+                       break;
+               }
+       }
  
        cfs_rq->balance_iterator = next;
-       if (entity_is_task(se))
-               p = task_of(se);
        return p;
  }
  
@@@ -1309,75 -1469,82 +1469,82 @@@ static struct task_struct *load_balance
        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
  }
  
- #ifdef CONFIG_FAIR_GROUP_SCHED
- static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+ static unsigned long
+ __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+               unsigned long max_load_move, struct sched_domain *sd,
+               enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+               struct cfs_rq *cfs_rq)
  {
-       struct sched_entity *curr;
-       struct task_struct *p;
-       if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-               return MAX_PRIO;
-       curr = cfs_rq->curr;
-       if (!curr)
-               curr = __pick_next_entity(cfs_rq);
+       struct rq_iterator cfs_rq_iterator;
  
-       p = task_of(curr);
+       cfs_rq_iterator.start = load_balance_start_fair;
+       cfs_rq_iterator.next = load_balance_next_fair;
+       cfs_rq_iterator.arg = cfs_rq;
  
-       return p->prio;
+       return balance_tasks(this_rq, this_cpu, busiest,
+                       max_load_move, sd, idle, all_pinned,
+                       this_best_prio, &cfs_rq_iterator);
  }
- #endif
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
  {
-       struct cfs_rq *busy_cfs_rq;
        long rem_load_move = max_load_move;
-       struct rq_iterator cfs_rq_iterator;
+       int busiest_cpu = cpu_of(busiest);
+       struct task_group *tg;
  
-       cfs_rq_iterator.start = load_balance_start_fair;
-       cfs_rq_iterator.next = load_balance_next_fair;
-       for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
- #ifdef CONFIG_FAIR_GROUP_SCHED
-               struct cfs_rq *this_cfs_rq;
-               long imbalance;
-               unsigned long maxload;
+       rcu_read_lock();
+       update_h_load(busiest_cpu);
  
-               this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+       list_for_each_entry(tg, &task_groups, list) {
+               struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+               unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+               unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+               u64 rem_load, moved_load;
  
-               imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-               /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-               if (imbalance <= 0)
+               /*
+                * empty group
+                */
+               if (!busiest_cfs_rq->task_weight)
                        continue;
  
-               /* Don't pull more than imbalance/2 */
-               imbalance /= 2;
-               maxload = min(rem_load_move, imbalance);
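+               /*
+                * rem_load_move is expressed in root (hierarchical) weight;
+                * scale it to this group's local weight via h_load before
+                * moving tasks, and scale the moved amount back afterwards.
+                */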
+               rem_load = (u64)rem_load_move * busiest_weight;
+               rem_load = div_u64(rem_load, busiest_h_load + 1);
  
-               *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
- #else
- # define maxload rem_load_move
- #endif
-               /*
-                * pass busy_cfs_rq argument into
-                * load_balance_[start|next]_fair iterators
-                */
-               cfs_rq_iterator.arg = busy_cfs_rq;
-               rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-                                              maxload, sd, idle, all_pinned,
-                                              this_best_prio,
-                                              &cfs_rq_iterator);
+               moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+                               rem_load, sd, idle, all_pinned, this_best_prio,
+                               tg->cfs_rq[busiest_cpu]);
  
-               if (rem_load_move <= 0)
+               if (!moved_load)
+                       continue;
+               moved_load *= busiest_h_load;
+               moved_load = div_u64(moved_load, busiest_weight + 1);
+               rem_load_move -= moved_load;
+               if (rem_load_move < 0)
                        break;
        }
+       rcu_read_unlock();
  
        return max_load_move - rem_load_move;
  }
+ #else
+ static unsigned long
+ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                 unsigned long max_load_move,
+                 struct sched_domain *sd, enum cpu_idle_type idle,
+                 int *all_pinned, int *this_best_prio)
+ {
+       return __load_balance_fair(this_rq, this_cpu, busiest,
+                       max_load_move, sd, idle, all_pinned,
+                       this_best_prio, &busiest->cfs);
+ }
+ #endif
  
  static int
  move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  
        return 0;
  }
- #endif
+ #endif /* CONFIG_SMP */
  
  /*
   * scheduler tick hitting a task of our scheduling class:
diff --combined kernel/sched_rt.c
@@@ -12,6 -12,9 +12,9 @@@ static inline int rt_overloaded(struct 
  
  static inline void rt_set_overload(struct rq *rq)
  {
+       if (!rq->online)
+               return;
        cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
@@@ -26,6 -29,9 +29,9 @@@
  
  static inline void rt_clear_overload(struct rq *rq)
  {
+       if (!rq->online)
+               return;
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
        cpu_clear(rq->cpu, rq->rd->rto_mask);
@@@ -155,7 -161,7 +161,7 @@@ static inline struct rt_bandwidth *sche
        return &rt_rq->tg->rt_bandwidth;
  }
  
- #else
+ #else /* !CONFIG_RT_GROUP_SCHED */
  
  static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  {
@@@ -220,49 -226,10 +226,10 @@@ static inline struct rt_bandwidth *sche
        return &def_rt_bandwidth;
  }
  
- #endif
- static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
- {
-       int i, idle = 1;
-       cpumask_t span;
-       if (rt_b->rt_runtime == RUNTIME_INF)
-               return 1;
-       span = sched_rt_period_mask();
-       for_each_cpu_mask_nr(i, span) {
-               int enqueue = 0;
-               struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
-               struct rq *rq = rq_of_rt_rq(rt_rq);
-               spin_lock(&rq->lock);
-               if (rt_rq->rt_time) {
-                       u64 runtime;
-                       spin_lock(&rt_rq->rt_runtime_lock);
-                       runtime = rt_rq->rt_runtime;
-                       rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
-                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
-                               rt_rq->rt_throttled = 0;
-                               enqueue = 1;
-                       }
-                       if (rt_rq->rt_time || rt_rq->rt_nr_running)
-                               idle = 0;
-                       spin_unlock(&rt_rq->rt_runtime_lock);
-               } else if (rt_rq->rt_nr_running)
-                       idle = 0;
-               if (enqueue)
-                       sched_rt_rq_enqueue(rt_rq);
-               spin_unlock(&rq->lock);
-       }
-       return idle;
- }
+ #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_SMP
- static int balance_runtime(struct rt_rq *rt_rq)
+ static int do_balance_runtime(struct rt_rq *rt_rq)
  {
        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
  
        spin_lock(&rt_b->rt_runtime_lock);
        rt_period = ktime_to_ns(rt_b->rt_period);
 -      for_each_cpu_mask(i, rd->span) {
 +      for_each_cpu_mask_nr(i, rd->span) {
                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                s64 diff;
  
                        continue;
  
                spin_lock(&iter->rt_runtime_lock);
+               if (iter->rt_runtime == RUNTIME_INF)
+                       goto next;
                diff = iter->rt_runtime - iter->rt_time;
                if (diff > 0) {
                        do_div(diff, weight);
                                break;
                        }
                }
+ next:
                spin_unlock(&iter->rt_runtime_lock);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  
        return more;
  }
- #endif
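+ /*
+  * Before a runqueue goes offline, settle the rt runtime it has exchanged
+  * with sibling runqueues, then let it run with RUNTIME_INF so it cannot be
+  * throttled while offline.
+  */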
+ static void __disable_runtime(struct rq *rq)
+ {
+       struct root_domain *rd = rq->rd;
+       struct rt_rq *rt_rq;
+       if (unlikely(!scheduler_running))
+               return;
+       for_each_leaf_rt_rq(rt_rq, rq) {
+               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+               s64 want;
+               int i;
+               spin_lock(&rt_b->rt_runtime_lock);
+               spin_lock(&rt_rq->rt_runtime_lock);
+               if (rt_rq->rt_runtime == RUNTIME_INF ||
+                               rt_rq->rt_runtime == rt_b->rt_runtime)
+                       goto balanced;
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               want = rt_b->rt_runtime - rt_rq->rt_runtime;
+               for_each_cpu_mask(i, rd->span) {
+                       struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+                       s64 diff;
+                       if (iter == rt_rq)
+                               continue;
+                       spin_lock(&iter->rt_runtime_lock);
+                       if (want > 0) {
+                               diff = min_t(s64, iter->rt_runtime, want);
+                               iter->rt_runtime -= diff;
+                               want -= diff;
+                       } else {
+                               iter->rt_runtime -= want;
+                               want -= want;
+                       }
+                       spin_unlock(&iter->rt_runtime_lock);
+                       if (!want)
+                               break;
+               }
+               spin_lock(&rt_rq->rt_runtime_lock);
+               BUG_ON(want);
+ balanced:
+               rt_rq->rt_runtime = RUNTIME_INF;
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               spin_unlock(&rt_b->rt_runtime_lock);
+       }
+ }
+ static void disable_runtime(struct rq *rq)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&rq->lock, flags);
+       __disable_runtime(rq);
+       spin_unlock_irqrestore(&rq->lock, flags);
+ }
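+ /*
+  * On the way back online, restore each rt_rq's default bandwidth and clear
+  * any accumulated rt_time.
+  */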
+ static void __enable_runtime(struct rq *rq)
+ {
+       struct rt_rq *rt_rq;
+       if (unlikely(!scheduler_running))
+               return;
+       for_each_leaf_rt_rq(rt_rq, rq) {
+               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+               spin_lock(&rt_b->rt_runtime_lock);
+               spin_lock(&rt_rq->rt_runtime_lock);
+               rt_rq->rt_runtime = rt_b->rt_runtime;
+               rt_rq->rt_time = 0;
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               spin_unlock(&rt_b->rt_runtime_lock);
+       }
+ }
+ static void enable_runtime(struct rq *rq)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&rq->lock, flags);
+       __enable_runtime(rq);
+       spin_unlock_irqrestore(&rq->lock, flags);
+ }
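+ /*
+  * Only borrow when over the limit; the caller holds rt_rq->rt_runtime_lock,
+  * which is dropped across do_balance_runtime() since that takes
+  * rt_b->rt_runtime_lock first (see __disable_runtime() for the lock order).
+  */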
+ static int balance_runtime(struct rt_rq *rt_rq)
+ {
+       int more = 0;
+       if (rt_rq->rt_time > rt_rq->rt_runtime) {
+               spin_unlock(&rt_rq->rt_runtime_lock);
+               more = do_balance_runtime(rt_rq);
+               spin_lock(&rt_rq->rt_runtime_lock);
+       }
+       return more;
+ }
+ #else /* !CONFIG_SMP */
+ static inline int balance_runtime(struct rt_rq *rt_rq)
+ {
+       return 0;
+ }
+ #endif /* CONFIG_SMP */
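+ /*
+  * Per-period replenishment: pay down each rt_rq's accumulated rt_time
+  * (borrowing from siblings first if it is throttled) and re-enqueue it
+  * once it is back under its runtime.
+  */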
+ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+ {
+       int i, idle = 1;
+       cpumask_t span;
+       if (rt_b->rt_runtime == RUNTIME_INF)
+               return 1;
+       span = sched_rt_period_mask();
+       for_each_cpu_mask(i, span) {
+               int enqueue = 0;
+               struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+               struct rq *rq = rq_of_rt_rq(rt_rq);
+               spin_lock(&rq->lock);
+               if (rt_rq->rt_time) {
+                       u64 runtime;
+                       spin_lock(&rt_rq->rt_runtime_lock);
+                       if (rt_rq->rt_throttled)
+                               balance_runtime(rt_rq);
+                       runtime = rt_rq->rt_runtime;
+                       rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+                               rt_rq->rt_throttled = 0;
+                               enqueue = 1;
+                       }
+                       if (rt_rq->rt_time || rt_rq->rt_nr_running)
+                               idle = 0;
+                       spin_unlock(&rt_rq->rt_runtime_lock);
+               } else if (rt_rq->rt_nr_running)
+                       idle = 0;
+               if (enqueue)
+                       sched_rt_rq_enqueue(rt_rq);
+               spin_unlock(&rq->lock);
+       }
+       return idle;
+ }
  
  static inline int rt_se_prio(struct sched_rt_entity *rt_se)
  {
@@@ -327,18 -447,10 +447,10 @@@ static int sched_rt_runtime_exceeded(st
        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
                return 0;
  
- #ifdef CONFIG_SMP
-       if (rt_rq->rt_time > runtime) {
-               int more;
-               spin_unlock(&rt_rq->rt_runtime_lock);
-               more = balance_runtime(rt_rq);
-               spin_lock(&rt_rq->rt_runtime_lock);
-               if (more)
-                       runtime = sched_rt_runtime(rt_rq);
-       }
- #endif
+       balance_runtime(rt_rq);
+       runtime = sched_rt_runtime(rt_rq);
+       if (runtime == RUNTIME_INF)
+               return 0;
  
        if (rt_rq->rt_time > runtime) {
                rt_rq->rt_throttled = 1;
@@@ -392,12 -504,21 +504,21 @@@ void inc_rt_tasks(struct sched_rt_entit
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        rt_rq->rt_nr_running++;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+       if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+               struct rq *rq = rq_of_rt_rq(rt_rq);
                rt_rq->highest_prio = rt_se_prio(rt_se);
+ #ifdef CONFIG_SMP
+               if (rq->online)
+                       cpupri_set(&rq->rd->cpupri, rq->cpu,
+                                  rt_se_prio(rt_se));
+ #endif
+       }
  #endif
  #ifdef CONFIG_SMP
        if (rt_se->nr_cpus_allowed > 1) {
                struct rq *rq = rq_of_rt_rq(rt_rq);
                rq->rt.rt_nr_migratory++;
        }
  
  static inline
  void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
+ #ifdef CONFIG_SMP
+       int highest_prio = rt_rq->highest_prio;
+ #endif
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running--;
                rq->rt.rt_nr_migratory--;
        }
  
+       if (rt_rq->highest_prio != highest_prio) {
+               struct rq *rq = rq_of_rt_rq(rt_rq);
+               if (rq->online)
+                       cpupri_set(&rq->rd->cpupri, rq->cpu,
+                                  rt_rq->highest_prio);
+       }
        update_rt_migration(rq_of_rt_rq(rt_rq));
  #endif /* CONFIG_SMP */
  #ifdef CONFIG_RT_GROUP_SCHED
@@@ -455,6 -588,7 +588,7 @@@ static void __enqueue_rt_entity(struct 
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
        struct rt_rq *group_rq = group_rt_rq(rt_se);
+       struct list_head *queue = array->queue + rt_se_prio(rt_se);
  
        /*
         * Don't enqueue the group if it's throttled, or when empty.
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
  
-       list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
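+       /*
+        * Queue entities that cannot migrate at the head of their priority
+        * list, ahead of migratable peers which the push logic can move to
+        * another cpu.
+        */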
+       if (rt_se->nr_cpus_allowed == 1)
+               list_add(&rt_se->run_list, queue);
+       else
+               list_add_tail(&rt_se->run_list, queue);
        __set_bit(rt_se_prio(rt_se), array->bitmap);
  
        inc_rt_tasks(rt_se, rt_rq);
@@@ -532,6 -670,8 +670,8 @@@ static void enqueue_task_rt(struct rq *
                rt_se->timeout = 0;
  
        enqueue_rt_entity(rt_se);
+       inc_cpu_load(rq, p->se.load.weight);
  }
  
  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
  
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se);
+       dec_cpu_load(rq, p->se.load.weight);
  }
  
  /*
@@@ -550,10 -692,12 +692,12 @@@ stati
  void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
  {
        struct rt_prio_array *array = &rt_rq->active;
-       struct list_head *queue = array->queue + rt_se_prio(rt_se);
  
-       if (on_rt_rq(rt_se))
-               list_move_tail(&rt_se->run_list, queue);
+       if (on_rt_rq(rt_se)) {
+               list_del_init(&rt_se->run_list);
+               list_add_tail(&rt_se->run_list,
+                             array->queue + rt_se_prio(rt_se));
+       }
  }
  
  static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@@ -616,8 -760,37 +760,37 @@@ static int select_task_rq_rt(struct tas
   */
  static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
  {
-       if (p->prio < rq->curr->prio)
+       if (p->prio < rq->curr->prio) {
                resched_task(rq->curr);
+               return;
+       }
+ #ifdef CONFIG_SMP
+       /*
+        * If:
+        *
+        * - the newly woken task is of equal priority to the current task
+        * - the newly woken task is non-migratable while current is migratable
+        * - current will be preempted on the next reschedule
+        *
+        * we should check to see if current can readily move to a different
+        * cpu.  If so, we will reschedule to allow the push logic to try
+        * to move current somewhere else, making room for our non-migratable
+        * task.
+        */
+       if ((p->prio == rq->curr->prio)
+          && p->rt.nr_cpus_allowed == 1
+          && rq->curr->rt.nr_cpus_allowed != 1) {
+               cpumask_t mask;
+               if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+                       /*
+                        * There appear to be other cpus that can accept
+                        * current, so let's reschedule to try and push it away
+                        */
+                       resched_task(rq->curr);
+       }
+ #endif
  }
  
  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
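
The comment block above spells out the new wake-up rule: besides the usual strict-priority preemption, an equal-priority but pinned waker triggers a reschedule when the currently running task could migrate elsewhere. A self-contained userspace sketch of that decision, with cpupri_find() replaced by a stub and all names below being illustrative rather than kernel code:

/*
 * Userspace sketch of the wakeup-preemption decision: preempt on strictly
 * higher priority, and also reschedule when an equal-priority, pinned task
 * wakes while the running task could move to another CPU.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_task {
	int prio;		/* lower value == higher priority */
	int nr_cpus_allowed;
};

/* Stand-in for cpupri_find(): could some other CPU take 'curr'? */
static bool other_cpu_can_take(const struct fake_task *curr)
{
	return curr->nr_cpus_allowed > 1;	/* simplified assumption */
}

static bool should_resched(const struct fake_task *waking,
			   const struct fake_task *curr)
{
	if (waking->prio < curr->prio)
		return true;
	if (waking->prio == curr->prio &&
	    waking->nr_cpus_allowed == 1 &&
	    curr->nr_cpus_allowed != 1)
		return other_cpu_can_take(curr);
	return false;
}

int main(void)
{
	struct fake_task pinned = { .prio = 50, .nr_cpus_allowed = 1 };
	struct fake_task curr   = { .prio = 50, .nr_cpus_allowed = 4 };

	printf("resched: %s\n", should_resched(&pinned, &curr) ? "yes" : "no");
	return 0;
}
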
@@@ -720,73 -893,6 +893,6 @@@ static struct task_struct *pick_next_hi
  
  static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
  
- static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
- {
-       int       lowest_prio = -1;
-       int       lowest_cpu  = -1;
-       int       count       = 0;
-       int       cpu;
-       cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-       /*
-        * Scan each rq for the lowest prio.
-        */
-       for_each_cpu_mask(cpu, *lowest_mask) {
-               struct rq *rq = cpu_rq(cpu);
-               /* We look for lowest RT prio or non-rt CPU */
-               if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                       /*
-                        * if we already found a low RT queue
-                        * and now we found this non-rt queue
-                        * clear the mask and set our bit.
-                        * Otherwise just return the queue as is
-                        * and the count==1 will cause the algorithm
-                        * to use the first bit found.
-                        */
-                       if (lowest_cpu != -1) {
-                               cpus_clear(*lowest_mask);
-                               cpu_set(rq->cpu, *lowest_mask);
-                       }
-                       return 1;
-               }
-               /* no locking for now */
-               if ((rq->rt.highest_prio > task->prio)
-                   && (rq->rt.highest_prio >= lowest_prio)) {
-                       if (rq->rt.highest_prio > lowest_prio) {
-                               /* new low - clear old data */
-                               lowest_prio = rq->rt.highest_prio;
-                               lowest_cpu = cpu;
-                               count = 0;
-                       }
-                       count++;
-               } else
-                       cpu_clear(cpu, *lowest_mask);
-       }
-       /*
-        * Clear out all the set bits that represent
-        * runqueues that were of higher prio than
-        * the lowest_prio.
-        */
-       if (lowest_cpu > 0) {
-               /*
-                * Perhaps we could add another cpumask op to
-                * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-                * Then that could be optimized to use memset and such.
-                */
-               for_each_cpu_mask(cpu, *lowest_mask) {
-                       if (cpu >= lowest_cpu)
-                               break;
-                       cpu_clear(cpu, *lowest_mask);
-               }
-       }
-       return count;
- }
  static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
  {
        int first;
@@@ -808,17 -914,12 +914,12 @@@ static int find_lowest_rq(struct task_s
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
-       int count    = find_lowest_cpus(task, lowest_mask);
  
-       if (!count)
-               return -1; /* No targets found */
+       if (task->rt.nr_cpus_allowed == 1)
+               return -1; /* No other targets possible */
  
-       /*
-        * There is no sense in performing an optimal search if only one
-        * target is found.
-        */
-       if (count == 1)
-               return first_cpu(*lowest_mask);
+       if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+               return -1; /* No targets found */
  
        /*
         * At this point we have built a mask of cpus representing the
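
find_lowest_rq() now delegates the per-runqueue scan to cpupri_find(), which fills lowest_mask with candidate CPUs. The userspace model below only illustrates the general idea of one CPU mask per priority class, searched from the lowest class upward; the real kernel/sched_cpupri.c implementation may differ in detail, and all names here are made up:

/*
 * Userspace model of a cpupri-style lookup: keep a CPU mask per priority
 * class and return the first class strictly lower than the task's class
 * that intersects the task's allowed CPUs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_PRIO_CLASSES 102	/* e.g. idle, normal, 100 rt levels */

struct model_cpupri {
	uint64_t cpu_mask[NR_PRIO_CLASSES];	/* CPUs running at each class */
};

/* Record that 'cpu' now runs at 'newclass' (real cpupri does this in O(1)). */
static void model_set(struct model_cpupri *cp, int cpu, int newclass)
{
	for (int class = 0; class < NR_PRIO_CLASSES; class++)
		cp->cpu_mask[class] &= ~(1ULL << cpu);	/* drop any old class */
	cp->cpu_mask[newclass] |= 1ULL << cpu;
}

/* Find CPUs in a class lower than 'task_class' that intersect 'allowed'. */
static bool model_find(const struct model_cpupri *cp, int task_class,
		       uint64_t allowed, uint64_t *lowest_mask)
{
	for (int class = 0; class < task_class; class++) {
		uint64_t m = cp->cpu_mask[class] & allowed;
		if (m) {
			*lowest_mask = m;
			return true;
		}
	}
	return false;
}

int main(void)
{
	struct model_cpupri cp = { { 0 } };
	uint64_t mask;

	model_set(&cp, 0, 10);	/* cpu0 busy at class 10 */
	model_set(&cp, 1, 3);	/* cpu1 busy at class 3  */

	if (model_find(&cp, 7, 0x3, &mask))	/* class-7 task, allowed on cpu0+1 */
		printf("candidate mask: %#llx\n", (unsigned long long)mask);
	return 0;
}
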
@@@ -1006,7 -1107,7 +1107,7 @@@ static int pull_rt_task(struct rq *this
  
        next = pick_next_task_rt(this_rq);
  
 -      for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
 +      for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
  
@@@ -1163,17 -1264,25 +1264,25 @@@ static void set_cpus_allowed_rt(struct 
  }
  
  /* Assumes rq->lock is held */
- static void join_domain_rt(struct rq *rq)
+ static void rq_online_rt(struct rq *rq)
  {
        if (rq->rt.overloaded)
                rt_set_overload(rq);
+       __enable_runtime(rq);
+       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
  }
  
  /* Assumes rq->lock is held */
- static void leave_domain_rt(struct rq *rq)
+ static void rq_offline_rt(struct rq *rq)
  {
        if (rq->rt.overloaded)
                rt_clear_overload(rq);
+       __disable_runtime(rq);
+       cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
  }
  
  /*
@@@ -1336,8 -1445,8 +1445,8 @@@ static const struct sched_class rt_sche
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
-       .join_domain            = join_domain_rt,
-       .leave_domain           = leave_domain_rt,
+       .rq_online              = rq_online_rt,
+       .rq_offline             = rq_offline_rt,
        .pre_schedule           = pre_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_wake_up           = task_wake_up_rt,
        .prio_changed           = prio_changed_rt,
        .switched_to            = switched_to_rt,
  };
+ #ifdef CONFIG_SCHED_DEBUG
+ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+ static void print_rt_stats(struct seq_file *m, int cpu)
+ {
+       struct rt_rq *rt_rq;
+       rcu_read_lock();
+       for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+               print_rt_rq(m, cpu, rt_rq);
+       rcu_read_unlock();
+ }
+ #endif /* CONFIG_SCHED_DEBUG */
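
The rq_online_rt()/rq_offline_rt() pair above publishes a runqueue's highest rt priority to cpupri when the CPU joins its root domain and retracts it (CPUPRI_INVALID) when it leaves. A small userspace sketch of that online/offline callback pattern, with illustrative names only and no claim to match the kernel's sched_class layout:

/*
 * Userspace sketch: a per-class ops table is notified when a runqueue
 * joins or leaves the active set, so class-specific state (here just a
 * "published priority") is set up and torn down in one place.
 */
#include <stdio.h>

#define PRIO_INVALID (-1)

struct model_rq {
	int cpu;
	int highest_prio;
	int published_prio;	/* what peer CPUs are allowed to see */
};

struct model_class_ops {
	void (*rq_online)(struct model_rq *rq);
	void (*rq_offline)(struct model_rq *rq);
};

static void rt_online(struct model_rq *rq)
{
	/* publish the current priority so peers may push work here */
	rq->published_prio = rq->highest_prio;
	printf("cpu%d online, prio %d visible\n", rq->cpu, rq->published_prio);
}

static void rt_offline(struct model_rq *rq)
{
	/* retract it so an offline CPU is never selected as a target */
	rq->published_prio = PRIO_INVALID;
	printf("cpu%d offline, hidden from peers\n", rq->cpu);
}

static const struct model_class_ops rt_ops = {
	.rq_online  = rt_online,
	.rq_offline = rt_offline,
};

int main(void)
{
	struct model_rq rq = { .cpu = 2, .highest_prio = 40,
			       .published_prio = PRIO_INVALID };

	rt_ops.rq_online(&rq);
	rt_ops.rq_offline(&rq);
	return 0;
}
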
diff --combined kernel/time/tick-broadcast.c
@@@ -30,6 -30,7 +30,7 @@@
  struct tick_device tick_broadcast_device;
  static cpumask_t tick_broadcast_mask;
  static DEFINE_SPINLOCK(tick_broadcast_lock);
+ static int tick_broadcast_force;
  
  #ifdef CONFIG_TICK_ONESHOT
  static void tick_broadcast_clear_oneshot(int cpu);
@@@ -232,10 -233,11 +233,11 @@@ static void tick_do_broadcast_on_off(vo
                                                     CLOCK_EVT_MODE_SHUTDOWN);
                }
                if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
-                       dev->features |= CLOCK_EVT_FEAT_DUMMY;
+                       tick_broadcast_force = 1;
                break;
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-               if (cpu_isset(cpu, tick_broadcast_mask)) {
+               if (!tick_broadcast_force &&
+                   cpu_isset(cpu, tick_broadcast_mask)) {
                        cpu_clear(cpu, tick_broadcast_mask);
                        if (td->mode == TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
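
The new tick_broadcast_force flag makes a forced broadcast sticky: after a BROADCAST_FORCE request, a plain BROADCAST_OFF no longer removes the CPU from the broadcast mask. A userspace model of that state handling (enum values and names are illustrative, not the kernel's):

/*
 * Userspace model of the "sticky force" handling: once broadcast has been
 * forced on, an ordinary OFF request is ignored.
 */
#include <stdbool.h>
#include <stdio.h>

enum { BCAST_ON, BCAST_OFF, BCAST_FORCE };

static bool in_broadcast_mask;
static bool broadcast_force;

static void broadcast_on_off(int reason)
{
	switch (reason) {
	case BCAST_ON:
	case BCAST_FORCE:
		in_broadcast_mask = true;
		if (reason == BCAST_FORCE)
			broadcast_force = true;	/* remember the force */
		break;
	case BCAST_OFF:
		if (!broadcast_force)		/* forced state is sticky */
			in_broadcast_mask = false;
		break;
	}
}

int main(void)
{
	broadcast_on_off(BCAST_FORCE);
	broadcast_on_off(BCAST_OFF);
	printf("still broadcasting: %s\n", in_broadcast_mask ? "yes" : "no");
	return 0;
}
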
@@@ -266,7 -268,7 +268,7 @@@ void tick_broadcast_on_off(unsigned lon
                       "offline CPU #%d\n", *oncpu);
        else
                smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
-                                        &reason, 1, 1);
+                                        &reason, 1);
  }
  
  /*
@@@ -397,7 -399,8 +399,7 @@@ again
        mask = CPU_MASK_NONE;
        now = ktime_get();
        /* Find all expired events */
 -      for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
 -           cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
 +      for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
                td = &per_cpu(tick_cpu_device, cpu);
                if (td->evtdev->next_event.tv64 <= now.tv64)
                        cpu_set(cpu, mask);
diff --combined net/core/dev.c
@@@ -2261,7 -2261,7 +2261,7 @@@ out
         */
        if (!cpus_empty(net_dma.channel_mask)) {
                int chan_idx;
 -              for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
 +              for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
                        struct dma_chan *chan = net_dma.channels[chan_idx];
                        if (chan)
                                dma_async_memcpy_issue_pending(chan);
@@@ -4322,7 -4322,7 +4322,7 @@@ static void net_dma_rebalance(struct ne
        i = 0;
        cpu = first_cpu(cpu_online_map);
  
 -      for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
 +      for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
                chan = net_dma->channels[chan_idx];
  
                n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
@@@ -4585,8 -4585,8 +4585,8 @@@ static int __init net_dev_init(void
  
        dev_boot_phase = 0;
  
-       open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
-       open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
+       open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+       open_softirq(NET_RX_SOFTIRQ, net_rx_action);
  
        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
diff --combined net/iucv/iucv.c
@@@ -480,7 -480,7 +480,7 @@@ static void iucv_setmask_mp(void
                if (cpu_isset(cpu, iucv_buffer_cpumask) &&
                    !cpu_isset(cpu, iucv_irq_cpumask))
                        smp_call_function_single(cpu, iucv_allow_cpu,
-                                                NULL, 0, 1);
+                                                NULL, 1);
        preempt_enable();
  }
  
@@@ -497,8 -497,8 +497,8 @@@ static void iucv_setmask_up(void
        /* Disable all cpus but the first in iucv_irq_cpumask. */
        cpumask = iucv_irq_cpumask;
        cpu_clear(first_cpu(iucv_irq_cpumask), cpumask);
 -      for_each_cpu_mask(cpu, cpumask)
 +      for_each_cpu_mask_nr(cpu, cpumask)
-               smp_call_function_single(cpu, iucv_block_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_block_cpu, NULL, 1);
  }
  
  /**
@@@ -523,7 -523,7 +523,7 @@@ static int iucv_enable(void
        rc = -EIO;
        preempt_disable();
        for_each_online_cpu(cpu)
-               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
        preempt_enable();
        if (cpus_empty(iucv_buffer_cpumask))
                /* No cpu could declare an iucv buffer. */
@@@ -545,7 -545,7 +545,7 @@@ out
   */
  static void iucv_disable(void)
  {
-       on_each_cpu(iucv_retrieve_cpu, NULL, 0, 1);
+       on_each_cpu(iucv_retrieve_cpu, NULL, 1);
        kfree(iucv_path_table);
  }
  
@@@ -580,7 -580,7 +580,7 @@@ static int __cpuinit iucv_cpu_notify(st
        case CPU_ONLINE_FROZEN:
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
-               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                if (cpus_empty(cpumask))
                        /* Can't offline last IUCV enabled cpu. */
                        return NOTIFY_BAD;
-               smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 0, 1);
+               smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
                if (cpus_empty(iucv_irq_cpumask))
                        smp_call_function_single(first_cpu(iucv_buffer_cpumask),
-                                                iucv_allow_cpu, NULL, 0, 1);
+                                                iucv_allow_cpu, NULL, 1);
                break;
        }
        return NOTIFY_OK;
@@@ -652,7 -652,7 +652,7 @@@ static void iucv_cleanup_queue(void
         * pending interrupts force them to the work queue by calling
         * an empty function on all cpus.
         */
-       smp_call_function(__iucv_cleanup_queue, NULL, 0, 1);
+       smp_call_function(__iucv_cleanup_queue, NULL, 1);
        spin_lock_irq(&iucv_queue_lock);
        list_for_each_entry_safe(p, n, &iucv_task_queue, list) {
                /* Remove stale work items from the task queue. */
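
The comment above relies on synchronously running an empty function on all CPUs as a flush barrier: because the call only returns once every CPU has executed it, any interrupt work queued before it must already have been handed over. A userspace analogue using per-worker FIFO queues and pthreads (illustrative only, compile with -lpthread); none of these names are from the kernel:

/*
 * Userspace analogue of the flush trick: queue a no-op marker to every
 * worker and wait for it.  Each queue is FIFO, so once the marker has run
 * everywhere, everything queued earlier has been processed.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NR_WORKERS 2
#define QUEUE_LEN  16

struct work { void (*fn)(void *); void *arg; };

struct worker {
	pthread_t thread;
	pthread_mutex_t lock;
	pthread_cond_t cond;
	struct work queue[QUEUE_LEN];
	int head, tail;
};

static struct worker workers[NR_WORKERS];

static void queue_work(struct worker *w, void (*fn)(void *), void *arg)
{
	pthread_mutex_lock(&w->lock);
	w->queue[w->tail++ % QUEUE_LEN] = (struct work){ fn, arg };
	pthread_cond_signal(&w->cond);
	pthread_mutex_unlock(&w->lock);
}

static void *worker_fn(void *arg)
{
	struct worker *w = arg;

	for (;;) {
		struct work item;

		pthread_mutex_lock(&w->lock);
		while (w->head == w->tail)
			pthread_cond_wait(&w->cond, &w->lock);
		item = w->queue[w->head++ % QUEUE_LEN];
		pthread_mutex_unlock(&w->lock);
		item.fn(item.arg);
	}
	return NULL;
}

static void real_work(void *arg)
{
	printf("processed item %ld\n", (long)arg);
}

/* The "empty" function: does nothing except signal that it ran. */
static void flush_marker(void *arg)
{
	sem_post(arg);
}

/* Queue the marker everywhere and wait: all earlier work is now done. */
static void flush_all(void)
{
	sem_t done;
	int i;

	sem_init(&done, 0, 0);
	for (i = 0; i < NR_WORKERS; i++)
		queue_work(&workers[i], flush_marker, &done);
	for (i = 0; i < NR_WORKERS; i++)
		sem_wait(&done);
	sem_destroy(&done);
}

int main(void)
{
	int i;

	for (i = 0; i < NR_WORKERS; i++) {
		pthread_mutex_init(&workers[i].lock, NULL);
		pthread_cond_init(&workers[i].cond, NULL);
		pthread_create(&workers[i].thread, NULL, worker_fn, &workers[i]);
	}
	queue_work(&workers[0], real_work, (void *)1L);
	queue_work(&workers[1], real_work, (void *)2L);
	flush_all();
	printf("queues drained\n");	/* both items were processed first */
	return 0;
}
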
@@@ -1559,16 -1559,11 +1559,11 @@@ static void iucv_external_interrupt(u1
  
        p = iucv_irq_data[smp_processor_id()];
        if (p->ippathid >= iucv_max_pathid) {
-               printk(KERN_WARNING "iucv_do_int: Got interrupt with "
-                      "pathid %d > max_connections (%ld)\n",
-                      p->ippathid, iucv_max_pathid - 1);
+               WARN_ON(p->ippathid >= iucv_max_pathid);
                iucv_sever_pathid(p->ippathid, iucv_error_no_listener);
                return;
        }
-       if (p->iptype  < 0x01 || p->iptype > 0x09) {
-               printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n");
-               return;
-       }
+       BUG_ON(p->iptype  < 0x01 || p->iptype > 0x09);
        work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC);
        if (!work) {
                printk(KERN_WARNING "iucv_external_interrupt: out of memory\n");