Merge branch 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorLinus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Dec 2009 23:31:25 +0000 (15:31 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Dec 2009 23:31:25 +0000 (15:31 -0800)
* 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (30 commits)
  x86, apic: Enable lapic nmi watchdog on AMD Family 11h
  x86: Remove unnecessary mdelay() from cpu_disable_common()
  x86, ioapic: Document another case when level irq is seen as an edge
  x86, ioapic: Fix the EOI register detection mechanism
  x86, io-apic: Move the effort of clearing remoteIRR explicitly before migrating the irq
  x86: SGI UV: Map low MMR ranges
  x86: apic: Print out SRAT table APIC id in hex
  x86: Re-get cfg_new in case reuse/move irq_desc
  x86: apic: Remove not needed #ifdef
  x86: io-apic: IO-APIC MMIO should not fail on resource insertion
  x86: Remove asm/apicnum.h
  x86: apic: Do not use stacked physid_mask_t
  x86, apic: Get rid of apicid_to_cpu_present assign on 64-bit
  x86, ioapic: Use snrpintf while set names for IO-APIC resourses
  x86, apic: Use PAGE_SIZE instead of numbers
  x86: Remove local_irq_enable()/local_irq_disable() in fixup_irqs()
  x86: Use EOI register in io-apic on intel platforms
  x86: Force irq complete move during cpu offline
  x86: Remove move_cleanup_count from irq_cfg
  x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping
  ...

1  2 
Documentation/kernel-parameters.txt
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/irq.c

@@@ -85,6 -85,7 +85,6 @@@ parameter is applicable
        PPT     Parallel port support is enabled.
        PS2     Appropriate PS/2 support is enabled.
        RAM     RAM disk support is enabled.
 -      ROOTPLUG The example Root Plug LSM is enabled.
        S390    S390 architecture is enabled.
        SCSI    Appropriate SCSI support is enabled.
                        A lot of drivers have their options described inside of
@@@ -344,6 -345,15 +344,15 @@@ and is between 256 and 4096 characters
                        Change the amount of debugging information output
                        when initialising the APIC and IO-APIC components.
  
+       show_lapic=     [APIC,X86] Advanced Programmable Interrupt Controller
+                       Limit apic dumping. The parameter defines the maximal
+                       number of local apics being dumped. It can also be
+                       set to "all", meaning no limit.
+                       Format: { 1 (default) | 2 | ... | all }.
+                       The parameter is valid only if apic=debug or
+                       apic=verbose is specified.
+                       Example: apic=debug show_lapic=all
        apm=            [APM] Advanced Power Management
                        See header of arch/x86/kernel/apm_32.c.
  
                        by the set_ftrace_notrace file in the debugfs
                        tracing directory.
  
 +      ftrace_graph_filter=[function-list]
 +                      [FTRACE] Limit the top level callers functions traced
 +                      by the function graph tracer at boot up.
 +                      function-list is a comma separated list of functions
 +                      that can be changed at run time by the
 +                      set_graph_function file in the debugfs tracing directory.
 +
        gamecon.map[2|3]=
                        [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
                        support via parallel port (up to 5 devices per port)
  
        print-fatal-signals=
                        [KNL] debug: print fatal signals
 -                      print-fatal-signals=1: print segfault info to
 -                      the kernel console.
 +
 +                      If enabled, warn about various signal handling
 +                      related application anomalies: too many signals,
 +                      too many POSIX.1 timers, fatal signals causing a
 +                      coredump - etc.
 +
 +                      If you hit the warning due to signal overflow,
 +                      you might want to try "ulimit -i unlimited".
 +
                        default: off.
  
        printk.time=    Show timing data prefixed to each printk message line
                        Useful for devices that are detected asynchronously
                        (e.g. USB and MMC devices).
  
 -      root_plug.vendor_id=
 -                      [ROOTPLUG] Override the default vendor ID
 -
 -      root_plug.product_id=
 -                      [ROOTPLUG] Override the default product ID
 -
 -      root_plug.debug=
 -                      [ROOTPLUG] Enable debugging output
 -
        rw              [KNL] Mount root device read-write on boot
  
        S               [KNL] Run init in single mode
  
        sbni=           [NET] Granch SBNI12 leased line adapter
  
 +      sched_debug     [KNL] Enables verbose scheduler debug messages.
 +
        sc1200wdt=      [HW,WDT] SC1200 WDT (watchdog) driver
                        Format: <io>[,<timeout>[,<isapnp>]]
  
@@@ -352,14 -352,14 +352,14 @@@ static __init void get_lowmem_redirect(
  
        for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
                alias.v = uv_read_local_mmr(redir_addrs[i].alias);
 -              if (alias.s.base == 0) {
 +              if (alias.s.enable && alias.s.base == 0) {
                        *size = (1UL << alias.s.m_alias);
                        redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
                        *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
                        return;
                }
        }
 -      BUG();
 +      *base = *size = 0;
  }
  
  enum map_type {map_wb, map_uc};
@@@ -409,6 -409,12 +409,12 @@@ static __init void map_mmioh_high(int m
                map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
  }
  
+ static __init void map_low_mmrs(void)  /* boot-time helper: maps the two low MMR ranges; called first thing in uv_system_init(), before its first uv_read_local_mmr() */
+ {
+       init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);  /* global MMR32 range; "_uc" presumably means uncached (cf. enum map_type) - TODO confirm */
+       init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);  /* local MMR range, through the same mapping helper */
+ }
  static __init void uv_rtc_init(void)
  {
        long status;
@@@ -550,6 -556,8 +556,8 @@@ void __init uv_system_init(void
        unsigned long mmr_base, present, paddr;
        unsigned short pnode_mask;
  
+       map_low_mmrs();
        m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
        m_val = m_n_config.s.m_skt;
        n_val = m_n_config.s.n_skt;
                uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
                uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
                uv_cpu_hub_info(cpu)->m_val = m_val;
 -              uv_cpu_hub_info(cpu)->n_val = m_val;
 +              uv_cpu_hub_info(cpu)->n_val = n_val;
                uv_cpu_hub_info(cpu)->numa_blade_id = blade;
                uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
                uv_cpu_hub_info(cpu)->pnode = pnode;
                uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 -              uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
 +              uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
                uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
                uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
                uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
diff --combined arch/x86/kernel/irq.c
@@@ -63,10 -63,10 +63,10 @@@ static int show_other_interrupts(struc
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
        seq_printf(p, "  Spurious interrupts\n");
 -      seq_printf(p, "%*s: ", prec, "CNT");
 +      seq_printf(p, "%*s: ", prec, "PMI");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 -      seq_printf(p, "  Performance counter interrupts\n");
 +      seq_printf(p, "  Performance monitoring interrupts\n");
        seq_printf(p, "%*s: ", prec, "PND");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
                seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
        seq_printf(p, "  TLB shootdowns\n");
  #endif
 -#ifdef CONFIG_X86_MCE
 +#ifdef CONFIG_X86_THERMAL_VECTOR
        seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
        seq_printf(p, "  Thermal event interrupts\n");
 -# ifdef CONFIG_X86_MCE_THRESHOLD
 +#endif
 +#ifdef CONFIG_X86_MCE_THRESHOLD
        seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
        seq_printf(p, "  Threshold APIC interrupts\n");
 -# endif
  #endif
  #ifdef CONFIG_X86_MCE
        seq_printf(p, "%*s: ", prec, "MCE");
@@@ -194,11 -194,11 +194,11 @@@ u64 arch_irq_stat_cpu(unsigned int cpu
        sum += irq_stats(cpu)->irq_call_count;
        sum += irq_stats(cpu)->irq_tlb_count;
  #endif
 -#ifdef CONFIG_X86_MCE
 +#ifdef CONFIG_X86_THERMAL_VECTOR
        sum += irq_stats(cpu)->irq_thermal_count;
 -# ifdef CONFIG_X86_MCE_THRESHOLD
 +#endif
 +#ifdef CONFIG_X86_MCE_THRESHOLD
        sum += irq_stats(cpu)->irq_threshold_count;
 -# endif
  #endif
  #ifdef CONFIG_X86_MCE
        sum += per_cpu(mce_exception_count, cpu);
@@@ -244,6 -244,7 +244,6 @@@ unsigned int __irq_entry do_IRQ(struct 
                                __func__, smp_processor_id(), vector, irq);
        }
  
 -      run_local_timers();
        irq_exit();
  
        set_irq_regs(old_regs);
@@@ -268,9 -269,100 +268,99 @@@ void smp_generic_interrupt(struct pt_re
        if (generic_interrupt_extension)
                generic_interrupt_extension();
  
 -      run_local_timers();
        irq_exit();
  
        set_irq_regs(old_regs);
  }
  
  EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+ #ifdef CONFIG_HOTPLUG_CPU
+ /*
+  * A cpu has been removed from cpu_online_mask.  Reset irq affinities.
+  *
+  * The work is done in two passes:
+  *
+  *  1. Walk every irq descriptor.  For each irq that has an action and
+  *     whose affinity mask differs from cpu_online_mask, force-complete
+  *     any pending move and re-apply the affinity through the chip's
+  *     set_affinity hook, falling back to cpu_all_mask when the current
+  *     mask contains no online cpu.
+  *
+  *  2. After a short delay, scan this cpu's vector_irq table and, for
+  *     every vector still flagged in the local APIC IRR, ask the chip to
+  *     retrigger the corresponding irq.
+  *
+  * Takes no arguments and returns nothing; results are reported only via
+  * printk.
+  */
+ void fixup_irqs(void)
+ {
+       unsigned int irq, vector;
+       static int warned;  /* bumped each time a chip without a set_affinity hook is met; persists across calls */
+       struct irq_desc *desc;
+       for_each_irq_desc(irq, desc) {
+               int break_affinity = 0;  /* set when no online cpu is left in the irq's mask */
+               int set_affinity = 1;  /* cleared only for the first irq whose chip lacks set_affinity */
+               const struct cpumask *affinity;
+               if (!desc)
+                       continue;
+               if (irq == 2)  /* irq 2 is always skipped - presumably the legacy PIC cascade, TODO confirm */
+                       continue;
+               /* Interrupts are disabled at this point, hence the plain spin_lock(). */
+               spin_lock(&desc->lock);
+               affinity = desc->affinity;
+               if (!irq_has_action(irq) ||
+                   cpumask_equal(affinity, cpu_online_mask)) {  /* nothing to fix: no handler, or mask already equals the online set */
+                       spin_unlock(&desc->lock);
+                       continue;
+               }
+               /*
+                * Complete the irq move. This cpu is going down and for
+                * non intr-remapping case, we can't wait till this interrupt
+                * arrives at this cpu before completing the irq move.
+                */
+               irq_force_complete_move(irq);
+               if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {  /* no online cpu in the mask: fall back to all cpus */
+                       break_affinity = 1;
+                       affinity = cpu_all_mask;
+               }
+               if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)  /* IRQ_MOVE_PCNTXT irqs are retargeted without mask/unmask */
+                       desc->chip->mask(irq);
+               if (desc->chip->set_affinity)
+                       desc->chip->set_affinity(irq, affinity);
+               else if (!(warned++))  /* NOTE(review): later hook-less irqs keep set_affinity == 1 and may print "Broke affinity" - confirm intended */
+                       set_affinity = 0;
+               if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
+                       desc->chip->unmask(irq);
+               spin_unlock(&desc->lock);
+               if (break_affinity && set_affinity)  /* messages are printed only after desc->lock is dropped */
+                       printk("Broke affinity for irq %i\n", irq);
+               else if (!set_affinity)
+                       printk("Cannot set affinity for irq %i\n", irq);
+       }
+       /*
+        * We can remove mdelay() and then send spurious interrupts to
+        * new cpu targets for all the irqs that were handled previously by
+        * this cpu. While it works, I have seen spurious interrupt messages
+        * (nothing wrong but still...).
+        *
+        * So for now, retain mdelay(1) and check the IRR and then send those
+        * interrupts to new targets as this cpu is already offlined...
+        */
+       mdelay(1);
+       for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+               unsigned int irr;
+               if (__get_cpu_var(vector_irq)[vector] < 0)  /* negative entry: presumably no irq bound to this vector on this cpu */
+                       continue;
+               irr = apic_read(APIC_IRR + (vector / 32 * 0x10));  /* one IRR register per 32 vectors, registers 0x10 apart */
+               if (irr  & (1 << (vector % 32))) {  /* NOTE(review): 1 << 31 shifts into the sign bit of int; 1U would avoid that */
+                       irq = __get_cpu_var(vector_irq)[vector];
+                       desc = irq_to_desc(irq);  /* NOTE(review): result is used without a NULL check */
+                       spin_lock(&desc->lock);
+                       if (desc->chip->retrigger)
+                               desc->chip->retrigger(irq);
+                       spin_unlock(&desc->lock);
+               }
+       }
+ }
+ #endif