debug lockups: Improve lockup detection

author Ingo Molnar <mingo@elte.hu>

Sun, 2 Aug 2009 09:28:21 +0000 (11:28 +0200)

committer Ingo Molnar <mingo@elte.hu>

Sun, 2 Aug 2009 11:27:17 +0000 (13:27 +0200)
author Ingo Molnar <mingo@elte.hu>
Sun, 2 Aug 2009 09:28:21 +0000 (11:28 +0200)
committer Ingo Molnar <mingo@elte.hu>
Sun, 2 Aug 2009 11:27:17 +0000 (13:27 +0200)
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c

index b3025b4..1bb1ac2 100644 (file)
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
  int unknown_nmi_panic;
  int nmi_watchdog_enabled;
  
  int unknown_nmi_panic;
  int nmi_watchdog_enabled;
  
-static cpumask_var_t backtrace_mask;
+static cpumask_t backtrace_mask __read_mostly;
  
  /* nmi_active:
   * >0: the lapic NMI watchdog is active, but can be disabled
  
  /* nmi_active:
   * >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
         if (!prev_nmi_count)
                 goto error;
  
         if (!prev_nmi_count)
                 goto error;
  
-       alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
         printk(KERN_INFO "Testing NMI watchdog ... ");
  
  #ifdef CONFIG_SMP
         printk(KERN_INFO "Testing NMI watchdog ... ");
  
  #ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
         }
  
         /* We can be called before check_nmi_watchdog, hence NULL check. */
         }
  
         /* We can be called before check_nmi_watchdog, hence NULL check. */
-       if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
+       if (cpumask_test_cpu(cpu, &backtrace_mask)) {
                 static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
  
                 spin_lock(&lock);
                 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
                 static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
  
                 spin_lock(&lock);
                 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+               show_regs(regs);
                 dump_stack();
                 spin_unlock(&lock);
                 dump_stack();
                 spin_unlock(&lock);
-               cpumask_clear_cpu(cpu, backtrace_mask);
+               cpumask_clear_cpu(cpu, &backtrace_mask);
+
+               rc = 1;
         }
  
         /* Could check oops_in_progress here too, but it's safer not to */
         }
  
         /* Could check oops_in_progress here too, but it's safer not to */
@@ -556,10 +558,14 @@ void __trigger_all_cpu_backtrace(void)
  {
         int i;
  
  {
         int i;
  
-       cpumask_copy(backtrace_mask, cpu_online_mask);
+       cpumask_copy(&backtrace_mask, cpu_online_mask);
+
+       printk(KERN_INFO "sending NMI to all CPUs:\n");
+       apic->send_IPI_all(NMI_VECTOR);
+
         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
         for (i = 0; i < 10 * 1000; i++) {
         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
         for (i = 0; i < 10 * 1000; i++) {
-               if (cpumask_empty(backtrace_mask))
+               if (cpumask_empty(&backtrace_mask))
                         break;
                 mdelay(1);
         }
                         break;
                 mdelay(1);
         }
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c

index 5d7a02f..165f307 100644 (file)
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -24,6 +24,7 @@
  #include <linux/sysrq.h>
  #include <linux/kbd_kern.h>
  #include <linux/proc_fs.h>
  #include <linux/sysrq.h>
  #include <linux/kbd_kern.h>
  #include <linux/proc_fs.h>
+#include <linux/nmi.h>
  #include <linux/quotaops.h>
  #include <linux/perf_counter.h>
  #include <linux/kernel.h>
  #include <linux/quotaops.h>
  #include <linux/perf_counter.h>
  #include <linux/kernel.h>
@@ -222,12 +223,7 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus);
  
  static void sysrq_handle_showallcpus(int key, struct tty_struct *tty)
  {
  
  static void sysrq_handle_showallcpus(int key, struct tty_struct *tty)
  {
-       struct pt_regs *regs = get_irq_regs();
-       if (regs) {
-               printk(KERN_INFO "CPU%d:\n", smp_processor_id());
-               show_regs(regs);
-       }
-       schedule_work(&sysrq_showallcpus);
+       trigger_all_cpu_backtrace();
  }
  
  static struct sysrq_key_op sysrq_showallcpus_op = {
  }
  
  static struct sysrq_key_op sysrq_showallcpus_op = {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index 7717b95..9c5fa9f 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
  #include <linux/rcupdate.h>
  #include <linux/interrupt.h>
  #include <linux/sched.h>
  #include <linux/rcupdate.h>
  #include <linux/interrupt.h>
  #include <linux/sched.h>
+#include <linux/nmi.h>
  #include <asm/atomic.h>
  #include <linux/bitops.h>
  #include <linux/module.h>
  #include <asm/atomic.h>
  #include <linux/bitops.h>
  #include <linux/module.h>
@@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         }
         printk(" (detected by %d, t=%ld jiffies)\n",
                smp_processor_id(), (long)(jiffies - rsp->gp_start));
         }
         printk(" (detected by %d, t=%ld jiffies)\n",
                smp_processor_id(), (long)(jiffies - rsp->gp_start));
+       trigger_all_cpu_backtrace();
+
         force_quiescent_state(rsp, 0);  /* Kick them all. */
  }
  
         force_quiescent_state(rsp, 0);  /* Kick them all. */
  }
  
@@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
  
         printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
                         smp_processor_id(), jiffies - rsp->gp_start);
  
         printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
                         smp_processor_id(), jiffies - rsp->gp_start);
-       dump_stack();
+       trigger_all_cpu_backtrace();
+
         spin_lock_irqsave(&rnp->lock, flags);
         if ((long)(jiffies - rsp->jiffies_stall) >= 0)
                 rsp->jiffies_stall =
                         jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
         spin_unlock_irqrestore(&rnp->lock, flags);
         spin_lock_irqsave(&rnp->lock, flags);
         if ((long)(jiffies - rsp->jiffies_stall) >= 0)
                 rsp->jiffies_stall =
                         jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
         spin_unlock_irqrestore(&rnp->lock, flags);
+
         set_need_resched();  /* kick ourselves to get things going. */
  }
  
         set_need_resched();  /* kick ourselves to get things going. */
  }
author	Ingo Molnar <mingo@elte.hu>
	Sun, 2 Aug 2009 09:28:21 +0000 (11:28 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Sun, 2 Aug 2009 11:27:17 +0000 (13:27 +0200)
arch/x86/kernel/apic/nmi.c		patch \| blob \| history
drivers/char/sysrq.c		patch \| blob \| history
kernel/rcutree.c		patch \| blob \| history