x86_64, traps: Rework bad_iret
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6913369..56d013f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -81,15 +81,6 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
 EXPORT_SYMBOL_GPL(used_vectors);
 
-static int ignore_nmis;
-
-int unknown_nmi_panic;
-/*
- * Prevent NMI reason port (0x61) being accessed simultaneously, can
- * only be used in NMI handler.
- */
-static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
-
 static inline void conditional_sti(struct pt_regs *regs)
 {
        if (regs->flags & X86_EFLAGS_IF)
@@ -128,7 +119,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
                 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
                 * On nmi (interrupt 2), do_trap should not be called.
                 */
-               if (trapnr < 6)
+               if (trapnr < X86_TRAP_UD)
                        goto vm86_trap;
                goto trap_signal;
        }
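
Since this hunk swaps the magic number 6 for X86_TRAP_UD, note that the constant is vector 6, so the check still covers exactly the vectors 0-5 discussed in the comment above. A compile-time guard could make that explicit; a minimal sketch using the kernel's BUILD_BUG_ON (placement hypothetical):

        /* #DE, #DB, NMI, #BP, #OF and #BR (vectors 0-5) all precede #UD. */
        BUILD_BUG_ON(X86_TRAP_UD != 6);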
@@ -212,40 +203,56 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code)       \
        do_trap(trapnr, signr, str, regs, error_code, &info);           \
 }
 
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-#ifdef CONFIG_X86_32
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-#endif
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
+               regs->ip)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
+               regs->ip)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
+               coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
+DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
+               BUS_ADRALN, 0)
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
-       if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
-                       12, SIGBUS) == NOTIFY_STOP)
-               return;
-       preempt_conditional_sti(regs);
-       do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
-       preempt_conditional_cli(regs);
-}
-
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 {
        static const char str[] = "double fault";
        struct task_struct *tsk = current;
 
+#ifdef CONFIG_X86_ESPFIX64
+       extern unsigned char native_irq_return_iret[];
+
+       /*
+        * If IRET takes a non-IST fault on the espfix64 stack, then we
+        * end up promoting it to a doublefault.  In that case, modify
+        * the stack to make it look like we just entered the #GP
+        * handler from user space, similar to bad_iret.
+        */
+       if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+               regs->cs == __KERNEL_CS &&
+               regs->ip == (unsigned long)native_irq_return_iret)
+       {
+               struct pt_regs *normal_regs = task_pt_regs(current);
+
+               /* Fake a #GP(0) from userspace. */
+               memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
+               normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
+               regs->ip = (unsigned long)general_protection;
+               regs->sp = (unsigned long)&normal_regs->orig_ax;
+               return;
+       }
+#endif
+
        /* Return not checked because double fault cannot be ignored */
-       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+       notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
        tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 8;
+       tsk->thread.trap_no = X86_TRAP_DF;
 
        /*
         * This is always a kernel trap and never fixable (and thus must
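
The 5*8 bytes moved by the memmove() in the espfix64 branch above are the hardware IRET frame that the CPU pushed for the failed IRET; the same five words form the tail of struct pt_regs, which is why copying them to &normal_regs->ip lines up. A sketch of that frame, with a hypothetical struct name for illustration:

        struct iret_frame {             /* hypothetical name, for illustration */
                unsigned long ip;       /* RIP of the IRET target */
                unsigned long cs;       /* code segment */
                unsigned long flags;    /* RFLAGS */
                unsigned long sp;       /* RSP */
                unsigned long ss;       /* stack segment */
        };                              /* sizeof() == 5 * 8 on x86_64, matching the memmove above */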
@@ -273,7 +280,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
                goto gp_in_kernel;
 
        tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 13;
+       tsk->thread.trap_no = X86_TRAP_GP;
 
        if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                        printk_ratelimit()) {
@@ -300,170 +307,25 @@ gp_in_kernel:
                return;
 
        tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 13;
-       if (notify_die(DIE_GPF, "general protection fault", regs,
-                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
+       tsk->thread.trap_no = X86_TRAP_GP;
+       if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+                       X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
                return;
        die("general protection fault", regs, error_code);
 }
 
-static int __init setup_unknown_nmi_panic(char *str)
-{
-       unknown_nmi_panic = 1;
-       return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static notrace __kprobes void
-pci_serr_error(unsigned char reason, struct pt_regs *regs)
-{
-       pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
-                reason, smp_processor_id());
-
-       /*
-        * On some machines, PCI SERR line is used to report memory
-        * errors. EDAC makes use of it.
-        */
-#if defined(CONFIG_EDAC)
-       if (edac_handler_set()) {
-               edac_atomic_assert_error();
-               return;
-       }
-#endif
-
-       if (panic_on_unrecovered_nmi)
-               panic("NMI: Not continuing");
-
-       pr_emerg("Dazed and confused, but trying to continue\n");
-
-       /* Clear and disable the PCI SERR error line. */
-       reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
-       outb(reason, NMI_REASON_PORT);
-}
-
-static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs *regs)
-{
-       unsigned long i;
-
-       pr_emerg(
-       "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
-                reason, smp_processor_id());
-       show_registers(regs);
-
-       if (panic_on_io_nmi)
-               panic("NMI IOCK error: Not continuing");
-
-       /* Re-enable the IOCK line, wait for a few seconds */
-       reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
-       outb(reason, NMI_REASON_PORT);
-
-       i = 20000;
-       while (--i) {
-               touch_nmi_watchdog();
-               udelay(100);
-       }
-
-       reason &= ~NMI_REASON_CLEAR_IOCHK;
-       outb(reason, NMI_REASON_PORT);
-}
-
-static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
-{
-       if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
-                       NOTIFY_STOP)
-               return;
-#ifdef CONFIG_MCA
-       /*
-        * Might actually be able to figure out what the guilty party
-        * is:
-        */
-       if (MCA_bus) {
-               mca_handle_nmi();
-               return;
-       }
-#endif
-       pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-                reason, smp_processor_id());
-
-       pr_emerg("Do you have a strange power saving mode enabled?\n");
-       if (unknown_nmi_panic || panic_on_unrecovered_nmi)
-               panic("NMI: Not continuing");
-
-       pr_emerg("Dazed and confused, but trying to continue\n");
-}
-
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
-{
-       unsigned char reason = 0;
-
-       /*
-        * CPU-specific NMI must be processed before non-CPU-specific
-        * NMI, otherwise we may lose it, because the CPU-specific
-        * NMI can not be detected/processed on other CPUs.
-        */
-       if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
-               return;
-
-       /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
-       raw_spin_lock(&nmi_reason_lock);
-       reason = get_nmi_reason();
-
-       if (reason & NMI_REASON_MASK) {
-               if (reason & NMI_REASON_SERR)
-                       pci_serr_error(reason, regs);
-               else if (reason & NMI_REASON_IOCHK)
-                       io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
-               /*
-                * Reassert NMI in case it became active
-                * meanwhile as it's edge-triggered:
-                */
-               reassert_nmi();
-#endif
-               raw_spin_unlock(&nmi_reason_lock);
-               return;
-       }
-       raw_spin_unlock(&nmi_reason_lock);
-
-       unknown_nmi_error(reason, regs);
-}
-
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-       nmi_enter();
-
-       inc_irq_stat(__nmi_count);
-
-       if (!ignore_nmis)
-               default_do_nmi(regs);
-
-       nmi_exit();
-}
-
-void stop_nmi(void)
-{
-       ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
-       ignore_nmis--;
-}
-
 /* May run on IST stack. */
 dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
-       if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
-                       == NOTIFY_STOP)
+       if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+                               SIGTRAP) == NOTIFY_STOP)
                return;
 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 #ifdef CONFIG_KPROBES
-       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
-                       == NOTIFY_STOP)
+       if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+                       SIGTRAP) == NOTIFY_STOP)
                return;
 #else
        if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
@@ -472,7 +334,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 #endif
 
        preempt_conditional_sti(regs);
-       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+       do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
        preempt_conditional_cli(regs);
 }
 
@@ -501,6 +363,35 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
                *regs = *eregs;
        return regs;
 }
+
+struct bad_iret_stack {
+       void *error_entry_ret;
+       struct pt_regs regs;
+};
+
+asmlinkage
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+       /*
+        * This is called from entry_64.S early in handling a fault
+        * caused by a bad iret to user mode.  To handle the fault
+        * correctly, we want to move our stack frame to task_pt_regs
+        * and we want to pretend that the exception came from the
+        * iret target.
+        */
+       struct bad_iret_stack *new_stack =
+               container_of(task_pt_regs(current),
+                            struct bad_iret_stack, regs);
+
+       /* Copy the IRET target to the new stack. */
+       memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+       /* Copy the remainder of the stack from the current stack. */
+       memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+       BUG_ON(!user_mode_vm(&new_stack->regs));
+       return new_stack;
+}
 #endif
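
As a stand-alone illustration of the copy scheme above, the sketch below mimics the two memmove() steps in user space: first pull the five-word IRET frame into its final slot, then fill in everything below regs.ip. Struct names and values are hypothetical stand-ins; memmove() is used because, in the kernel, the old and new frames can overlap.

        #include <stdio.h>
        #include <string.h>
        #include <stddef.h>

        struct demo_regs {                              /* stand-in for struct pt_regs */
                unsigned long r15, r14;                 /* ...GPR block, truncated... */
                unsigned long orig_ax;
                unsigned long ip, cs, flags, sp, ss;    /* hardware IRET frame */
        };

        struct demo_bad_iret_stack {
                void *error_entry_ret;
                struct demo_regs regs;
        };

        int main(void)
        {
                unsigned long iret_target[5] = { 0x401000, 0x33, 0x246, 0x7ffde000, 0x2b };
                struct demo_bad_iret_stack old = { 0 }, new_stack = { 0 };

                old.regs.sp = (unsigned long)iret_target;       /* where the frame sits */

                /* Step 1: copy the IRET target frame to its final position. */
                memmove(&new_stack.regs.ip, (void *)old.regs.sp, 5 * 8);
                /* Step 2: copy the rest of the frame, up to but excluding regs.ip. */
                memmove(&new_stack, &old, offsetof(struct demo_bad_iret_stack, regs.ip));

                printf("ip=%#lx ss=%#lx\n", new_stack.regs.ip, new_stack.regs.ss);
                return 0;
        }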
 
 /*
@@ -570,8 +461,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
        preempt_conditional_sti(regs);
 
        if (regs->flags & X86_VM_MASK) {
-               handle_vm86_trap((struct kernel_vm86_regs *) regs,
-                               error_code, 1);
+               handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+                                       X86_TRAP_DB);
                preempt_conditional_cli(regs);
                return;
        }
@@ -606,7 +497,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
        struct task_struct *task = current;
        siginfo_t info;
        unsigned short err;
-       char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+       char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+                                               "simd exception";
 
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
                return;
@@ -631,7 +523,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
        info.si_signo = SIGFPE;
        info.si_errno = 0;
        info.si_addr = (void __user *)regs->ip;
-       if (trapnr == 16) {
+       if (trapnr == X86_TRAP_MF) {
                unsigned short cwd, swd;
                /*
                 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -675,10 +567,11 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
                info.si_code = FPE_FLTRES;
        } else {
                /*
-                * If we're using IRQ 13, or supposedly even some trap 16
-                * implementations, it's possible we get a spurious trap...
+                * If we're using IRQ 13, or supposedly even some
+                * X86_TRAP_MF implementations, it's possible we get
+                * a spurious trap, which is not an error.
                 */
-               return;         /* Spurious trap, no error */
+               return;
        }
        force_sig_info(SIGFPE, &info, task);
 }
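
The else-branch above is the tail of a chain that maps unmasked x87 status-word bits to an FPE_* si_code. A user-space sketch of that mapping (masks as in math_error(); FPE_* constants come from <signal.h>):

        #include <signal.h>
        #include <stdio.h>

        /* Map unmasked x87 exception bits (swd & ~cwd) to an FPE_* si_code. */
        static int x87_si_code(unsigned short err)
        {
                if (err & 0x001)
                        return FPE_FLTINV;      /* invalid operation */
                if (err & 0x004)
                        return FPE_FLTDIV;      /* divide by zero */
                if (err & 0x008)
                        return FPE_FLTOVF;      /* overflow */
                if (err & 0x012)
                        return FPE_FLTUND;      /* denormal or underflow */
                if (err & 0x020)
                        return FPE_FLTRES;      /* precision: inexact result */
                return 0;                       /* spurious trap: send no signal */
        }

        int main(void)
        {
                printf("zero-divide maps to FPE_FLTDIV: %d\n",
                       x87_si_code(0x004) == FPE_FLTDIV);
                return 0;
        }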
@@ -689,13 +582,13 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
        ignore_fpu_irq = 1;
 #endif
 
-       math_error(regs, error_code, 16);
+       math_error(regs, error_code, X86_TRAP_MF);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-       math_error(regs, error_code, 19);
+       math_error(regs, error_code, X86_TRAP_XF);
 }
 
 dotraplinkage void
@@ -717,25 +610,35 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 }
 
 /*
- * __math_state_restore assumes that cr0.TS is already clear and the
- * fpu state is all ready for use.  Used during context switch.
+ * This gets called with the process already owning the
+ * FPU state, and with CR0.TS cleared. It just needs to
+ * restore the FPU register state.
  */
-void __math_state_restore(void)
+void __math_state_restore(struct task_struct *tsk)
 {
-       struct thread_info *thread = current_thread_info();
-       struct task_struct *tsk = thread->task;
+       /* We need a safe address that is cheap to find and that is already
+          in L1. We've just brought in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
+
+       /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+          is pending.  Clear the x87 state here by setting it to fixed
+          values. safe_address is an arbitrary variable that should be in L1 */
+       if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) {
+               asm volatile(
+                       "fnclex\n\t"
+                       "emms\n\t"
+                       "fildl %P[addr]"        /* set F?P to defined value */
+                       : : [addr] "m" (safe_address));
+       }
 
        /*
         * Paranoid restore. send a SIGSEGV if we fail to restore the state.
         */
        if (unlikely(restore_fpu_checking(tsk))) {
-               stts();
+               __thread_fpu_end(tsk);
                force_sig(SIGSEGV, tsk);
                return;
        }
-
-       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
-       tsk->fpu_counter++;
 }
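
The fnclex/emms/fildl sequence above is self-contained enough to try from user space; a minimal sketch (GCC inline asm, x86 only, demo program hypothetical):

        #include <stdio.h>

        int main(void)
        {
                int dummy = 0;          /* any memory operand that is handy */

                /* Same idiom as the FXSAVE-leak workaround above: clear pending
                 * x87 exceptions, mark all stack registers empty, then do one
                 * load so FIP/FDP/FOP point at known-good values. */
                asm volatile("fnclex\n\t"
                             "emms\n\t"
                             "fildl %0"
                             : : "m" (dummy));
                puts("x87 FIP/FDP/FOP scrubbed");
                return 0;
        }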
 
 /*
@@ -745,13 +648,12 @@ void __math_state_restore(void)
  * Careful.. There are problems with IBM-designed IRQ13 behaviour.
  * Don't touch unless you *really* know how it works.
  *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
+ * Must be called with kernel preemption disabled (e.g. with local
+ * interrupts disabled, as in the case of do_device_not_available).
  */
-asmlinkage void math_state_restore(void)
+void math_state_restore(void)
 {
-       struct thread_info *thread = current_thread_info();
-       struct task_struct *tsk = thread->task;
+       struct task_struct *tsk = current;
 
        if (!tsk_used_math(tsk)) {
                local_irq_enable();
@@ -768,9 +670,10 @@ asmlinkage void math_state_restore(void)
                local_irq_disable();
        }
 
-       clts();                         /* Allow maths ops (or we recurse) */
+       __thread_fpu_begin(tsk);
+       __math_state_restore(tsk);
 
-       __math_state_restore();
+       tsk->fpu_counter++;
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 
@@ -804,20 +707,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
        info.si_errno = 0;
        info.si_code = ILL_BADSTK;
        info.si_addr = NULL;
-       if (notify_die(DIE_TRAP, "iret exception",
-                       regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+       if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+                       X86_TRAP_IRET, SIGILL) == NOTIFY_STOP)
                return;
-       do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+       do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+               &info);
 }
 #endif
 
 /* Set of traps needed for early debugging. */
 void __init early_trap_init(void)
 {
-       set_intr_gate_ist(1, &debug, DEBUG_STACK);
+       set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
        /* int3 can be called from all */
-       set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
-       set_intr_gate(14, &page_fault);
+       set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+       set_intr_gate(X86_TRAP_PF, &page_fault);
        load_idt(&idt_descr);
 }
 
@@ -833,30 +737,30 @@ void __init trap_init(void)
        early_iounmap(p, 4);
 #endif
 
-       set_intr_gate(0, &divide_error);
-       set_intr_gate_ist(2, &nmi, NMI_STACK);
+       set_intr_gate(X86_TRAP_DE, &divide_error);
+       set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
        /* int4 can be called from all */
-       set_system_intr_gate(4, &overflow);
-       set_intr_gate(5, &bounds);
-       set_intr_gate(6, &invalid_op);
-       set_intr_gate(7, &device_not_available);
+       set_system_intr_gate(X86_TRAP_OF, &overflow);
+       set_intr_gate(X86_TRAP_BR, &bounds);
+       set_intr_gate(X86_TRAP_UD, &invalid_op);
+       set_intr_gate(X86_TRAP_NM, &device_not_available);
 #ifdef CONFIG_X86_32
-       set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+       set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
 #else
-       set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+       set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
 #endif
-       set_intr_gate(9, &coprocessor_segment_overrun);
-       set_intr_gate(10, &invalid_TSS);
-       set_intr_gate(11, &segment_not_present);
-       set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
-       set_intr_gate(13, &general_protection);
-       set_intr_gate(15, &spurious_interrupt_bug);
-       set_intr_gate(16, &coprocessor_error);
-       set_intr_gate(17, &alignment_check);
+       set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
+       set_intr_gate(X86_TRAP_TS, &invalid_TSS);
+       set_intr_gate(X86_TRAP_NP, &segment_not_present);
+       set_intr_gate(X86_TRAP_SS, &stack_segment);
+       set_intr_gate(X86_TRAP_GP, &general_protection);
+       set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
+       set_intr_gate(X86_TRAP_MF, &coprocessor_error);
+       set_intr_gate(X86_TRAP_AC, &alignment_check);
 #ifdef CONFIG_X86_MCE
-       set_intr_gate_ist(18, &machine_check, MCE_STACK);
+       set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
 #endif
-       set_intr_gate(19, &simd_coprocessor_error);
+       set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
 
        /* Reserve all the builtin and the syscall vector: */
        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
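
For reference, the X86_TRAP_* constants used throughout this patch come from arch/x86/include/asm/traps.h and are simply the architectural vector numbers:

        enum {
                X86_TRAP_DE = 0,        /*  0, Divide-by-zero */
                X86_TRAP_DB,            /*  1, Debug */
                X86_TRAP_NMI,           /*  2, Non-maskable Interrupt */
                X86_TRAP_BP,            /*  3, Breakpoint */
                X86_TRAP_OF,            /*  4, Overflow */
                X86_TRAP_BR,            /*  5, Bound Range Exceeded */
                X86_TRAP_UD,            /*  6, Invalid Opcode */
                X86_TRAP_NM,            /*  7, Device Not Available */
                X86_TRAP_DF,            /*  8, Double Fault */
                X86_TRAP_OLD_MF,        /*  9, Coprocessor Segment Overrun */
                X86_TRAP_TS,            /* 10, Invalid TSS */
                X86_TRAP_NP,            /* 11, Segment Not Present */
                X86_TRAP_SS,            /* 12, Stack Segment Fault */
                X86_TRAP_GP,            /* 13, General Protection Fault */
                X86_TRAP_PF,            /* 14, Page Fault */
                X86_TRAP_SPURIOUS,      /* 15, Spurious Interrupt */
                X86_TRAP_MF,            /* 16, x87 Floating-Point Exception */
                X86_TRAP_AC,            /* 17, Alignment Check */
                X86_TRAP_MC,            /* 18, Machine Check */
                X86_TRAP_XF,            /* 19, SIMD Floating-Point Exception */
                X86_TRAP_IRET = 32,     /* 32, IRET Exception */
        };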