/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/hw_breakpoint.h>
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
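/*
 * Illustrative sketch (not from this file): a subsystem that wants to react
 * to this CPU entering and leaving idle could hook in as below.  The names
 * my_idle_cb and my_idle_nb are hypothetical.
 *
 *	static int my_idle_cb(struct notifier_block *nb,
 *			      unsigned long action, void *unused)
 *	{
 *		return (action == IDLE_START || action == IDLE_END)
 *			? NOTIFY_OK : NOTIFY_DONE;
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_cb,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */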
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
182 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
183 printk_address(regs->ip, 1);
184 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
185 regs->sp, regs->flags);
186 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
187 regs->ax, regs->bx, regs->cx);
188 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
189 regs->dx, regs->si, regs->di);
190 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
191 regs->bp, regs->r8, regs->r9);
192 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
193 regs->r10, regs->r11, regs->r12);
194 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
195 regs->r13, regs->r14, regs->r15);
197 asm("movl %%ds,%0" : "=r" (ds));
198 asm("movl %%cs,%0" : "=r" (cs));
199 asm("movl %%es,%0" : "=r" (es));
200 asm("movl %%fs,%0" : "=r" (fsindex));
201 asm("movl %%gs,%0" : "=r" (gsindex));
203 rdmsrl(MSR_FS_BASE, fs);
204 rdmsrl(MSR_GS_BASE, gs);
205 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
215 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
216 fs, fsindex, gs, gsindex, shadowgs);
217 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
219 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
225 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
229 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
	if (unlikely(dead_task->thread.debugreg7))
		flush_thread_hw_breakpoint(dead_task);
}
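/*
 * The two helpers below stash an fs/gs base that fits in 32 bits into a
 * GDT TLS slot instead of the FS/GS base MSRs; as the comments in
 * do_arch_prctl() note, reloading a selector is faster than a wrmsr.
 */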
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;

	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
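/*
 * copy_thread() sets up the kernel stack and thread_struct of a freshly
 * forked task: the child gets a copy of the parent's register frame at the
 * top of its stack, and ret_from_fork (declared above) later takes it out
 * to user mode.
 */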
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs + 1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);
	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
		if (copy_thread_hw_breakpoint(me, p, clone_flags))
			goto out;

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}
	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	if (err)
		flush_thread_hw_breakpoint(p);

	return err;
}
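/*
 * start_thread() runs on the exec path: it resets the user segments to the
 * flat defaults and aims the saved register frame at the freshly loaded
 * binary's entry point and stack.
 */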
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);
	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);
	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */
	unlazy_fpu(prev_p);
	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		     (unsigned long)task_stack_page(next_p) +
		     THREAD_SIZE - KERNEL_STACK_OFFSET);
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);
	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	/*
	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
	 * call before current is updated. Suppose a kernel breakpoint is
	 * triggered in between the two: the hw-breakpoint handler will see
	 * that 'current' does not have the TIF_DEBUG flag set, will think the
	 * breakpoint is leftover from an old task (lazy switching), and will
	 * erase it. Then, until the next context switch, no user-breakpoints
	 * will be installed.
	 *
	 * The real problem is that it's impossible to update both current and
	 * the physical debug registers at the same instant, so there will
	 * always be a window in which they disagree and a breakpoint might
	 * get triggered. Since we use lazy switching, we are forced to assume
	 * that a disagreement means that current is correct and the exception
	 * is due to lazy debug register switching.
	 */
	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
		arch_install_thread_hw_breakpoint(next_p);

	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user's setup and should really use two
	 * separate bits. But 64bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that 32bit children
	 * are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
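/*
 * get_wchan() walks the sleeping task's frame-pointer chain (bounded to 16
 * frames) and reports the first return address outside the scheduler, i.e.
 * the place where the task went to sleep.
 */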
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack + THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp + 8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
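/*
 * Illustrative userspace sketch (not part of this file): arch_prctl() is
 * typically reached via syscall(2).  The tls_block pointer below is
 * hypothetical.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 */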