/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);
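
/*
 * Per-CPU bookkeeping: old_rsp holds the user stack pointer across
 * syscalls, and is_idle flags whether this CPU is in the idle loop.
 */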
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
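
/* Mark the current CPU idle and fire the IDLE_START notifier chain. */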
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);
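
	/* Read the current segment selectors straight from the CPU. */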
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));
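
	/*
	 * On x86-64 the FS/GS base addresses live in MSRs; the selectors
	 * alone say nothing about them, so fetch the bases explicitly.
	 */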
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	show_registers(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
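
/*
 * Set up the child's kernel stack, registers, and per-thread state for
 * fork/clone.  The child's pt_regs live at the top of its stack page.
 */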
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
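
	/*
	 * A task that uses ioperm() carries its own I/O permission bitmap;
	 * give the child a private copy so the two can diverge safely.
	 */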
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
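
/*
 * Common exec-time setup: reset the segment registers and user stack
 * pointer, then drop any FPU state left over from the old program.
 */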
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip	= new_ip;
	regs->sp	= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs	= _cs;
	regs->ss	= _ss;
	regs->flags	= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);
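
	/*
	 * Point the per-CPU kernel_stack reference at the incoming task's
	 * stack so the syscall entry path picks up the right stack.
	 */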
	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: overwrites user setup. Should have two bits.
	 * But 64bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that
	 * 32bit children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
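
/*
 * Figure out where a blocked (non-running) task is sleeping by walking
 * its saved frame pointers until an address outside the scheduler is hit.
 */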
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
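
/*
 * ARCH_SET_FS/ARCH_SET_GS policy: bases that fit in 32 bits go through a
 * GDT descriptor (cheap segment reload on context switch); larger bases
 * must be written to the FS/GS base MSRs instead.
 */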
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}