/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
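
/*
 * Example usage of the notifier API above (illustrative sketch only, not
 * part of this file; my_idle_notify and my_idle_nb are hypothetical names).
 * A driver that wants to save power across idle periods could do:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			quiesce_my_device();
 *		else if (action == IDLE_END)
 *			wake_my_device();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *	...
 *	idle_notifier_register(&my_idle_nb);
 *
 * The callback runs from the idle task via atomic_notifier_call_chain(),
 * so it must not sleep.
 */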
enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;

void enter_idle(void)
{
	__get_cpu_var(idle_state) = CPU_IDLE;
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* The idle task has pid 0; a nonzero irqcount means we are
	   nested inside an interrupt, so there is nothing to do. */
	if (current->pid | read_pda(irqcount))
		return;
	__exit_idle();
}
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	local_irq_enable();

	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();
	while (!need_resched()) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
	current_thread_info()->status |= TS_POLLING;
}
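
/*
 * Note on the TS_POLLING dance above (explanatory comment, added for
 * clarity): while TS_POLLING is set, a remote CPU doing a wakeup may skip
 * the cross-CPU reschedule IPI because this CPU promises to poll
 * need_resched() itself.  Before going into HLT that promise no longer
 * holds, so the flag is cleared first; smp_mb__after_clear_bit() makes the
 * clearing visible before need_resched() is re-checked, and the final
 * re-check with interrupts disabled closes the race against a wakeup that
 * slipped in between.
 */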
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();

	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
}
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
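
/*
 * How the handshake above works (explanatory note): cpu_idle_wait() is
 * called after pm_idle has been changed.  It sets cpu_idle_state for every
 * online CPU and then sleeps in one-second steps; each CPU clears its own
 * flag at the top of a fresh pass through the idle loop (see cpu_idle()
 * below), so once the map drains empty, every CPU has been observed
 * re-entering the loop and re-reading pm_idle, and none can still be
 * stuck in the old idle handler.
 */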
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			enter_idle();
			idle();
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	while (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (need_resched())
			break;
		__mwait(0, 0);
	}
}
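
/*
 * Note on the ordering above (explanatory): MONITOR arms address-range
 * monitoring on the cacheline holding thread_info->flags, so a remote
 * set_tsk_need_resched() writes into the monitored range and breaks MWAIT
 * out of its wait state without an IPI.  The smp_mb() plus the extra
 * need_resched() test close the window where the flag was set before
 * MONITOR was armed; without them we could MWAIT on a stale flag and
 * sleep through a wakeup.
 */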
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait.
		 */
		if (!pm_idle) {
			if (!printed) {
				printk("using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}
static int __init idle_setup (char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}

	boot_option_idle_override = 1;
	return 1;
}

__setup("idle=", idle_setup);
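
/*
 * Example (explanatory note): booting with "idle=poll" makes idle_setup()
 * install poll_idle as pm_idle, so select_idle_routine() later sees a
 * non-NULL pm_idle and leaves the user's choice alone even on
 * MWAIT-capable CPUs.  boot_option_idle_override is exported so other
 * code (such as ACPI idle setup) can also respect the override.
 */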
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		system_utsname.release,
		(int)strcspn(system_utsname.version, " "),
		system_utsname.version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);
311 asm("movl %%ds,%0" : "=r" (ds));
312 asm("movl %%cs,%0" : "=r" (cs));
313 asm("movl %%es,%0" : "=r" (es));
314 asm("movl %%fs,%0" : "=r" (fsindex));
315 asm("movl %%gs,%0" : "=r" (gsindex));
317 rdmsrl(MSR_FS_BASE, fs);
318 rdmsrl(MSR_GS_BASE, gs);
319 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
321 asm("movq %%cr0, %0": "=r" (cr0));
322 asm("movq %%cr2, %0": "=r" (cr2));
323 asm("movq %%cr3, %0": "=r" (cr3));
324 asm("movq %%cr4, %0": "=r" (cr4));
326 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
327 fs,fsindex,gs,gsindex,shadowgs);
328 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
329 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	if (t->flags & _TIF_ABI_PENDING) {
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
	(((u32)desc->base1) << 16) |
	(((u32)desc->base2) << 24);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
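
/*
 * The ## token-pasting above glues the register number onto the field
 * name, e.g. loaddebug(next, 7) expands to:
 *
 *	set_debugreg(next->debugreg7, 7);
 */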
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}
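
	/*
	 * Why the fs/gs switch is this convoluted (explanatory note): a
	 * thread's base can live in two places.  A nonzero selector refers
	 * to a GDT/LDT descriptor (32-bit bases only, as set up by
	 * set_32bit_tls()), while a 64-bit base sits directly in
	 * MSR_FS_BASE/MSR_KERNEL_GS_BASE with a null selector.  Writing the
	 * segment register also clobbers the hidden base, so the MSR must
	 * be rewritten afterwards, and prev's saved base is zeroed when a
	 * real selector was in use so a stale 64-bit base is never carried
	 * over (the information leak mentioned above).
	 */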

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	/* This must be here to ensure both math_state_restore() and
	   kernel_fpu_begin() work consistently.
	   And the AMD workaround requires it to be after DS reload. */
	unlazy_fpu(prev_p);
	write_pda(kernelstack,
		  task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);

	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg7)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	/*
	 * Handle the IO bitmap
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		if (next->io_bitmap_ptr)
			/*
			 * Copy the relevant range of the IO bitmap.
			 * Normally this is 128 bytes or less:
			 */
			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
				max(prev->io_bitmap_max, next->io_bitmap_max));
		else {
			/*
			 * Clear any possible leftover bits:
			 */
			memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
		}
	}

	return prev_p;
}
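
/*
 * Note on the IO bitmap handling above (explanatory, added for clarity):
 * the hardware consults the bitmap in the TSS on every IN/OUT issued at
 * CPL 3 when IOPL < 3, and a set bit forces #GP.  Copying
 * max(prev->io_bitmap_max, next->io_bitmap_max) bytes guarantees that any
 * zero (= access allowed) bytes the previous task left in the shared TSS
 * bitmap beyond next's own range are overwritten, so next cannot inherit
 * port permissions it never asked for.
 */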
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
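
/*
 * The walk above relies on the frame-pointer layout of kernel stacks
 * (explanatory note): a sleeping task's thread.rsp points at its switch
 * frame, whose first word is the saved %rbp; each frame then stores the
 * caller's %rbp at fp[0] and the return address at fp+8.  The walk stops
 * at the first return address outside the scheduler's __sched text
 * section (that address is the "wait channel") and gives up after 16
 * frames as a cheap guard against corrupt or looping frame chains.
 */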
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
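
/*
 * User-space view of the interface above (illustrative sketch; the
 * wrapper declaration may come from the C library or be hand-rolled, and
 * my_tls_block is a hypothetical name):
 *
 *	#include <asm/prctl.h>
 *	extern int arch_prctl(int code, unsigned long addr);
 *
 *	unsigned long base;
 *	arch_prctl(ARCH_SET_GS, (unsigned long)my_tls_block);
 *	arch_prctl(ARCH_GET_GS, (unsigned long)&base);
 *
 * Note the asymmetry: ARCH_SET_* interprets addr as the new base itself,
 * while ARCH_GET_* interprets addr as a pointer the base is written
 * through (the put_user() calls above).  Per the code above, bases below
 * 4GB land in a GDT descriptor and larger ones go into the MSR.
 */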
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}
unsigned long arch_align_stack(unsigned long sp)
{
	if (randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
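
/*
 * Example (explanatory note): the stack top is lowered by up to 8191
 * bytes (8KB of entropy) and then rounded down with "& ~0xf", so the
 * result is always 16-byte aligned as the x86-64 ABI requires.  With
 * randomize_va_space disabled the pointer is only aligned, not moved.
 */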