arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

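/*
 * Per-CPU cache of the current task's user-space stack pointer, kept
 * in sync with thread.usersp across context switches (see __switch_to()).
 */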
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in the pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

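/*
 * Sanity check at thread release time: the dead task must no longer
 * own an LDT by this point.
 */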
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

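/*
 * Install a 32-bit TLS descriptor (4GB limit, user-accessible) into
 * the given slot of the task's TLS array.
 */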
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

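/*
 * Set up the child's thread state for fork/clone: copy the parent's
 * pt_regs (with ax = 0, so the child sees a zero return value),
 * duplicate the I/O permission bitmap if present, and honor
 * CLONE_SETTLS.
 */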
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
        struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        if (user_mode(regs))
                childregs->sp = sp;
        else
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->fpu_counter = 0;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

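        /* Duplicate the parent's I/O permission bitmap, if it has one. */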
        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

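/*
 * Common register setup for starting a new user task (the exec path):
 * reset the segment registers, point the registers at the new entry
 * point and stack, and drop the old extended FP state.
 */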
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        current->thread.usersp  = new_sp;
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}

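/* Start a 64-bit user task at new_ip with its stack at new_sp. */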
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - fs/gs could be tested bitsliced.
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When the previous process used a
         * 64-bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * If the user loaded a selector != 0, the saved 64-bit
                 * base no longer applies: clear it, since a non-null
                 * selector always overrides the base.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* When the next process has a 64-bit base, use it. */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

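        /*
         * Point the per-CPU kernel_stack at the new task's stack, so
         * that the next kernel entry from user space lands there.
         */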
        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: this overwrites the user's setup.  Should have two bits,
         * but 64-bit processes have always behaved this way, so it's
         * not too bad.  The main problem is just that 32-bit children
         * are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_ADDR32);

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        if (x32) {
                clear_thread_flag(TIF_IA32);
                set_thread_flag(TIF_X32);
                current->personality &= ~READ_IMPLIES_EXEC;
                /*
                 * is_compat_task() uses the presence of the x32 syscall
                 * bit flag to determine compat status.
                 */
                current_thread_info()->status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
                current_thread_info()->status |= TS_COMPAT;
        }
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

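/*
 * Walk the frame pointers of a sleeping task's kernel stack and return
 * the first return address outside the scheduler, i.e. the place where
 * the task is blocked.  Bails out after 16 frames or if a frame falls
 * outside the task's stack.
 */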
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

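/*
 * Set or query the FS/GS base of a task.  Small bases are installed
 * via a GDT TLS slot, because that is cheaper to switch; larger bases
 * go through the FS/GS base MSRs.
 */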
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT, because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for fs, but do it for symmetry
                 * with gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT, because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 so as not to
                                 * confuse __switch_to.
                                 */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

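/*
 * arch_prctl(2) system call entry point.  E.g. from user space
 * (illustrative only):
 *
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 */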
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

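/*
 * User stack pointer of a task: for ia32 tasks it is taken from
 * pt_regs, for 64-bit tasks from thread.usersp (see the old_rsp
 * handling above).
 */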
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}