2b154da0b6d35a342008ae9a0accada9a4880a9a
[linux-2.6.git] / arch / x86 / kernel / process_64.c
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *      Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *      Andi Kleen.
9  *
10  *      CPU hotplug support - ashok.raj@intel.com
11  */
12
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16
17 #include <linux/cpu.h>
18 #include <linux/errno.h>
19 #include <linux/sched.h>
20 #include <linux/fs.h>
21 #include <linux/kernel.h>
22 #include <linux/mm.h>
23 #include <linux/elfcore.h>
24 #include <linux/smp.h>
25 #include <linux/slab.h>
26 #include <linux/user.h>
27 #include <linux/interrupt.h>
28 #include <linux/delay.h>
29 #include <linux/module.h>
30 #include <linux/ptrace.h>
31 #include <linux/notifier.h>
32 #include <linux/kprobes.h>
33 #include <linux/kdebug.h>
34 #include <linux/prctl.h>
35 #include <linux/uaccess.h>
36 #include <linux/io.h>
37 #include <linux/ftrace.h>
38
39 #include <asm/pgtable.h>
40 #include <asm/processor.h>
41 #include <asm/i387.h>
42 #include <asm/fpu-internal.h>
43 #include <asm/mmu_context.h>
44 #include <asm/prctl.h>
45 #include <asm/desc.h>
46 #include <asm/proto.h>
47 #include <asm/ia32.h>
48 #include <asm/idle.h>
49 #include <asm/syscalls.h>
50 #include <asm/debugreg.h>
51 #include <asm/switch_to.h>
52
/* Declared here and defined elsewhere; not referenced by the code in this file. */
asmlinkage extern void ret_from_fork(void);

/*
 * Per-CPU user stack pointer slot.  start_thread_common() seeds it with the
 * new user stack pointer, and __switch_to() saves it into the outgoing
 * task's thread.usersp and reloads it from the incoming task's.
 */
DEFINE_PER_CPU(unsigned long, old_rsp);
56
57 /* Prints also some state that isn't saved in the pt_regs */
58 void __show_regs(struct pt_regs *regs, int all)
59 {
60         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
61         unsigned long d0, d1, d2, d3, d6, d7;
62         unsigned int fsindex, gsindex;
63         unsigned int ds, cs, es;
64
65         show_regs_common();
66         printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
67         printk_address(regs->ip, 1);
68         printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
69                         regs->sp, regs->flags);
70         printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
71                regs->ax, regs->bx, regs->cx);
72         printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
73                regs->dx, regs->si, regs->di);
74         printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
75                regs->bp, regs->r8, regs->r9);
76         printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
77                regs->r10, regs->r11, regs->r12);
78         printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
79                regs->r13, regs->r14, regs->r15);
80
81         asm("movl %%ds,%0" : "=r" (ds));
82         asm("movl %%cs,%0" : "=r" (cs));
83         asm("movl %%es,%0" : "=r" (es));
84         asm("movl %%fs,%0" : "=r" (fsindex));
85         asm("movl %%gs,%0" : "=r" (gsindex));
86
87         rdmsrl(MSR_FS_BASE, fs);
88         rdmsrl(MSR_GS_BASE, gs);
89         rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
90
91         if (!all)
92                 return;
93
94         cr0 = read_cr0();
95         cr2 = read_cr2();
96         cr3 = read_cr3();
97         cr4 = read_cr4();
98
99         printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
100                fs, fsindex, gs, gsindex, shadowgs);
101         printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
102                         es, cr0);
103         printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
104                         cr4);
105
106         get_debugreg(d0, 0);
107         get_debugreg(d1, 1);
108         get_debugreg(d2, 2);
109         printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
110         get_debugreg(d3, 3);
111         get_debugreg(d6, 6);
112         get_debugreg(d7, 7);
113         printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
114 }
115
116 void release_thread(struct task_struct *dead_task)
117 {
118         if (dead_task->mm) {
119                 if (dead_task->mm->context.size) {
120                         printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
121                                         dead_task->comm,
122                                         dead_task->mm->context.ldt,
123                                         dead_task->mm->context.size);
124                         BUG();
125                 }
126         }
127 }
128
129 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
130 {
131         struct user_desc ud = {
132                 .base_addr = addr,
133                 .limit = 0xfffff,
134                 .seg_32bit = 1,
135                 .limit_in_pages = 1,
136                 .useable = 1,
137         };
138         struct desc_struct *desc = t->thread.tls_array;
139         desc += tls;
140         fill_ldt(desc, &ud);
141 }
142
143 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
144 {
145         return get_desc_base(&t->thread.tls_array[tls]);
146 }
147
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 *
 * The only work done here is unlazy_fpu() on @tsk.
 * NOTE(review): presumably this writes any lazily-held FPU state back
 * into the task so that the copy sees it - confirm against unlazy_fpu().
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
156
157 int copy_thread(unsigned long clone_flags, unsigned long sp,
158                 unsigned long unused,
159         struct task_struct *p, struct pt_regs *regs)
160 {
161         int err;
162         struct pt_regs *childregs;
163         struct task_struct *me = current;
164
165         childregs = ((struct pt_regs *)
166                         (THREAD_SIZE + task_stack_page(p))) - 1;
167         *childregs = *regs;
168
169         childregs->ax = 0;
170         if (user_mode(regs))
171                 childregs->sp = sp;
172         else
173                 childregs->sp = (unsigned long)childregs;
174
175         p->thread.sp = (unsigned long) childregs;
176         p->thread.sp0 = (unsigned long) (childregs+1);
177         p->thread.usersp = me->thread.usersp;
178
179         set_tsk_thread_flag(p, TIF_FORK);
180
181         p->fpu_counter = 0;
182         p->thread.io_bitmap_ptr = NULL;
183
184         savesegment(gs, p->thread.gsindex);
185         p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
186         savesegment(fs, p->thread.fsindex);
187         p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
188         savesegment(es, p->thread.es);
189         savesegment(ds, p->thread.ds);
190
191         err = -ENOMEM;
192         memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
193
194         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
195                 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
196                                                   IO_BITMAP_BYTES, GFP_KERNEL);
197                 if (!p->thread.io_bitmap_ptr) {
198                         p->thread.io_bitmap_max = 0;
199                         return -ENOMEM;
200                 }
201                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
202         }
203
204         /*
205          * Set a new TLS for the child thread?
206          */
207         if (clone_flags & CLONE_SETTLS) {
208 #ifdef CONFIG_IA32_EMULATION
209                 if (test_thread_flag(TIF_IA32))
210                         err = do_set_thread_area(p, -1,
211                                 (struct user_desc __user *)childregs->si, 0);
212                 else
213 #endif
214                         err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
215                 if (err)
216                         goto out;
217         }
218         err = 0;
219 out:
220         if (err && p->thread.io_bitmap_ptr) {
221                 kfree(p->thread.io_bitmap_ptr);
222                 p->thread.io_bitmap_max = 0;
223         }
224
225         return err;
226 }
227
228 static void
229 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
230                     unsigned long new_sp,
231                     unsigned int _cs, unsigned int _ss, unsigned int _ds)
232 {
233         loadsegment(fs, 0);
234         loadsegment(es, _ds);
235         loadsegment(ds, _ds);
236         load_gs_index(0);
237         current->thread.usersp  = new_sp;
238         regs->ip                = new_ip;
239         regs->sp                = new_sp;
240         percpu_write(old_rsp, new_sp);
241         regs->cs                = _cs;
242         regs->ss                = _ss;
243         regs->flags             = X86_EFLAGS_IF;
244         /*
245          * Free the old FP and other extended state
246          */
247         free_thread_xstate(current);
248 }
249
/*
 * Start a new 64-bit user image in the current task at @new_ip with stack
 * @new_sp.  Uses __USER_CS / __USER_DS for the frame's cs / ss and passes
 * 0 as the selector for %ds and %es.
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}
256
#ifdef CONFIG_IA32_EMULATION
/*
 * 32-bit counterpart of start_thread(): uses __USER32_CS for cs and
 * __USER32_DS for ss, %ds and %es.  Only built with CONFIG_IA32_EMULATION.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif
264
/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * Switches the architecture state handled in this file from @prev_p to
 * @next_p: FPU hand-over, the TSS sp0, the data segment registers, TLS
 * descriptors, the FS/GS selectors and 64-bit bases, and the per-CPU
 * old_rsp / current_task / kernel_stack values.  Returns @prev_p.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

        /* First half of the FPU hand-over; completed by switch_fpu_finish(). */
        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /*
         * Reload sp0 in this CPU's TSS for the next task.
         * (The LDT and the page table pointer are not touched here;
         * only load_sp0() is called.)
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         *
         * The reload is skipped when both the old and the new selector
         * are zero.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * Segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When prev process used 64bit
         * base always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 *  clear 64bit base, since overloaded base is always
                 *  mapped to the Null selector
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when next process has a 64bit base use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        /* Remember the selector that was live when prev was switched out. */
        prev->fsindex = fsindex;

        /*
         * Same scheme for GS, except that the selector goes through
         * load_gs_index() and the 64-bit base is written to
         * MSR_KERNEL_GS_BASE.
         */
        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /* Second half of the FPU hand-over started above. */
        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the per-CPU bookkeeping: stash the outgoing task's user
         * stack pointer, install the incoming one, and publish the new
         * current task and the top of its kernel stack.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         * (only when either task has one of the context-switch work
         * flags set).
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}
377
378 void set_personality_64bit(void)
379 {
380         /* inherit personality from parent */
381
382         /* Make sure to be in 64bit mode */
383         clear_thread_flag(TIF_IA32);
384
385         /* Ensure the corresponding mm is not marked. */
386         if (current->mm)
387                 current->mm->context.ia32_compat = 0;
388
389         /* TBD: overwrites user setup. Should have two bits.
390            But 64bit processes have always behaved this way,
391            so it's not too bad. The main problem is just that
392            32bit childs are affected again. */
393         current->personality &= ~READ_IMPLIES_EXEC;
394 }
395
396 void set_personality_ia32(void)
397 {
398         /* inherit personality from parent */
399
400         /* Make sure to be in 32bit mode */
401         set_thread_flag(TIF_IA32);
402         current->personality |= force_personality32;
403
404         /* Mark the associated mm as containing 32-bit tasks. */
405         if (current->mm)
406                 current->mm->context.ia32_compat = 1;
407
408         /* Prepare the first "return" to user space */
409         current_thread_info()->status |= TS_COMPAT;
410 }
411
412 unsigned long get_wchan(struct task_struct *p)
413 {
414         unsigned long stack;
415         u64 fp, ip;
416         int count = 0;
417
418         if (!p || p == current || p->state == TASK_RUNNING)
419                 return 0;
420         stack = (unsigned long)task_stack_page(p);
421         if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
422                 return 0;
423         fp = *(u64 *)(p->thread.sp);
424         do {
425                 if (fp < (unsigned long)stack ||
426                     fp >= (unsigned long)stack+THREAD_SIZE)
427                         return 0;
428                 ip = *(u64 *)(fp+8);
429                 if (!in_sched_functions(ip))
430                         return ip;
431                 fp = *(u64 *)fp;
432         } while (count++ < 16);
433         return 0;
434 }
435
436 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
437 {
438         int ret = 0;
439         int doit = task == current;
440         int cpu;
441
442         switch (code) {
443         case ARCH_SET_GS:
444                 if (addr >= TASK_SIZE_OF(task))
445                         return -EPERM;
446                 cpu = get_cpu();
447                 /* handle small bases via the GDT because that's faster to
448                    switch. */
449                 if (addr <= 0xffffffff) {
450                         set_32bit_tls(task, GS_TLS, addr);
451                         if (doit) {
452                                 load_TLS(&task->thread, cpu);
453                                 load_gs_index(GS_TLS_SEL);
454                         }
455                         task->thread.gsindex = GS_TLS_SEL;
456                         task->thread.gs = 0;
457                 } else {
458                         task->thread.gsindex = 0;
459                         task->thread.gs = addr;
460                         if (doit) {
461                                 load_gs_index(0);
462                                 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
463                         }
464                 }
465                 put_cpu();
466                 break;
467         case ARCH_SET_FS:
468                 /* Not strictly needed for fs, but do it for symmetry
469                    with gs */
470                 if (addr >= TASK_SIZE_OF(task))
471                         return -EPERM;
472                 cpu = get_cpu();
473                 /* handle small bases via the GDT because that's faster to
474                    switch. */
475                 if (addr <= 0xffffffff) {
476                         set_32bit_tls(task, FS_TLS, addr);
477                         if (doit) {
478                                 load_TLS(&task->thread, cpu);
479                                 loadsegment(fs, FS_TLS_SEL);
480                         }
481                         task->thread.fsindex = FS_TLS_SEL;
482                         task->thread.fs = 0;
483                 } else {
484                         task->thread.fsindex = 0;
485                         task->thread.fs = addr;
486                         if (doit) {
487                                 /* set the selector to 0 to not confuse
488                                    __switch_to */
489                                 loadsegment(fs, 0);
490                                 ret = checking_wrmsrl(MSR_FS_BASE, addr);
491                         }
492                 }
493                 put_cpu();
494                 break;
495         case ARCH_GET_FS: {
496                 unsigned long base;
497                 if (task->thread.fsindex == FS_TLS_SEL)
498                         base = read_32bit_tls(task, FS_TLS);
499                 else if (doit)
500                         rdmsrl(MSR_FS_BASE, base);
501                 else
502                         base = task->thread.fs;
503                 ret = put_user(base, (unsigned long __user *)addr);
504                 break;
505         }
506         case ARCH_GET_GS: {
507                 unsigned long base;
508                 unsigned gsindex;
509                 if (task->thread.gsindex == GS_TLS_SEL)
510                         base = read_32bit_tls(task, GS_TLS);
511                 else if (doit) {
512                         savesegment(gs, gsindex);
513                         if (gsindex)
514                                 rdmsrl(MSR_KERNEL_GS_BASE, base);
515                         else
516                                 base = task->thread.gs;
517                 } else
518                         base = task->thread.gs;
519                 ret = put_user(base, (unsigned long __user *)addr);
520                 break;
521         }
522
523         default:
524                 ret = -EINVAL;
525                 break;
526         }
527
528         return ret;
529 }
530
/*
 * arch_prctl entry point: applies do_arch_prctl() to the current task and
 * returns its result unchanged.
 */
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
535
536 unsigned long KSTK_ESP(struct task_struct *task)
537 {
538         return (test_tsk_thread_flag(task, TIF_IA32)) ?
539                         (task_pt_regs(task)->sp) : ((task)->thread.usersp);
540 }