0f49677da51e62afe9b14f6967ded036e098a578
[linux-3.10.git] / arch / x86 / kernel / process_64.c
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *      Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *      Andi Kleen.
9  *
10  *      CPU hotplug support - ashok.raj@intel.com
11  */
12
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16
17 #include <linux/cpu.h>
18 #include <linux/errno.h>
19 #include <linux/sched.h>
20 #include <linux/fs.h>
21 #include <linux/kernel.h>
22 #include <linux/mm.h>
23 #include <linux/elfcore.h>
24 #include <linux/smp.h>
25 #include <linux/slab.h>
26 #include <linux/user.h>
27 #include <linux/interrupt.h>
28 #include <linux/delay.h>
29 #include <linux/module.h>
30 #include <linux/ptrace.h>
31 #include <linux/notifier.h>
32 #include <linux/kprobes.h>
33 #include <linux/kdebug.h>
34 #include <linux/prctl.h>
35 #include <linux/uaccess.h>
36 #include <linux/io.h>
37 #include <linux/ftrace.h>
38
39 #include <asm/pgtable.h>
40 #include <asm/processor.h>
41 #include <asm/i387.h>
42 #include <asm/fpu-internal.h>
43 #include <asm/mmu_context.h>
44 #include <asm/prctl.h>
45 #include <asm/desc.h>
46 #include <asm/proto.h>
47 #include <asm/ia32.h>
48 #include <asm/idle.h>
49 #include <asm/syscalls.h>
50 #include <asm/debugreg.h>
51 #include <asm/switch_to.h>
52
/* Declared here for use by this file; the definition lives elsewhere. */
asmlinkage extern void ret_from_fork(void);

/*
 * Per-CPU copy of the running task's user stack pointer.  It is seeded
 * with the new stack pointer in start_thread_common() and is saved to /
 * reloaded from thread.usersp on every __switch_to().
 */
DEFINE_PER_CPU(unsigned long, old_rsp);
56
57 /* Prints also some state that isn't saved in the pt_regs */
58 void __show_regs(struct pt_regs *regs, int all)
59 {
60         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
61         unsigned long d0, d1, d2, d3, d6, d7;
62         unsigned int fsindex, gsindex;
63         unsigned int ds, cs, es;
64
65         show_regs_common();
66         printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
67         printk_address(regs->ip, 1);
68         printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
69                         regs->sp, regs->flags);
70         printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
71                regs->ax, regs->bx, regs->cx);
72         printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
73                regs->dx, regs->si, regs->di);
74         printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
75                regs->bp, regs->r8, regs->r9);
76         printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
77                regs->r10, regs->r11, regs->r12);
78         printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
79                regs->r13, regs->r14, regs->r15);
80
81         asm("movl %%ds,%0" : "=r" (ds));
82         asm("movl %%cs,%0" : "=r" (cs));
83         asm("movl %%es,%0" : "=r" (es));
84         asm("movl %%fs,%0" : "=r" (fsindex));
85         asm("movl %%gs,%0" : "=r" (gsindex));
86
87         rdmsrl(MSR_FS_BASE, fs);
88         rdmsrl(MSR_GS_BASE, gs);
89         rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
90
91         if (!all)
92                 return;
93
94         cr0 = read_cr0();
95         cr2 = read_cr2();
96         cr3 = read_cr3();
97         cr4 = read_cr4();
98
99         printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
100                fs, fsindex, gs, gsindex, shadowgs);
101         printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
102                         es, cr0);
103         printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
104                         cr4);
105
106         get_debugreg(d0, 0);
107         get_debugreg(d1, 1);
108         get_debugreg(d2, 2);
109         printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
110         get_debugreg(d3, 3);
111         get_debugreg(d6, 6);
112         get_debugreg(d7, 7);
113         printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
114 }
115
116 void release_thread(struct task_struct *dead_task)
117 {
118         if (dead_task->mm) {
119                 if (dead_task->mm->context.size) {
120                         pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
121                                 dead_task->comm,
122                                 dead_task->mm->context.ldt,
123                                 dead_task->mm->context.size);
124                         BUG();
125                 }
126         }
127 }
128
129 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
130 {
131         struct user_desc ud = {
132                 .base_addr = addr,
133                 .limit = 0xfffff,
134                 .seg_32bit = 1,
135                 .limit_in_pages = 1,
136                 .useable = 1,
137         };
138         struct desc_struct *desc = t->thread.tls_array;
139         desc += tls;
140         fill_ldt(desc, &ud);
141 }
142
143 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
144 {
145         return get_desc_base(&t->thread.tls_array[tls]);
146 }
147
148 int copy_thread(unsigned long clone_flags, unsigned long sp,
149                 unsigned long arg, struct task_struct *p)
150 {
151         int err;
152         struct pt_regs *childregs;
153         struct task_struct *me = current;
154
155         p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
156         childregs = task_pt_regs(p);
157         p->thread.sp = (unsigned long) childregs;
158         p->thread.usersp = me->thread.usersp;
159         set_tsk_thread_flag(p, TIF_FORK);
160         p->fpu_counter = 0;
161         p->thread.io_bitmap_ptr = NULL;
162
163         savesegment(gs, p->thread.gsindex);
164         p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
165         savesegment(fs, p->thread.fsindex);
166         p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
167         savesegment(es, p->thread.es);
168         savesegment(ds, p->thread.ds);
169         memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
170
171         if (unlikely(p->flags & PF_KTHREAD)) {
172                 /* kernel thread */
173                 memset(childregs, 0, sizeof(struct pt_regs));
174                 childregs->sp = (unsigned long)childregs;
175                 childregs->ss = __KERNEL_DS;
176                 childregs->bx = sp; /* function */
177                 childregs->bp = arg;
178                 childregs->orig_ax = -1;
179                 childregs->cs = __KERNEL_CS | get_kernel_rpl();
180                 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
181                 return 0;
182         }
183         *childregs = *current_pt_regs();
184
185         childregs->ax = 0;
186         if (sp)
187                 childregs->sp = sp;
188
189         err = -ENOMEM;
190         memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
191
192         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
193                 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
194                                                   IO_BITMAP_BYTES, GFP_KERNEL);
195                 if (!p->thread.io_bitmap_ptr) {
196                         p->thread.io_bitmap_max = 0;
197                         return -ENOMEM;
198                 }
199                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
200         }
201
202         /*
203          * Set a new TLS for the child thread?
204          */
205         if (clone_flags & CLONE_SETTLS) {
206 #ifdef CONFIG_IA32_EMULATION
207                 if (test_thread_flag(TIF_IA32))
208                         err = do_set_thread_area(p, -1,
209                                 (struct user_desc __user *)childregs->si, 0);
210                 else
211 #endif
212                         err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
213                 if (err)
214                         goto out;
215         }
216         err = 0;
217 out:
218         if (err && p->thread.io_bitmap_ptr) {
219                 kfree(p->thread.io_bitmap_ptr);
220                 p->thread.io_bitmap_max = 0;
221         }
222
223         return err;
224 }
225
226 static void
227 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
228                     unsigned long new_sp,
229                     unsigned int _cs, unsigned int _ss, unsigned int _ds)
230 {
231         loadsegment(fs, 0);
232         loadsegment(es, _ds);
233         loadsegment(ds, _ds);
234         load_gs_index(0);
235         current->thread.usersp  = new_sp;
236         regs->ip                = new_ip;
237         regs->sp                = new_sp;
238         this_cpu_write(old_rsp, new_sp);
239         regs->cs                = _cs;
240         regs->ss                = _ss;
241         regs->flags             = X86_EFLAGS_IF;
242 }
243
244 void
245 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
246 {
247         start_thread_common(regs, new_ip, new_sp,
248                             __USER_CS, __USER_DS, 0);
249 }
250
#ifdef CONFIG_IA32_EMULATION
/*
 * Start a compat image at @new_ip with stack @new_sp.  Tasks flagged
 * TIF_X32 get __USER_CS, all others get __USER32_CS; the data and stack
 * segments are __USER_DS in both cases.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	unsigned int code_sel;

	if (test_thread_flag(TIF_X32))
		code_sel = __USER_CS;
	else
		code_sel = __USER32_CS;

	start_thread_common(regs, new_ip, new_sp,
			    code_sel, __USER_DS, __USER_DS);
}
#endif
260
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * @prev_p: task being switched away from
 * @next_p: task being switched to
 *
 * Returns @prev_p.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	/* The FPU switch is split in two; it is completed further down. */
	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Load next's sp0 for this CPU's TSS.
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 * A reload is skipped only when both the outgoing and the incoming
	 * selector are zero.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	/* Remember the selector prev was actually running with. */
	prev->fsindex = fsindex;

	/* Same sequence for GS as for FS above. */
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	/*
	 * NOTE(review): next's 64bit GS base goes to MSR_KERNEL_GS_BASE, not
	 * MSR_GS_BASE - presumably because the kernel runs with the two GS
	 * bases swapped; confirm against the entry code.
	 */
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Complete the FPU switch begun by switch_fpu_prepare(). */
	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the per-CPU state: stash the user stack pointer of prev,
	 * install the one of next, and publish next as the current task.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	/* Per-CPU kernel stack pointer: KERNEL_STACK_OFFSET below the top. */
	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}
373
374 void set_personality_64bit(void)
375 {
376         /* inherit personality from parent */
377
378         /* Make sure to be in 64bit mode */
379         clear_thread_flag(TIF_IA32);
380         clear_thread_flag(TIF_ADDR32);
381         clear_thread_flag(TIF_X32);
382
383         /* Ensure the corresponding mm is not marked. */
384         if (current->mm)
385                 current->mm->context.ia32_compat = 0;
386
387         /* TBD: overwrites user setup. Should have two bits.
388            But 64bit processes have always behaved this way,
389            so it's not too bad. The main problem is just that
390            32bit childs are affected again. */
391         current->personality &= ~READ_IMPLIES_EXEC;
392 }
393
394 void set_personality_ia32(bool x32)
395 {
396         /* inherit personality from parent */
397
398         /* Make sure to be in 32bit mode */
399         set_thread_flag(TIF_ADDR32);
400
401         /* Mark the associated mm as containing 32-bit tasks. */
402         if (current->mm)
403                 current->mm->context.ia32_compat = 1;
404
405         if (x32) {
406                 clear_thread_flag(TIF_IA32);
407                 set_thread_flag(TIF_X32);
408                 current->personality &= ~READ_IMPLIES_EXEC;
409                 /* is_compat_task() uses the presence of the x32
410                    syscall bit flag to determine compat status */
411                 current_thread_info()->status &= ~TS_COMPAT;
412         } else {
413                 set_thread_flag(TIF_IA32);
414                 clear_thread_flag(TIF_X32);
415                 current->personality |= force_personality32;
416                 /* Prepare the first "return" to user space */
417                 current_thread_info()->status |= TS_COMPAT;
418         }
419 }
420 EXPORT_SYMBOL_GPL(set_personality_ia32);
421
422 unsigned long get_wchan(struct task_struct *p)
423 {
424         unsigned long stack;
425         u64 fp, ip;
426         int count = 0;
427
428         if (!p || p == current || p->state == TASK_RUNNING)
429                 return 0;
430         stack = (unsigned long)task_stack_page(p);
431         if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
432                 return 0;
433         fp = *(u64 *)(p->thread.sp);
434         do {
435                 if (fp < (unsigned long)stack ||
436                     fp >= (unsigned long)stack+THREAD_SIZE)
437                         return 0;
438                 ip = *(u64 *)(fp+8);
439                 if (!in_sched_functions(ip))
440                         return ip;
441                 fp = *(u64 *)fp;
442         } while (count++ < 16);
443         return 0;
444 }
445
446 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
447 {
448         int ret = 0;
449         int doit = task == current;
450         int cpu;
451
452         switch (code) {
453         case ARCH_SET_GS:
454                 if (addr >= TASK_SIZE_OF(task))
455                         return -EPERM;
456                 cpu = get_cpu();
457                 /* handle small bases via the GDT because that's faster to
458                    switch. */
459                 if (addr <= 0xffffffff) {
460                         set_32bit_tls(task, GS_TLS, addr);
461                         if (doit) {
462                                 load_TLS(&task->thread, cpu);
463                                 load_gs_index(GS_TLS_SEL);
464                         }
465                         task->thread.gsindex = GS_TLS_SEL;
466                         task->thread.gs = 0;
467                 } else {
468                         task->thread.gsindex = 0;
469                         task->thread.gs = addr;
470                         if (doit) {
471                                 load_gs_index(0);
472                                 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
473                         }
474                 }
475                 put_cpu();
476                 break;
477         case ARCH_SET_FS:
478                 /* Not strictly needed for fs, but do it for symmetry
479                    with gs */
480                 if (addr >= TASK_SIZE_OF(task))
481                         return -EPERM;
482                 cpu = get_cpu();
483                 /* handle small bases via the GDT because that's faster to
484                    switch. */
485                 if (addr <= 0xffffffff) {
486                         set_32bit_tls(task, FS_TLS, addr);
487                         if (doit) {
488                                 load_TLS(&task->thread, cpu);
489                                 loadsegment(fs, FS_TLS_SEL);
490                         }
491                         task->thread.fsindex = FS_TLS_SEL;
492                         task->thread.fs = 0;
493                 } else {
494                         task->thread.fsindex = 0;
495                         task->thread.fs = addr;
496                         if (doit) {
497                                 /* set the selector to 0 to not confuse
498                                    __switch_to */
499                                 loadsegment(fs, 0);
500                                 ret = wrmsrl_safe(MSR_FS_BASE, addr);
501                         }
502                 }
503                 put_cpu();
504                 break;
505         case ARCH_GET_FS: {
506                 unsigned long base;
507                 if (task->thread.fsindex == FS_TLS_SEL)
508                         base = read_32bit_tls(task, FS_TLS);
509                 else if (doit)
510                         rdmsrl(MSR_FS_BASE, base);
511                 else
512                         base = task->thread.fs;
513                 ret = put_user(base, (unsigned long __user *)addr);
514                 break;
515         }
516         case ARCH_GET_GS: {
517                 unsigned long base;
518                 unsigned gsindex;
519                 if (task->thread.gsindex == GS_TLS_SEL)
520                         base = read_32bit_tls(task, GS_TLS);
521                 else if (doit) {
522                         savesegment(gs, gsindex);
523                         if (gsindex)
524                                 rdmsrl(MSR_KERNEL_GS_BASE, base);
525                         else
526                                 base = task->thread.gs;
527                 } else
528                         base = task->thread.gs;
529                 ret = put_user(base, (unsigned long __user *)addr);
530                 break;
531         }
532
533         default:
534                 ret = -EINVAL;
535                 break;
536         }
537
538         return ret;
539 }
540
541 long sys_arch_prctl(int code, unsigned long addr)
542 {
543         return do_arch_prctl(current, code, addr);
544 }
545
546 unsigned long KSTK_ESP(struct task_struct *task)
547 {
548         return (test_tsk_thread_flag(task, TIF_IA32)) ?
549                         (task_pt_regs(task)->sp) : ((task)->thread.usersp);
550 }