blob: 180f4c0fcbc4125690cee81a991f24360487e695 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
Ashok Raj76e4f662005-06-25 14:55:00 -070011 *
12 * CPU hotplug support - ashok.raj@intel.com
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
Ashok Raj76e4f662005-06-25 14:55:00 -070021#include <linux/cpu.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/elfcore.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/user.h>
30#include <linux/module.h>
31#include <linux/a.out.h>
32#include <linux/interrupt.h>
33#include <linux/delay.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
Andi Kleen95833c82006-01-11 22:44:36 +010037#include <linux/notifier.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080038#include <linux/kprobes.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070039#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070040
41#include <asm/uaccess.h>
42#include <asm/pgtable.h>
43#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h>
46#include <asm/i387.h>
47#include <asm/mmu_context.h>
48#include <asm/pda.h>
49#include <asm/prctl.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070050#include <asm/desc.h>
51#include <asm/proto.h>
52#include <asm/ia32.h>
Andi Kleen95833c82006-01-11 22:44:36 +010053#include <asm/idle.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
/* Entry point for a newly forked task (defined in entry.S). */
asmlinkage extern void ret_from_fork(void);

/* Default clone flags used for kernel threads. */
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/* Non-zero once "idle=" on the command line overrode the idle routine. */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

/* Per-CPU handshake flag used by cpu_idle_wait() and cleared in cpu_idle(). */
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

/* Chain of callbacks fired on idle entry (IDLE_START) and exit (IDLE_END). */
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
Andi Kleen95833c82006-01-11 22:44:36 +010070
/* Register a callback to be run when this CPU enters or leaves idle. */
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
76
/* Remove a previously registered idle notifier. */
void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
/*
 * NOTE(review): the register side above is EXPORT_SYMBOL_GPL but this is
 * plain EXPORT_SYMBOL — presumably these should match; confirm intent.
 */
EXPORT_SYMBOL(idle_notifier_unregister);
82
/* Mark this CPU idle in the PDA and notify IDLE_START listeners. */
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
88
/*
 * Leave the idle state: atomically clear the PDA isidle flag and fire
 * IDLE_END.  The test_and_clear makes repeated calls within one idle
 * period harmless no-ops.
 */
static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
95
96/* Called from interrupts to signify idle end */
97void exit_idle(void)
98{
Andi Kleena15da492006-09-26 10:52:40 +020099 /* idle loop has pid 0 */
100 if (current->pid)
Andi Kleen95833c82006-01-11 22:44:36 +0100101 return;
102 __exit_idle();
103}
104
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	/* Stop advertising that a plain flag write is enough to wake us. */
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		/* Enables interrupts one instruction before HLT.
		   x86 special cases this so there is no race. */
		safe_halt();
	} else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}
126
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	/* Re-enable interrupts (cpu_idle disabled them) and spin once;
	   the outer cpu_idle() loop re-tests need_resched(). */
	local_irq_enable();
	cpu_relax();
}
137
/*
 * Wait until every online CPU has passed through its idle loop at least
 * once (each idle loop clears its per-CPU cpu_idle_state flag, see
 * cpu_idle()).  Polls once per second until all flags are clear.
 */
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	/* Pin ourselves to the current CPU while we poll the others. */
	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	/* Raise the flag on every online CPU and remember them in 'map'. */
	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	/* This CPU is busy running us, so count it as done already. */
	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		/* Drop CPUs that went offline while we waited. */
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));

	/* Restore the caller's original CPU affinity. */
	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
168
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();	/* flush caches before this CPU goes away */
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/* Halt forever with interrupts off; only hotplug brings us back. */
	local_irq_disable();
	while (1)
		halt();
}
#else
/* Without CPU hotplug an offline CPU must never reach play_dead(). */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
192
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			/* Acknowledge a pending cpu_idle_wait() handshake. */
			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			check_pgt_cache();
			/* Order the flag clear above against reading pm_idle. */
			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
236
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		/* Arm the monitor on the thread flags word ... */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		/* ... and re-check to close the race before sleeping. */
		if (!need_resched())
			__mwait(eax, ecx);
	}
}
256
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			/* mwait with interrupts re-enabled (sti;mwait pair) */
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		/* Caller (cpu_idle) disabled interrupts; restore them. */
		local_irq_enable();
	}
}
271
Ashok Raje6982c62005-06-25 14:54:58 -0700272void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273{
274 static int printed;
275 if (cpu_has(c, X86_FEATURE_MWAIT)) {
276 /*
277 * Skip, if setup has overridden idle.
278 * One CPU supports mwait => All CPUs supports mwait
279 */
280 if (!pm_idle) {
281 if (!printed) {
282 printk("using mwait in idle threads.\n");
283 printed = 1;
284 }
285 pm_idle = mwait_idle;
286 }
287 }
288}
289
290static int __init idle_setup (char *str)
291{
Andi Kleenf039b752007-05-02 19:27:12 +0200292 if (!strcmp(str, "poll")) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293 printk("using polling idle threads.\n");
294 pm_idle = poll_idle;
Andi Kleenf039b752007-05-02 19:27:12 +0200295 } else if (!strcmp(str, "mwait"))
296 force_mwait = 1;
297 else
298 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299
300 boot_option_idle_override = 1;
Andi Kleenf039b752007-05-02 19:27:12 +0200301 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302}
Andi Kleenf039b752007-05-02 19:27:12 +0200303early_param("idle", idle_setup);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Segment selectors are only reachable via inline asm. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* The 64bit fs/gs base addresses live in MSRs, not pt_regs. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
354
/* Print registers plus a stack trace starting just above the pt_regs. */
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}
361
362/*
363 * Free current thread data structures etc..
364 */
365void exit_thread(void)
366{
367 struct task_struct *me = current;
368 struct thread_struct *t = &me->thread;
Rusty Lynch73649da2005-06-23 00:09:23 -0700369
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370 if (me->thread.io_bitmap_ptr) {
371 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
372
373 kfree(t->io_bitmap_ptr);
374 t->io_bitmap_ptr = NULL;
Stephane Eraniand3a4f482006-09-26 10:52:28 +0200375 clear_thread_flag(TIF_IO_BITMAP);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376 /*
377 * Careful, clear this in the TSS too:
378 */
379 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
380 t->io_bitmap_max = 0;
381 put_cpu();
382 }
383}
384
/*
 * Reset per-thread state for exec: complete a pending 32/64bit ABI
 * switch, and clear debug registers, TLS slots and FPU state.
 */
void flush_thread(void)
{
	struct task_struct *tsk = current;

	/* Toggle between the 64bit and ia32 ABIs if exec requested it. */
	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
413
414void release_thread(struct task_struct *dead_task)
415{
416 if (dead_task->mm) {
417 if (dead_task->mm->context.size) {
418 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
419 dead_task->comm,
420 dead_task->mm->context.ldt,
421 dead_task->mm->context.size);
422 BUG();
423 }
424 }
425}
426
/*
 * Install a 32bit TLS descriptor (4K-granular limit, 32bit, usable)
 * with the given base address into TLS slot 'tls' of task 't'.
 */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}
441
442static inline u32 read_32bit_tls(struct task_struct *t, int tls)
443{
444 struct desc_struct *desc = (void *)t->thread.tls_array;
445 desc += tls;
446 return desc->base0 |
447 (((u32)desc->base1) << 16) |
448 (((u32)desc->base2) << 24);
449}
450
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush lazy FPU state to memory so copy_thread copies it. */
	unlazy_fpu(tsk);
}
459
/*
 * Set up the kernel stack, registers, segment state and (optionally)
 * I/O bitmap and TLS of a newly forked child task 'p'.
 * Returns 0 on success or -ENOMEM/arch_prctl error; on error any
 * partially allocated I/O bitmap is freed.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* The child's pt_regs sit at the very top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;	/* the child sees fork() return 0 */
	childregs->rsp = rsp;
	if (rsp == ~0UL)	/* kernel thread: run on the kernel stack */
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	/* Snapshot the parent's current segment selectors. */
	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	/* Duplicate the parent's I/O permission bitmap, if it has one. */
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
523
524/*
525 * This special macro can be used to load a debugging register
526 */
Jan Beulich2b514e72006-03-25 16:29:22 +0100527#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700528
Stephane Eraniand3a4f482006-09-26 10:52:28 +0200529static inline void __switch_to_xtra(struct task_struct *prev_p,
530 struct task_struct *next_p,
531 struct tss_struct *tss)
532{
533 struct thread_struct *prev, *next;
534
535 prev = &prev_p->thread,
536 next = &next_p->thread;
537
538 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
539 loaddebug(next, 0);
540 loaddebug(next, 1);
541 loaddebug(next, 2);
542 loaddebug(next, 3);
543 /* no 4 and 5 */
544 loaddebug(next, 6);
545 loaddebug(next, 7);
546 }
547
548 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
549 /*
550 * Copy the relevant range of the IO bitmap.
551 * Normally this is 128 bytes or less:
552 */
553 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
554 max(prev->io_bitmap_max, next->io_bitmap_max));
555 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
556 /*
557 * Clear any possible leftover bits:
558 */
559 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
560 }
561}
562
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		/* Same dance as FS above, but via the KERNEL_GS_BASE MSR. */
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
	return prev_p;
}
676
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		/* exec succeeded: drop the pending-single-step ptrace flag */
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
700
/* Put the current task into plain 64bit execution mode on exec. */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit childs are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
714
/* x86-64 fork entry: the child's stack pointer comes from pt_regs. */
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
719
/* x86-64 clone entry point. */
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	/* A zero stack means "reuse the caller's current stack pointer". */
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
728
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
744
/*
 * Walk the saved frame pointers of a sleeping task to find the first
 * return address outside the scheduler — i.e. where it is waiting.
 * Returns 0 if the task is running, its stack looks invalid, or no
 * such address is found within the frame limit.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp,rip;
	int count = 0;

	if (!p || p == current || p->state==TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	/* Sanity check: saved rsp must lie inside the task's stack page. */
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		/* Stop if a frame pointer escapes the stack bounds. */
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);	/* return address above saved rbp */
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;	/* follow the frame-pointer chain */
	} while (count++ < 16);
	return 0;
}
768
/*
 * Implement ARCH_SET_FS/GS and ARCH_GET_FS/GS for 'task': manage the
 * 64bit fs/gs base addresses.  Bases that fit in 32 bits go through a
 * GDT TLS slot (cheaper to switch); larger bases use the base MSRs.
 * For the GET cases 'addr' is a user pointer receiving the base.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* a live non-zero selector means the MSR is current */
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
864
/* System call entry: apply arch_prctl to the calling task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
869
870/*
871 * Capture the user space registers if the task is not running (in user space)
872 */
873int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
874{
875 struct pt_regs *pp, ptregs;
876
Al Virobb049232006-01-12 01:05:38 -0800877 pp = task_pt_regs(tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700878
879 ptregs = *pp;
880 ptregs.cs &= 0xffff;
881 ptregs.ss &= 0xffff;
882
883 elf_core_copy_regs(regs, &ptregs);
884
885 return 1;
886}
887
888unsigned long arch_align_stack(unsigned long sp)
889{
Andi Kleenc16b63e2006-09-26 10:52:28 +0200890 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700891 sp -= get_random_int() % 8192;
892 return sp & ~0xf;
893}