/*
 * linux/arch/x86-64/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

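/*
 * Example of use (illustrative sketch, not part of this file): a
 * driver interested in idle transitions could register a callback
 * like the hypothetical one below.  IDLE_START and IDLE_END are the
 * events passed by enter_idle()/__exit_idle().
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *data)
 *      {
 *              return (action == IDLE_START || action == IDLE_END)
 *                      ? NOTIFY_OK : NOTIFY_DONE;
 *      }
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */
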
void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        /*
         * TS_POLLING-cleared state must be visible before we
         * test NEED_RESCHED:
         */
        smp_mb();
        local_irq_disable();
        if (!need_resched()) {
                /* Enables interrupts one instruction before HLT.
                   x86 special cases this so there is no race. */
                safe_halt();
        } else
                local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        local_irq_enable();
        cpu_relax();
}

void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                                        !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

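/*
 * Intended use of cpu_idle_wait() (sketch; the caller shown is
 * hypothetical): code that installs a new idle handler should call it
 * so that no CPU is still executing inside the old one.
 *
 *      pm_idle = my_new_idle;
 *      cpu_idle_wait();
 *
 * It works by setting cpu_idle_state on every online CPU and waiting
 * until each CPU's idle loop (see cpu_idle() below) has observed and
 * cleared its flag.
 */
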
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
        idle_task_exit();
        wbinvd();
        mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                halt();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        check_pgt_cache();
                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        idle();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(eax, ecx);
        }
}

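/*
 * Sketch of a call with hints (illustrative only; cstate_hint is a
 * placeholder value): eax carries the MWAIT hint that selects the
 * target C-state, and ecx bit 0, where the CPU supports it, asks for
 * wakeup on an interrupt even while interrupts are masked.
 *
 *      mwait_idle_with_hints(cstate_hint, 1);
 */
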
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
        } else {
                local_irq_enable();
        }
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => all CPUs support mwait
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk(KERN_INFO "using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}

static int __init idle_setup(char *str)
{
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        } else if (!strcmp(str, "mwait"))
                force_mwait = 1;
        else
                return -1;

        boot_option_idle_override = 1;
        return 0;
}
early_param("idle", idle_setup);

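/*
 * Corresponding kernel command line usage, matching the strcmp()s
 * above:
 *
 *      idle=poll       always poll need_resched instead of halting
 *      idle=mwait      force use of MWAIT-based idle
 */
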
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}

void flush_thread(void)
{
        struct task_struct *tsk = current;

        if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
                clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
                if (test_tsk_thread_flag(tsk, TIF_IA32)) {
                        clear_tsk_thread_flag(tsk, TIF_IA32);
                } else {
                        set_tsk_thread_flag(tsk, TIF_IA32);
                        current_thread_info()->status |= TS_COMPAT;
                }
        }
        clear_tsk_thread_flag(tsk, TIF_DEBUG);

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)

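/*
 * The token pasting means, for example, that loaddebug(next, 7)
 * expands to set_debugreg(next->debugreg7, 7).
 */
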
static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p,
                                    struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(&next->i387.fxsave);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("mov %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        load_TLS(next, cpu);

        /*
         * Switch FS and GS.
         */
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /* segment register != 0 always requires a reload.
                   also reload when it has changed.
                   when prev process used 64bit base always reload
                   to avoid an information leak. */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /* check if the user used a selector != 0
                         * if yes clear 64bit base, since overloaded base
                         * is always mapped to the Null selector
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when next process has a 64bit base use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);

        write_pda(kernelstack,
        (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
         */
        BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
            || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
                __switch_to_xtra(prev_p, next_p, tss);

        /* If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        if (next_p->fpu_counter > 5)
                math_state_restore();
        return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

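/*
 * The walk above assumes the classic frame-pointer chain on the
 * sleeping task's stack:
 *
 *      fp      -> saved frame pointer (next fp)
 *      fp+8    -> return address (wchan candidate)
 *
 * It follows at most 16 frames and returns the first return address
 * that is not inside the scheduler.
 */
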
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                }
                else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

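/*
 * User space reaches this through the arch_prctl(2) system call.  A
 * minimal user-side sketch (assuming <asm/prctl.h> and syscall(2);
 * the variable names are placeholders):
 *
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *      syscall(SYS_arch_prctl, ARCH_SET_GS, new_base);
 */
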
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}

unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}