/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

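/*
 * Default clone flags for kernel_thread(): share the caller's address
 * space (CLONE_VM) and keep the new thread invisible to ptrace
 * (CLONE_UNTRACED).
 */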
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

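/* Notifier chain run on every idle entry (IDLE_START) and exit (IDLE_END). */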
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

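/*
 * Run the IDLE_END notifiers at most once per idle period: the PDA isidle
 * bit is tested and cleared atomically, so a racing interrupt and the idle
 * loop cannot both fire the notifier.
 */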
static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better idle routine.
 */
static void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		ktime_t t0, t1;
		u64 t0n, t1n;

		t0 = ktime_get();
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		local_irq_disable();
		t1 = ktime_get();
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
	} else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

static void do_nothing(void *unused)
{
}

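/*
 * cpu_idle_wait - wait for every online CPU to pass through its idle
 * loop at least once.  Callers use this after changing pm_idle so that
 * no CPU can still be running the old idle routine when it returns.
 */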
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
		/*
		 * We waited 1 sec; if a CPU still has not entered idle,
		 * it may be because it is already idle and not waking up,
		 * because it has nothing to do.
		 * Give all the remaining CPUs a kick.
		 */
		smp_call_function_mask(map, do_nothing, 0, 0);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			tick_nohz_stop_sched_tick();

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI otherwise needed to trigger a need_resched
 * check.  We execute MONITOR against need_resched and enter an optimized
 * wait state through MWAIT.  Whenever someone changes need_resched, we
 * are woken up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}

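/*
 * Illustrative caller (a sketch, not from this file): the ACPI C-state
 * code passes the C-state's MWAIT hint in eax, roughly
 *
 *	mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
 */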
/* Default MONITOR/MWAIT with no hints, used for the default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait.
		 */
		if (!pm_idle) {
			if (!printed) {
				printk(KERN_INFO "using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

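/*
 * "idle=" boot parameter: "idle=poll" selects the polling idle loop
 * above; "idle=mwait" sets force_mwait so MWAIT-based idle is used.
 */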
static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	clear_fpu(tsk);
	clear_used_math();
}

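/*
 * release_thread - final sanity check when a task dies: the process must
 * not leave a populated LDT behind at this point.
 */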
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

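/*
 * Helpers for the 32-bit TLS slots in thread.tls_array: each slot is a
 * GDT descriptor, so a 32-bit base can be installed and read back without
 * touching the FS/GS base MSRs.
 */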
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

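/*
 * __switch_to_xtra - handle the rarely used, expensive parts of a context
 * switch: reloading the debug registers when the next task uses hardware
 * breakpoints, and copying or clearing the TSS I/O permission bitmap.
 */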
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/*
		 * A segment register != 0 always requires a reload;
		 * also reload when it has changed.  When the previous
		 * process used a 64-bit base, always reload to avoid
		 * an information leak.
		 */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/*
			 * Check if the user used a selector != 0; if yes,
			 * clear the 64-bit base, since an overloaded base
			 * is always mapped to the NULL selector.
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when the next process has a 64-bit base, use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do
	 * a full restore of the math state immediately to avoid the trap;
	 * the chances of needing the FPU soon are obviously high now.
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64-bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user setup.  Should have two bits.
	 * But 64-bit processes have always behaved this way, so it's
	 * not too bad.  The main problem is just that 32-bit children
	 * are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		       NULL, NULL);
}

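/*
 * get_wchan - report where a sleeping task is blocked (for
 * /proc/<pid>/wchan): walk the saved frame pointers from the task's saved
 * rsp and return the first return address outside the scheduler, giving
 * up after 16 frames.
 */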
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

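/*
 * do_arch_prctl - set or read the FS/GS base of @task.  Bases that fit in
 * 32 bits go through a GDT TLS slot, which is cheaper to switch; larger
 * bases are written to the FS/GS base MSRs directly.
 */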
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

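/*
 * Illustrative userspace usage of the syscall above (a sketch, not part
 * of this file):
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, some_base);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * Note that the GET variants treat addr as a user pointer into which the
 * base is stored via put_user().
 */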
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

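/*
 * arch_align_stack - randomize the initial stack top by up to 8 kB
 * (unless the task or sysctl disables VA randomization) and keep the
 * result 16-byte aligned as the ABI expects.
 */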
918unsigned long arch_align_stack(unsigned long sp)
919{
Andi Kleenc16b63e2006-09-26 10:52:28 +0200920 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700921 sp -= get_random_int() % 8192;
922 return sp & ~0xf;
923}