/*
* entry.S contains the system-call and fault low-level handling routines.
*
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
* A note on terminology:
* - top of stack: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
- * - partial stack frame: partially saved registers upto R11.
+ * - partial stack frame: partially saved registers up to R11.
* - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define __AUDIT_ARCH_LE 0x40000000
.code64
+ .section .entry.text, "ax"
+
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
leaq 8(%rbp), %rdi
movq 0x38(%rsp), %rsi
+ movq (%rbp), %rdx
subq $MCOUNT_INSN_SIZE, %rsi
call prepare_ftrace_return
END(ftrace_graph_caller)
GLOBAL(return_to_handler)
- subq $80, %rsp
+ subq $24, %rsp
/* Save the return values */
movq %rax, (%rsp)
movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
call ftrace_return_to_handler
- movq %rax, 72(%rsp)
+ movq %rax, %rdi
movq 8(%rsp), %rdx
movq (%rsp), %rax
- addq $72, %rsp
- retq
+ addq $24, %rsp
+ jmp *%rdi
#endif
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq $__KERNEL_DS /* ss */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_DS /* ss */
/*CFI_REL_OFFSET ss,0*/
- pushq %rax /* rsp */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq $X86_EFLAGS_IF /* eflags - interrupts on */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
- pushq $__KERNEL_CS /* cs */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
- pushq \child_rip /* rip */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi \child_rip /* rip */
CFI_REL_OFFSET rip,0
- pushq %rax /* orig rax */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* orig rax */
.endm
.macro UNFAKE_STACK_FRAME
.endm
/* save partial stack frame */
-ENTRY(save_args)
- XCPT_FRAME
+ .macro SAVE_ARGS_IRQ
cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
- testl $3, CS(%rdi)
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, RDI-RBP
+ movq_cfi rsi, RSI-RBP
+ movq_cfi rdx, RDX-RBP
+ movq_cfi rcx, RCX-RBP
+ movq_cfi rax, RAX-RBP
+ movq_cfi r8, R8-RBP
+ movq_cfi r9, R9-RBP
+ movq_cfi r10, R10-RBP
+ movq_cfi r11, R11-RBP
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
+ testl $3, CS-RBP(%rsi)
je 1f
SWAPGS
/*
* moving irq_enter into assembly, which would be too much work)
*/
1: incl PER_CPU_VAR(irq_count)
- jne 2f
- popq_cfi %rax /* move return address... */
- mov PER_CPU_VAR(irq_stack_ptr),%rsp
- EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ CFI_DEF_CFA_REGISTER rsi
+
+ /* Store previous stack value */
+ pushq %rsi
+ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
+ 0x77 /* DW_OP_breg7 */, 0, \
+ 0x06 /* DW_OP_deref */, \
+ 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x22 /* DW_OP_plus */
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+ .endm
ENTRY(save_rest)
PARTIAL_FRAME 1 REST_SKIP+8
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
+ pushq_cfi kernel_eflags(%rip)
+ popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
+ jz retint_restore_args
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
END(ret_from_fork)
/*
- * System call entry. Upto 6 arguments in registers are supported.
+ * System call entry. Up to 6 arguments in registers are supported.
*
* SYSCALL does not save anything on the stack and does not change the
* stack pointer.
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
-ENTRY(system_call_after_swapgs)
+GLOBAL(system_call_after_swapgs)
movq %rsp,PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack),%rsp
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
+ SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja badsys
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
/* edi: flagmask */
sysret_check:
LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
+ movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
+ RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
jmp sysret_check
/* Handle a signal */
bt $TIF_SYSCALL_AUDIT,%edx
jc sysret_audit
#endif
- /* edx: work flags (arg3) */
- leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
- xorl %esi,%esi # oldset -> arg2
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call do_notify_resume
- RESTORE_TOP_OF_STACK %r11
- RESTORE_REST
- movl $_TIF_WORK_MASK,%edi
- /* Use IRET because user could have changed frame. This
- works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
+ /*
+ * We have a signal, or exit tracing or single-step.
+ * These all wind up with the iret return path anyway,
+ * so just join that path right now.
+ */
+ FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+ jmp int_check_syscall_exit_work
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
#ifdef CONFIG_AUDITSYSCALL
/*
* Fast path for syscall audit without full syscall trace.
- * We just call audit_syscall_entry() directly, and then
+ * We just call __audit_syscall_entry() directly, and then
* jump back to the normal fast path.
*/
auditsys:
movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
movq %rax,%rsi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
LOAD_ARGS 0 /* reload call-clobbered registers */
jmp system_call_fastpath
/*
- * Return fast path for syscall audit. Call audit_syscall_exit()
+ * Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
*/
sysret_audit:
- movq %rax,%rsi /* second arg, syscall return value */
- cmpq $0,%rax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
+ movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
+ cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
+ setbe %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
+ call __audit_syscall_exit
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */
/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
SAVE_REST
*/
LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_restore_args
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+int_check_syscall_exit_work:
SAVE_REST
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
ENTRY(stub_execve)
CFI_STARTPROC
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
+ addq $8, %rsp
+ PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
movq %rsp, %rcx
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
+ PARTIAL_FRAME 0
SAVE_REST
movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
CFI_ENDPROC
END(stub_rt_sigreturn)
+#ifdef CONFIG_X86_X32_ABI
+ PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
+
+ENTRY(stub_x32_rt_sigreturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ movq %rsp,%rdi
+ FIXUP_TOP_OF_STACK %r11
+ call sys32_x32_rt_sigreturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_rt_sigreturn)
+
+ENTRY(stub_x32_execve)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ movq %rsp, %rcx
+ call sys32_execve
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_execve)
+
+#endif
+
/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
*/
.section .init.rodata,"a"
ENTRY(interrupt)
- .text
+ .section .entry.text
.p2align 5
.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -8
.endif
-1: pushq $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 8
+1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
.previous
.quad 1b
- .text
+ .section .entry.text
vector=vector+1
.endif
.endr
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $10*8, %rsp
- CFI_ADJUST_CFA_OFFSET 10*8
- call save_args
- PARTIAL_FRAME 0
+ /* reserve pt_regs for scratch regs and rbp */
+ subq $ORIG_RAX-RBP, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
+ SAVE_ARGS_IRQ
call \func
.endm
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
- leaveq
+
+ /* Restore saved previous stack */
+ popq %rsi
+ CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
+ leaq ARGOFFSET-RBP(%rsi), %rsp
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
+ CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
+
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 0,8,0
+ RESTORE_ARGS 1,8,1
irq_return:
INTERRUPT_RETURN
jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
CFI_ENDPROC
END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+ .popsection
/*
* APIC interrupts.
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
INTR_FRAME
- pushq $~(\num)
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $~(\num)
+.Lcommon_\sym:
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt GENERIC_INTERRUPT_VECTOR \
- generic_interrupt smp_generic_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR \
+ x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_SMP
-apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
- invalidate_interrupt0 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
- invalidate_interrupt1 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
- invalidate_interrupt2 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
- invalidate_interrupt3 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
- invalidate_interrupt4 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
- invalidate_interrupt5 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
- invalidate_interrupt6 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
- invalidate_interrupt7 smp_invalidate_interrupt
+ ALIGN
+ INTR_FRAME
+.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
+ENTRY(invalidate_interrupt\idx)
+ pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
+ jmp .Lcommon_invalidate_interrupt0
+ CFI_ADJUST_CFA_OFFSET -8
+END(invalidate_interrupt\idx)
+.endif
+.endr
+ CFI_ENDPROC
+apicinterrupt INVALIDATE_TLB_VECTOR_START, \
+ invalidate_interrupt0, smp_invalidate_interrupt
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
-#ifdef CONFIG_PERF_COUNTERS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+ irq_work_interrupt smp_irq_work_interrupt
#endif
/*
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
END(\sym)
.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %rbp)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
TRACE_IRQS_OFF
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error
+
/* Reload gs selector with exception handling */
/* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
- pushf
- CFI_ADJUST_CFA_OFFSET 8
+ pushfq_cfi
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
gs_change:
movl %edi,%gs
2: mfence /* workaround */
SWAPGS
- popf
- CFI_ADJUST_CFA_OFFSET -8
+ popfq_cfi
ret
CFI_ENDPROC
END(native_load_gs_index)
jmp 2b
.previous
-/*
- * Create a kernel thread.
- *
- * C extern interface:
- * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
- *
- * asm input arguments:
- * rdi: fn, rsi: arg, rdx: flags
- */
-ENTRY(kernel_thread)
- CFI_STARTPROC
- FAKE_STACK_FRAME $child_rip
- SAVE_ALL
-
- # rdi: flags, rsi: usp, rdx: will be &pt_regs
- movq %rdx,%rdi
- orq kernel_thread_flags(%rip),%rdi
- movq $-1, %rsi
- movq %rsp, %rdx
-
- xorl %r8d,%r8d
- xorl %r9d,%r9d
-
- # clone now
- call do_fork
- movq %rax,RAX(%rsp)
- xorl %edi,%edi
-
- /*
- * It isn't worth to check for reschedule here,
- * so internally to the x86_64 port you can rely on kernel_thread()
- * not to reschedule the child before returning, this avoids the need
- * of hacks for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
- */
- RESTORE_ALL
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_thread)
-
-ENTRY(child_rip)
+ENTRY(kernel_thread_helper)
pushq $0 # fake return address
CFI_STARTPROC
/*
* Here we are in the child and the registers are set as they were
* at kernel_thread() invocation in the parent.
*/
- movq %rdi, %rax
- movq %rsi, %rdi
- call *%rax
+ call *%rsi
# exit
mov %eax, %edi
call do_exit
ud2 # padding for call trace
CFI_ENDPROC
-END(child_rip)
+END(kernel_thread_helper)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
*
* C extern interface:
- * extern long execve(char *name, char **argv, char **envp)
+ * extern long execve(const char *name, char **argv, char **envp)
*
* asm input arguments:
* rdi: name, rsi: argv, rdx: envp
*
* We want to fallback into:
- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
*
* do_sys_execve asm fallback arguments:
* rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
- push %rbp
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
decl PER_CPU_VAR(irq_count)
decl PER_CPU_VAR(irq_count)
jmp error_exit
CFI_ENDPROC
-END(do_hypervisor_callback)
+END(xen_do_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
CFI_ENDPROC
END(xen_failsafe_callback)
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
#endif /* CONFIG_XEN */
/*
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
/* ebx: no swapgs flag */
ENTRY(paranoid_exit)
- INTR_FRAME
+ DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %ebx,%ebx /* swapgs needed? */
error_sti:
TRACE_IRQS_OFF
ret
- CFI_ENDPROC
/*
* There are two places in the kernel that can potentially fault with
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
+ movl %ecx,%eax /* zero extend */
+ cmpq %rax,RIP+8(%rsp)
+ je bstep_iret
cmpq $gs_change,RIP+8(%rsp)
je error_swapgs
jmp error_sti
+
+bstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx,RIP+8(%rsp)
+ jmp error_swapgs
+ CFI_ENDPROC
END(error_entry)
CFI_ENDPROC
END(error_exit)
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+ .macro test_in_nmi reg stack nmi_ret normal_ret
+ cmpq %\reg, \stack
+ ja \normal_ret
+ subq $EXCEPTION_STKSZ, %\reg
+ cmpq %\reg, \stack
+ jb \normal_ret
+ jmp \nmi_ret
+ .endm
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check the a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to the repeate_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
+
+ /* Use %rdx as out temp variable throughout */
+ pushq_cfi %rdx
+ CFI_REL_OFFSET rdx, 0
+
+ /*
+ * If %cs was not the kernel segment, then the NMI triggered in user
+ * space, which means it is definitely not nested.
+ */
+ cmpl $__KERNEL_CS, 16(%rsp)
+ jne first_nmi
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+ lea 6*8(%rsp), %rdx
+ test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ CFI_REMEMBER_STATE
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ leaq -6*8(%rsp), %rdx
+ movq %rdx, %rsp
+ CFI_ADJUST_CFA_OFFSET 6*8
+ pushq_cfi $__KERNEL_DS
+ pushq_cfi %rdx
+ pushfq_cfi
+ pushq_cfi $__KERNEL_CS
+ pushq_cfi $repeat_nmi
+
+ /* Put stack back */
+ addq $(11*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+ popq_cfi %rdx
+ CFI_RESTORE rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+ CFI_RESTORE_STATE
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +-------------------------+
+ * | temp storage for rdx |
+ * +-------------------------+
+ * | NMI executing variable |
+ * +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
+ * | copied SS |
+ * | copied Return RSP |
+ * | copied RFLAGS |
+ * | copied CS |
+ * | copied RIP |
+ * +-------------------------+
+ * | pt_regs |
+ * +-------------------------+
+ *
+ * The saved stack frame is used to fix up the copied stack frame
+ * that a nested NMI may change to make the interrupted NMI iret jump
+ * to the repeat_nmi. The original stack frame and the temp storage
+ * is also used by nested NMIs and can not be trusted on exit.
+ */
+ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ movq (%rsp), %rdx
+ CFI_RESTORE rdx
+
+ /* Set the NMI executing variable on the stack. */
+ pushq_cfi $1
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq_cfi 6*8(%rsp)
+ .endr
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+
+ /* Everything up to here is safe from nested NMIs */
+
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ */
+repeat_nmi:
+ /*
+ * Update the stack variable to say we are still in NMI (the update
+ * is benign for the non-repeat case, where 1 was pushed just above
+ * to this very stack slot).
+ */
+ movq $1, 5*8(%rsp)
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+end_repeat_nmi:
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception and reset our iret stack
+ * so that we repeat another NMI.
+ */
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ /*
+ * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context.
+ * Even with normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
+ /* Clear the NMI executing stack variable */
+ movq $0, 10*8(%rsp)
jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
END(nmi)
ENTRY(ignore_sysret)