[IA64] wire up sendmmsg() syscall for Itanium
[linux-2.6.git] / arch / ia64 / kernel / entry.S
index d3f0938..97dd2ab 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * ia64/kernel/entry.S
+ * arch/ia64/kernel/entry.S
  *
  * Kernel entry points.
  *
  * 11/07/2000
  */
 /*
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *                    pv_ops.
+ */
+/*
  * Global (preserved) predicate usage on syscall entry/exit path:
  *
  *     pKStk:          See entry.h.
  *     pNonSys:        !pSys
  */
 
-#include <linux/config.h>
 
 #include <asm/asmmacro.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
 #include <asm/kregs.h>
-#include <asm/offsets.h>
+#include <asm/asm-offsets.h>
 #include <asm/pgtable.h>
 #include <asm/percpu.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 
 #include "minstate.h"
 
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
        /*
         * execve() is special because in case of success, we need to
         * setup a null register window frame.
@@ -65,15 +71,6 @@ ENTRY(ia64_execve)
        add out3=16,sp                  // regs
        br.call.sptk.many rp=sys_execve
 .ret0:
-#ifdef CONFIG_IA32_SUPPORT
-       /*
-        * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers
-        * from pt_regs.
-        */
-       adds r16=PT(CR_IPSR)+16,sp
-       ;;
-       ld8 r16=[r16]
-#endif
        cmp4.ge p6,p7=r8,r0
        mov ar.pfs=loc1                 // restore ar.pfs
        sxt4 r8=r8                      // return 64-bit result
@@ -102,12 +99,6 @@ ENTRY(ia64_execve)
        ldf.fill f23=[sp];      ldf.fill f24=[sp];      mov f25=f0
        ldf.fill f26=[sp];      ldf.fill f27=[sp];      mov f28=f0
        ldf.fill f29=[sp];      ldf.fill f30=[sp];      mov f31=f0
-#ifdef CONFIG_IA32_SUPPORT
-       tbit.nz p6,p0=r16, IA64_PSR_IS_BIT
-       movl loc0=ia64_ret_from_ia32_execve
-       ;;
-(p6)   mov rp=loc0
-#endif
        br.ret.sptk.many rp
 END(ia64_execve)
 
@@ -174,6 +165,7 @@ GLOBAL_ENTRY(sys_clone)
        mov rp=loc0
        br.ret.sptk.many rp
 END(sys_clone)
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
 
 /*
  * prev_task <- ia64_switch_to(struct task_struct *next)
@@ -181,7 +173,7 @@ END(sys_clone)
  *     called.  The code starting at .map relies on this.  The rest of the code
  *     doesn't care about the interrupt masking status.
  */
-GLOBAL_ENTRY(ia64_switch_to)
+GLOBAL_ENTRY(__paravirt_switch_to)
        .prologue
        alloc r16=ar.pfs,1,0,0,0
        DO_SAVE_SWITCH_STACK
@@ -204,11 +196,8 @@ GLOBAL_ENTRY(ia64_switch_to)
 (p6)   br.cond.dpnt .map
        ;;
 .done:
-(p6)   ssm psr.ic                      // if we had to map, reenable the psr.ic bit FIRST!!!
-       ;;
-(p6)   srlz.d
        ld8 sp=[r21]                    // load kernel stack pointer of new task
-       mov IA64_KR(CURRENT)=in0        // update "current" application register
+       MOV_TO_KR(CURRENT, in0, r8, r9)         // update "current" application register
        mov r8=r13                      // return pointer to previously running task
        mov r13=in0                     // set "current" pointer
        ;;
@@ -220,23 +209,25 @@ GLOBAL_ENTRY(ia64_switch_to)
        br.ret.sptk.many rp             // boogie on out in new context
 
 .map:
-       rsm psr.ic                      // interrupts (psr.i) are already disabled here
+       RSM_PSR_IC(r25)                 // interrupts (psr.i) are already disabled here
        movl r25=PAGE_KERNEL
        ;;
        srlz.d
        or r23=r25,r20                  // construct PA | page properties
        mov r25=IA64_GRANULE_SHIFT<<2
        ;;
-       mov cr.itir=r25
-       mov cr.ifa=in0                  // VA of next task...
+       MOV_TO_ITIR(p0, r25, r8)
+       MOV_TO_IFA(in0, r8)             // VA of next task...
        ;;
        mov r25=IA64_TR_CURRENT_STACK
-       mov IA64_KR(CURRENT_STACK)=r26  // remember last page we mapped...
+       MOV_TO_KR(CURRENT_STACK, r26, r8, r9)   // remember last page we mapped...
        ;;
        itr.d dtr[r25]=r23              // wire in new mapping...
+       SSM_PSR_IC_AND_SRLZ_D(r8, r9)   // reenable the psr.ic bit
        br.cond.sptk .done
-END(ia64_switch_to)
+END(__paravirt_switch_to)
 
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 /*
  * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
  * means that we may get an interrupt with "sp" pointing to the new kernel stack while
@@ -376,7 +367,7 @@ END(save_switch_stack)
  *     - b7 holds address to return to
  *     - must not touch r8-r11
  */
-ENTRY(load_switch_stack)
+GLOBAL_ENTRY(load_switch_stack)
        .prologue
        .altrp b7
 
@@ -470,23 +461,35 @@ ENTRY(load_switch_stack)
        br.cond.sptk.many b7
 END(load_switch_stack)
 
-GLOBAL_ENTRY(__ia64_syscall)
-       .regstk 6,0,0,0
-       mov r15=in5                             // put syscall number in place
-       break __BREAK_SYSCALL
-       movl r2=errno
-       cmp.eq p6,p7=-1,r10
+GLOBAL_ENTRY(prefetch_stack)                   // prefetch lines below sp and at the stack pointer loaded from task in0
+       add r14 = -IA64_SWITCH_STACK_SIZE, sp   // r14 = sp - IA64_SWITCH_STACK_SIZE
+       add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0 // r15 = in0 + IA64_TASK_THREAD_KSP_OFFSET
+       ;;
+       ld8 r16 = [r15]                         // load next's stack pointer
+       lfetch.fault.excl [r14], 128            // exclusive-hint prefetch at r14, then r14 += 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128                 // prefetch at r16, then r16 += 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
        ;;
-(p6)   st4 [r2]=r8
-(p6)   mov r8=-1
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault.excl [r14], 128            // fifth and last prefetch through r14
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault [r16], 128                 // fifth and last prefetch through r16
        br.ret.sptk.many rp
-END(__ia64_syscall)
+END(prefetch_stack)
 
-GLOBAL_ENTRY(execve)
+GLOBAL_ENTRY(kernel_execve)                    // issue __NR_execve through break __BREAK_SYSCALL, then return via rp
+       rum psr.ac                              // reset user mask: clear psr.ac before the break
        mov r15=__NR_execve                     // put syscall number in place
        break __BREAK_SYSCALL
        br.ret.sptk.many rp
-END(execve)
+END(kernel_execve)
 
 GLOBAL_ENTRY(clone)
        mov r15=__NR_clone                      // put syscall number in place
@@ -518,6 +521,11 @@ GLOBAL_ENTRY(ia64_trace_syscall)
        stf.spill [r16]=f10
        stf.spill [r17]=f11
        br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
+       cmp.lt p6,p0=r8,r0                      // check tracehook
+       adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
+       adds r3=PT(R10)+16,sp                   // r3 = &pt_regs.r10
+       mov r10=0
+(p6)   br.cond.sptk strace_error               // syscall failed ->
        adds r16=PT(F6)+16,sp
        adds r17=PT(F7)+16,sp
        ;;
@@ -558,7 +566,10 @@ GLOBAL_ENTRY(ia64_trace_syscall)
 .mem.offset 0,0; st8.spill [r2]=r8             // store return value in slot for r8
 .mem.offset 8,0; st8.spill [r3]=r10            // clear error indication in slot for r10
        br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-.ret3: br.cond.sptk .work_pending_syscall_end
+.ret3:
+(pUStk)        cmp.eq.unc p6,p0=r0,r0                  // p6 <- pUStk
+(pUStk)        rsm psr.i                               // disable interrupts
+       br.cond.sptk ia64_work_pending_syscall_end
 
 strace_error:
        ld8 r3=[r2]                             // load pt_regs.r8
@@ -623,8 +634,17 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
        adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
        mov r10=r0                              // clear error indication in r10
 (p7)   br.cond.spnt handle_syscall_error       // handle potential syscall failure
+#ifdef CONFIG_PARAVIRT
+       ;;
+       br.cond.sptk.few ia64_leave_syscall
+       ;;
+#endif /* CONFIG_PARAVIRT */
 END(ia64_ret_from_syscall)
+#ifndef CONFIG_PARAVIRT
        // fall through
+#endif
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
+
 /*
  * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
  *     need to switch to bank 0 and doesn't restore the scratch registers.
@@ -637,7 +657,7 @@ END(ia64_ret_from_syscall)
  *           r8-r11: restored (syscall return value(s))
  *              r12: restored (user-level stack pointer)
  *              r13: restored (user-level thread pointer)
- *              r14: cleared
+ *              r14: set to __kernel_syscall_via_epc
  *              r15: restored (syscall #)
  *          r16-r17: cleared
  *              r18: user-level b6
@@ -658,7 +678,7 @@ END(ia64_ret_from_syscall)
  *               pr: restored (user-level pr)
  *               b0: restored (user-level rp)
  *               b6: restored
- *               b7: cleared
+ *               b7: set to __kernel_syscall_via_epc
  *          ar.unat: restored (user-level ar.unat)
  *           ar.pfs: restored (user-level ar.pfs)
  *           ar.rsc: restored (user-level ar.rsc)
@@ -669,7 +689,7 @@ END(ia64_ret_from_syscall)
  *           ar.csd: cleared
  *           ar.ssd: cleared
  */
-ENTRY(ia64_leave_syscall)
+GLOBAL_ENTRY(__paravirt_leave_syscall)
        PT_REGS_UNWIND_INFO(0)
        /*
         * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -679,11 +699,11 @@ ENTRY(ia64_leave_syscall)
         * extra work.  We always check for extra work when returning to user-level.
         * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
         * is 0.  After extra work processing has been completed, execution
-        * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
+        * resumes at ia64_work_processed_syscall with p6 set to 1 if the extra-work-check
         * needs to be redone.
         */
 #ifdef CONFIG_PREEMPT
-       rsm psr.i                               // disable interrupts
+       RSM_PSR_I(p0, r2, r18)                  // disable interrupts
        cmp.eq pLvSys,p0=r0,r0                  // pLvSys=1: leave from syscall
 (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
        ;;
@@ -693,99 +713,127 @@ ENTRY(ia64_leave_syscall)
        ;;
        cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
 #else /* !CONFIG_PREEMPT */
-(pUStk)        rsm psr.i
+       RSM_PSR_I(pUStk, r2, r18)
        cmp.eq pLvSys,p0=r0,r0          // pLvSys=1: leave from syscall
 (pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
 #endif
-.work_processed_syscall:
+.global __paravirt_work_processed_syscall;
+__paravirt_work_processed_syscall:
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+       adds r2=PT(LOADRS)+16,r12
+       MOV_FROM_ITC(pUStk, p9, r22, r19)       // fetch time at leave
+       adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
+       ;;
+(p6)   ld4 r31=[r18]                           // load current_thread_info()->flags
+       ld8 r19=[r2],PT(B6)-PT(LOADRS)          // load ar.rsc value for "loadrs"
+       adds r3=PT(AR_BSPSTORE)+16,r12          // deferred
+       ;;
+#else
        adds r2=PT(LOADRS)+16,r12
        adds r3=PT(AR_BSPSTORE)+16,r12
        adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
        ;;
 (p6)   ld4 r31=[r18]                           // load current_thread_info()->flags
        ld8 r19=[r2],PT(B6)-PT(LOADRS)          // load ar.rsc value for "loadrs"
-       mov b7=r0               // clear b7
+       nop.i 0
        ;;
-       ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)    // load ar.bspstore (may be garbage)
+#endif
+       mov r16=ar.bsp                          // M2  get existing backing store pointer
        ld8 r18=[r2],PT(R9)-PT(B6)              // load b6
 (p6)   and r15=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
        ;;
-       mov r16=ar.bsp                          // M2  get existing backing store pointer
+       ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)    // load ar.bspstore (may be garbage)
 (p6)   cmp4.ne.unc p6,p0=r15, r0               // any special work pending?
 (p6)   br.cond.spnt .work_pending_syscall
        ;;
        // start restoring the state saved on the kernel stack (struct pt_regs):
        ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
        ld8 r11=[r3],PT(CR_IIP)-PT(R11)
-       mov f6=f0               // clear f6
+(pNonSys) break 0              //      bug check: we shouldn't be here if pNonSys is TRUE!
        ;;
        invala                  // M0|1 invalidate ALAT
-       rsm psr.i | psr.ic      // M2 initiate turning off of interrupt and interruption collection
-       mov f9=f0               // clear f9
+       RSM_PSR_I_IC(r28, r29, r30)     // M2   turn off interrupts and interruption collection
+       cmp.eq p9,p0=r0,r0      // A    set p9 to indicate that we should restore cr.ifs
 
-       ld8 r29=[r2],16         // load cr.ipsr
-       ld8 r28=[r3],16                 // load cr.iip
-       mov f8=f0               // clear f8
+       ld8 r29=[r2],16         // M0|1 load cr.ipsr
+       ld8 r28=[r3],16         // M0|1 load cr.iip
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13
        ;;
        ld8 r30=[r2],16         // M0|1 load cr.ifs
        ld8 r25=[r3],16         // M0|1 load ar.unat
-       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
+(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
+       ;;
+#else
+       mov r22=r0              // A    clear r22
+       ;;
+       ld8 r30=[r2],16         // M0|1 load cr.ifs
+       ld8 r25=[r3],16         // M0|1 load ar.unat
+(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
        ;;
+#endif
        ld8 r26=[r2],PT(B0)-PT(AR_PFS)  // M0|1 load ar.pfs
-(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
-       mov f10=f0              // clear f10
+       MOV_FROM_PSR(pKStk, r22, r21)   // M2   read PSR now that interrupts are disabled
+       nop 0
        ;;
-       ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
-       ld8 r27=[r3],PT(PR)-PT(AR_RSC)  // load ar.rsc
-       mov f11=f0              // clear f11
+       ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
+       ld8 r27=[r3],PT(PR)-PT(AR_RSC)  // M0|1 load ar.rsc
+       mov f6=f0                       // F    clear f6
        ;;
-       ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)    // load ar.rnat (may be garbage)
-       ld8 r31=[r3],PT(R1)-PT(PR)              // load predicates
-(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
+       ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)    // M0|1 load ar.rnat (may be garbage)
+       ld8 r31=[r3],PT(R1)-PT(PR)              // M0|1 load predicates
+       mov f7=f0                               // F    clear f7
        ;;
-       ld8 r20=[r2],PT(R12)-PT(AR_FPSR)        // load ar.fpsr
-       ld8.fill r1=[r3],16     // load r1
-(pUStk) mov r17=1
+       ld8 r20=[r2],PT(R12)-PT(AR_FPSR)        // M0|1 load ar.fpsr
+       ld8.fill r1=[r3],16                     // M0|1 load r1
+(pUStk) mov r17=1                              // A
        ;;
-       srlz.d                  // M0  ensure interruption collection is off
-       ld8.fill r13=[r3],16
-       mov f7=f0               // clear f7
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk) st1 [r15]=r17                          // M2|3
+#else
+(pUStk) st1 [r14]=r17                          // M2|3
+#endif
+       ld8.fill r13=[r3],16                    // M0|1
+       mov f8=f0                               // F    clear f8
        ;;
-       ld8.fill r12=[r2]       // restore r12 (sp)
-       mov.m ar.ssd=r0         // M2 clear ar.ssd
-       mov r22=r0              // clear r22
+       ld8.fill r12=[r2]                       // M0|1 restore r12 (sp)
+       ld8.fill r15=[r3]                       // M0|1 restore r15
+       mov b6=r18                              // I0   restore b6
 
-       ld8.fill r15=[r3]       // restore r15
-(pUStk) st1 [r14]=r17
-       addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
+       LOAD_PHYS_STACK_REG_SIZE(r17)
+       mov f9=f0                                       // F    clear f9
+(pKStk) br.cond.dpnt.many skip_rbs_switch              // B
+
+       srlz.d                          // M0   ensure interruption collection is off (for cover)
+       shr.u r18=r19,16                // I0|1 get byte size of existing "dirty" partition
+       COVER                           // B    add current frame into dirty partition & set cr.ifs
+       ;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+       mov r19=ar.bsp                  // M2   get new backing store pointer
+       st8 [r14]=r22                   // M    save time at leave
+       mov f10=f0                      // F    clear f10
+
+       mov r22=r0                      // A    clear r22
+       movl r14=__kernel_syscall_via_epc // X
        ;;
-(pUStk)        ld4 r17=[r3]            // r17 = cpu_data->phys_stacked_size_p8
-       mov.m ar.csd=r0         // M2 clear ar.csd
-       mov b6=r18              // I0  restore b6
+#else
+       mov r19=ar.bsp                  // M2   get new backing store pointer
+       mov f10=f0                      // F    clear f10
+
+       nop.m 0
+       movl r14=__kernel_syscall_via_epc // X
        ;;
-       mov r14=r0              // clear r14
-       shr.u r18=r19,16        // I0|1 get byte size of existing "dirty" partition
-(pKStk) br.cond.dpnt.many skip_rbs_switch
+#endif
+       mov.m ar.csd=r0                 // M2   clear ar.csd
+       mov.m ar.ccv=r0                 // M2   clear ar.ccv
+       mov b7=r14                      // I0   clear b7 (hint with __kernel_syscall_via_epc)
 
-       mov.m ar.ccv=r0         // clear ar.ccv
-(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
-       br.cond.sptk.many rbs_switch
-END(ia64_leave_syscall)
+       mov.m ar.ssd=r0                 // M2   clear ar.ssd
+       mov f11=f0                      // F    clear f11
+       br.cond.sptk.many rbs_switch    // B
+END(__paravirt_leave_syscall)
 
-#ifdef CONFIG_IA32_SUPPORT
-GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
-       PT_REGS_UNWIND_INFO(0)
-       adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
-       adds r3=PT(R10)+16,sp                   // r3 = &pt_regs.r10
-       ;;
-       .mem.offset 0,0
-       st8.spill [r2]=r8       // store return value in slot for r8 and set unat bit
-       .mem.offset 8,0
-       st8.spill [r3]=r0       // clear error indication in slot for r10 and set unat bit
-END(ia64_ret_from_ia32_execve_syscall)
-       // fall through
-#endif /* CONFIG_IA32_SUPPORT */
-GLOBAL_ENTRY(ia64_leave_kernel)
+GLOBAL_ENTRY(__paravirt_leave_kernel)
        PT_REGS_UNWIND_INFO(0)
        /*
         * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -799,7 +847,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
         * needs to be redone.
         */
 #ifdef CONFIG_PREEMPT
-       rsm psr.i                               // disable interrupts
+       RSM_PSR_I(p0, r17, r31)                 // disable interrupts
        cmp.eq p0,pLvSys=r0,r0                  // pLvSys=0: leave from kernel
 (pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
        ;;
@@ -809,7 +857,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
        ;;
        cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
 #else
-(pUStk)        rsm psr.i
+       RSM_PSR_I(pUStk, r17, r31)
        cmp.eq p0,pLvSys=r0,r0          // pLvSys=0: leave from kernel
 (pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
 #endif
@@ -857,7 +905,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
        mov ar.csd=r30
        mov ar.ssd=r31
        ;;
-       rsm psr.i | psr.ic      // initiate turning off of interrupt and interruption collection
+       RSM_PSR_I_IC(r23, r22, r25)     // initiate turning off of interrupt and interruption collection
        invala                  // invalidate ALAT
        ;;
        ld8.fill r22=[r2],24
@@ -885,20 +933,28 @@ GLOBAL_ENTRY(ia64_leave_kernel)
        ldf.fill f7=[r2],PT(F11)-PT(F7)
        ldf.fill f8=[r3],32
        ;;
-       srlz.i                  // ensure interruption collection is off
+       srlz.d  // ensure that inter. collection is off (VHPT is don't care, since text is pinned)
        mov ar.ccv=r15
        ;;
        ldf.fill f11=[r2]
-       bsw.0                   // switch back to bank 0 (no stop bit required beforehand...)
+       BSW_0(r2, r3, r15)      // switch back to bank 0 (no stop bit required beforehand...)
        ;;
 (pUStk)        mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
        adds r16=PT(CR_IPSR)+16,r12
        adds r17=PT(CR_IIP)+16,r12
 
-(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+       .pred.rel.mutex pUStk,pKStk
+       MOV_FROM_PSR(pKStk, r22, r29)   // M2 read PSR now that interrupts are disabled
+       MOV_FROM_ITC(pUStk, p9, r22, r29)       // M  fetch time at leave
+       nop.i 0
+       ;;
+#else
+       MOV_FROM_PSR(pKStk, r22, r29)   // M2 read PSR now that interrupts are disabled
        nop.i 0
        nop.i 0
        ;;
+#endif
        ld8 r29=[r16],16        // load cr.ipsr
        ld8 r28=[r17],16        // load cr.iip
        ;;
@@ -920,24 +976,45 @@ GLOBAL_ENTRY(ia64_leave_kernel)
        ;;
        ld8.fill r12=[r16],16
        ld8.fill r13=[r17],16
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk)        adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18
+#else
 (pUStk)        adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
+#endif
        ;;
        ld8 r20=[r16],16        // ar.fpsr
        ld8.fill r15=[r17],16
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk)        adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18  // deferred
+#endif
        ;;
        ld8.fill r14=[r16],16
        ld8.fill r2=[r17]
 (pUStk)        mov r17=1
        ;;
-       ld8.fill r3=[r16]
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+       //  mmi_ :  ld8 st1 shr;;         mmi_ : st8 st1 shr;;
+       //  mib  :  mov add br        ->  mib  : ld8 add br
+       //  bbb_ :  br  nop cover;;       mbb_ : mov br  cover;;
+       //
+       //  no one requires bsp in r16 if the (pKStk) branch is taken.
+(pUStk)        st8 [r3]=r22            // save time at leave
 (pUStk)        st1 [r18]=r17           // restore current->thread.on_ustack
        shr.u r18=r19,16        // get byte size of existing "dirty" partition
        ;;
+       ld8.fill r3=[r16]       // deferred
+       LOAD_PHYS_STACK_REG_SIZE(r17)
+(pKStk)        br.cond.dpnt skip_rbs_switch
        mov r16=ar.bsp          // get existing backing store pointer
-       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
+#else
+       ld8.fill r3=[r16]
+(pUStk)        st1 [r18]=r17           // restore current->thread.on_ustack
+       shr.u r18=r19,16        // get byte size of existing "dirty" partition
        ;;
-       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
+       mov r16=ar.bsp          // get existing backing store pointer
+       LOAD_PHYS_STACK_REG_SIZE(r17)
 (pKStk)        br.cond.dpnt skip_rbs_switch
+#endif
 
        /*
         * Restore user backing store.
@@ -945,11 +1022,10 @@ GLOBAL_ENTRY(ia64_leave_kernel)
         * NOTE: alloc, loadrs, and cover can't be predicated.
         */
 (pNonSys) br.cond.dpnt dont_preserve_current_frame
-
-rbs_switch:
-       cover                           // add current frame into dirty partition and set cr.ifs
+       COVER                           // add current frame into dirty partition and set cr.ifs
        ;;
        mov r19=ar.bsp                  // get new backing store pointer
+rbs_switch:
        sub r16=r16,r18                 // krbs = old bsp - size of dirty partition
        cmp.ne p9,p0=r0,r0              // clear p9 to skip restore of cr.ifs
        ;;
@@ -1024,14 +1100,14 @@ rse_clear_invalid:
        mov loc5=0
        mov loc6=0
        mov loc7=0
-(pRecurse) br.call.sptk.few b0=rse_clear_invalid
+(pRecurse) br.call.dptk.few b0=rse_clear_invalid
        ;;
        mov loc8=0
        mov loc9=0
        cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
        mov loc10=0
        mov loc11=0
-(pReturn) br.ret.sptk.many b0
+(pReturn) br.ret.dptk.many b0
 #endif /* !CONFIG_ITANIUM */
 #      undef pRecurse
 #      undef pReturn
@@ -1049,16 +1125,16 @@ skip_rbs_switch:
 (pKStk)        dep r29=r22,r29,21,1    // I0 update ipsr.pp with psr.pp
 (pLvSys)mov r16=r0             // A  clear r16 for leave_syscall, no-op otherwise
        ;;
-       mov cr.ipsr=r29         // M2
+       MOV_TO_IPSR(p0, r29, r25)       // M2
        mov ar.pfs=r26          // I0
 (pLvSys)mov r17=r0             // A  clear r17 for leave_syscall, no-op otherwise
 
-(p9)   mov cr.ifs=r30          // M2
+       MOV_TO_IFS(p9, r30, r25)// M2
        mov b0=r21              // I0
 (pLvSys)mov r18=r0             // A  clear r18 for leave_syscall, no-op otherwise
 
        mov ar.fpsr=r20         // M2
-       mov cr.iip=r28          // M2
+       MOV_TO_IIP(r28, r25)    // M2
        nop 0
        ;;
 (pUStk)        mov ar.rnat=r24         // M2 must happen with RSE in lazy mode
@@ -1067,7 +1143,7 @@ skip_rbs_switch:
 
        mov ar.rsc=r27          // M2
        mov pr=r31,-1           // I0
-       rfi                     // B
+       RFI                     // B
 
        /*
         * On entry:
@@ -1075,6 +1151,9 @@ skip_rbs_switch:
         *      r31 = current->thread_info->flags
         * On exit:
         *      p6 = TRUE if work-pending-check needs to be redone
+        *
+        * Interrupts are disabled on entry, may be reenabled depending on the
+        * pending work, and are disabled again on exit.
         */
 .work_pending_syscall:
        add r2=-8,r2
@@ -1083,56 +1162,43 @@ skip_rbs_switch:
        st8 [r2]=r8
        st8 [r3]=r10
 .work_pending:
-       tbit.nz p6,p0=r31,TIF_SIGDELAYED                // signal delayed from  MCA/INIT/NMI/PMI context?
-(p6)   br.cond.sptk.few .sigdelayed
-       ;;
-       tbit.z p6,p0=r31,TIF_NEED_RESCHED               // current_thread_info()->need_resched==0?
+       tbit.z p6,p0=r31,TIF_NEED_RESCHED       // is resched not needed?
 (p6)   br.cond.sptk.few .notify
 #ifdef CONFIG_PREEMPT
 (pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
        ;;
 (pKStk) st4 [r20]=r21
-       ssm psr.i               // enable interrupts
 #endif
+       SSM_PSR_I(p0, p6, r2)   // enable interrupts
        br.call.spnt.many rp=schedule
-.ret9: cmp.eq p6,p0=r0,r0                              // p6 <- 1
-       rsm psr.i               // disable interrupts
+.ret9: cmp.eq p6,p0=r0,r0      // p6 <- 1 (re-check)
+       RSM_PSR_I(p0, r2, r20)  // disable interrupts
        ;;
 #ifdef CONFIG_PREEMPT
 (pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
        ;;
 (pKStk)        st4 [r20]=r0            // preempt_count() <- 0
 #endif
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-       br.cond.sptk.many .work_processed_kernel        // re-check
+(pLvSys)br.cond.sptk.few  __paravirt_pending_syscall_end
+       br.cond.sptk.many .work_processed_kernel
 
 .notify:
 (pUStk)        br.call.spnt.many rp=notify_resume_user
-.ret10:        cmp.ne p6,p0=r0,r0                              // p6 <- 0
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-       br.cond.sptk.many .work_processed_kernel        // don't re-check
-
-// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
-// it could not be delivered.  Deliver it now.  The signal might be for us and
-// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
-// signal.
-
-.sigdelayed:
-       br.call.sptk.many rp=do_sigdelayed
-       cmp.eq p6,p0=r0,r0                              // p6 <- 1, always re-check
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-       br.cond.sptk.many .work_processed_kernel        // re-check
-
-.work_pending_syscall_end:
+.ret10:        cmp.ne p6,p0=r0,r0      // p6 <- 0 (don't re-check)
+(pLvSys)br.cond.sptk.few  __paravirt_pending_syscall_end
+       br.cond.sptk.many .work_processed_kernel
+
+.global __paravirt_pending_syscall_end;
+__paravirt_pending_syscall_end:
        adds r2=PT(R8)+16,r12
        adds r3=PT(R10)+16,r12
        ;;
        ld8 r8=[r2]
        ld8 r10=[r3]
-       br.cond.sptk.many .work_processed_syscall       // re-check
-
-END(ia64_leave_kernel)
+       br.cond.sptk.many __paravirt_work_processed_syscall_target
+END(__paravirt_leave_kernel)
 
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 ENTRY(handle_syscall_error)
        /*
         * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
@@ -1167,11 +1233,14 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail)
 END(ia64_invoke_schedule_tail)
 
        /*
-        * Setup stack and call do_notify_resume_user().  Note that pSys and pNonSys need to
-        * be set up by the caller.  We declare 8 input registers so the system call
-        * args get preserved, in case we need to restart a system call.
+        * Setup stack and call do_notify_resume_user(), keeping interrupts
+        * disabled.
+        *
+        * Note that pSys and pNonSys need to be set up by the caller.
+        * We declare 8 input registers so the system call args get preserved,
+        * in case we need to restart a system call.
         */
-ENTRY(notify_resume_user)
+GLOBAL_ENTRY(notify_resume_user)
        .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
        alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
        mov r9=ar.unat
@@ -1182,7 +1251,7 @@ ENTRY(notify_resume_user)
        ;;
 (pNonSys) mov out2=0                           // out2==0 => not a syscall
        .fframe 16
-       .spillpsp ar.unat, 16                   // (note that offset is relative to psp+0x10!)
+       .spillsp ar.unat, 16
        st8 [sp]=r9,-16                         // allocate space for ar.unat and save it
        st8 [out1]=loc1,-8                      // save ar.pfs, out1=&sigscratch
        .body
@@ -1198,32 +1267,6 @@ ENTRY(notify_resume_user)
        br.ret.sptk.many rp
 END(notify_resume_user)
 
-GLOBAL_ENTRY(sys_rt_sigsuspend)
-       .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-       alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
-       mov r9=ar.unat
-       mov loc0=rp                             // save return address
-       mov out0=in0                            // mask
-       mov out1=in1                            // sigsetsize
-       adds out2=8,sp                          // out2=&sigscratch->ar_pfs
-       ;;
-       .fframe 16
-       .spillpsp ar.unat, 16                   // (note that offset is relative to psp+0x10!)
-       st8 [sp]=r9,-16                         // allocate space for ar.unat and save it
-       st8 [out2]=loc1,-8                      // save ar.pfs, out2=&sigscratch
-       .body
-       br.call.sptk.many rp=ia64_rt_sigsuspend
-.ret17:        .restore sp
-       adds sp=16,sp                           // pop scratch stack space
-       ;;
-       ld8 r9=[sp]                             // load new unat from sw->caller_unat
-       mov rp=loc0
-       ;;
-       mov ar.unat=r9
-       mov ar.pfs=loc1
-       br.ret.sptk.many rp
-END(sys_rt_sigsuspend)
-
 ENTRY(sys_rt_sigreturn)
        PT_REGS_UNWIND_INFO(0)
        /*
@@ -1255,11 +1298,11 @@ ENTRY(sys_rt_sigreturn)
        stf.spill [r17]=f11
        adds out0=16,sp                         // out0 = &sigscratch
        br.call.sptk.many rp=ia64_rt_sigreturn
-.ret19:        .restore sp 0
+.ret19:        .restore sp,0
        adds sp=16,sp
        ;;
        ld8 r9=[sp]                             // load new ar.unat
-       mov.sptk b7=r8,ia64_leave_kernel
+       mov.sptk b7=r8,ia64_native_leave_kernel
        ;;
        mov ar.unat=r9
        br.many b7
@@ -1323,6 +1366,105 @@ GLOBAL_ENTRY(unw_init_running)
        br.ret.sptk.many rp
 END(unw_init_running)
 
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+GLOBAL_ENTRY(_mcount)                 // CONFIG_DYNAMIC_FTRACE variant: no tracing is done here
+       br ftrace_stub                  // go straight to ftrace_stub
+END(_mcount)
+
+.here:                                // address loaded into r3 by the bundle at ftrace_call
+       br.ret.sptk.many b0             // nothing to do: return through b0
+
+GLOBAL_ENTRY(ftrace_caller)           // builds a frame, reloads gp, then calls the function whose address is in r3
+       alloc out0 = ar.pfs, 8, 0, 4, 0 // frame of 8 in, 0 local, 4 out; previous ar.pfs goes to out0
+       mov out3 = r0                   // out3 = r0
+       ;;
+       mov out2 = b0                   // out2 = incoming b0
+       add r3 = 0x20, r3               // r3 += 0x20; NOTE(review): r3 is presumably set by the code that branches here - confirm
+       mov out1 = r1;                  // out1 = incoming r1, saved before gp is overwritten below
+       br.call.sptk.many b0 = ftrace_patch_gp  // call to the next label; b0 becomes the link and is reloaded from r3 below
+       // this may be called from a module, so we must patch gp
+ftrace_patch_gp:
+       movl gp=__gp                    // gp = __gp
+       mov b0 = r3                     // b0 = the adjusted r3 from above
+       ;;
+.global ftrace_call;
+ftrace_call:                          // exported label on the bundle that selects the call target
+{
+       .mlx                            // explicit MLX bundle template
+       nop.m 0x0
+       movl r3 = .here;;               // r3 = address of .here; NOTE(review): presumably rewritten at runtime to install a tracer - confirm
+}
+       alloc loc0 = ar.pfs, 4, 4, 2, 0 // frame of 4 in, 4 local, 2 out; ar.pfs kept in loc0
+       ;;
+       mov loc1 = b0                   // keep b0 across the call below
+       mov out0 = b0                   // out0 starts as b0, adjusted below
+       mov loc2 = r8                   // keep r8 across the call below
+       mov loc3 = r15                  // keep r15 across the call below
+       ;;
+       adds out0 = -MCOUNT_INSN_SIZE, out0     // first argument = b0 - MCOUNT_INSN_SIZE
+       mov out1 = in2                  // second argument = in2
+       mov b6 = r3                     // call target comes from r3
+
+       br.call.sptk.many b0 = b6       // indirect call through b6
+       ;;
+       mov ar.pfs = loc0               // restore ar.pfs
+       mov b0 = loc1                   // restore b0
+       mov r8 = loc2                   // restore r8
+       mov r15 = loc3                  // restore r15
+       br ftrace_stub                  // finish through ftrace_stub
+       ;;
+END(ftrace_caller)
+
+#else
+GLOBAL_ENTRY(_mcount)                 // non-dynamic variant: calls the current tracer unless it resolves to ftrace_stub
+       movl r2 = ftrace_stub           // r2 = address of ftrace_stub, used for the comparison below
+       movl r3 = ftrace_trace_function;;       // r3 = address of ftrace_trace_function
+       ld8 r3 = [r3];;                 // r3 = 8-byte value stored at ftrace_trace_function
+       ld8 r3 = [r3];;                 // r3 = 8-byte value that one points to; NOTE(review): presumably the entry address in a function descriptor - confirm
+       cmp.eq p7,p0 = r2, r3           // p7 set when the loaded address equals ftrace_stub
+(p7)   br.sptk.many ftrace_stub        // in that case skip the call and go to ftrace_stub
+       ;;
+
+       alloc loc0 = ar.pfs, 4, 4, 2, 0 // frame of 4 in, 4 local, 2 out; ar.pfs kept in loc0
+       ;;
+       mov loc1 = b0                   // keep b0 across the call below
+       mov out0 = b0                   // out0 starts as b0, adjusted below
+       mov loc2 = r8                   // keep r8 across the call below
+       mov loc3 = r15                  // keep r15 across the call below
+       ;;
+       adds out0 = -MCOUNT_INSN_SIZE, out0     // first argument = b0 - MCOUNT_INSN_SIZE
+       mov out1 = in2                  // second argument = in2
+       mov b6 = r3                     // call target = address loaded above
+
+       br.call.sptk.many b0 = b6       // indirect call through b6
+       ;;
+       mov ar.pfs = loc0               // restore ar.pfs
+       mov b0 = loc1                   // restore b0
+       mov r8 = loc2                   // restore r8
+       mov r15 = loc3                  // restore r15
+       br ftrace_stub                  // finish through ftrace_stub
+       ;;
+END(_mcount)
+#endif
+
+GLOBAL_ENTRY(ftrace_stub)             // exit path reached from _mcount and ftrace_caller
+       mov r3 = b0                     // r3 = current b0
+       movl r2 = _mcount_ret_helper    // r2 = address of the helper below
+       ;;
+       mov b6 = r2                     // b6 = _mcount_ret_helper
+       mov b7 = r3                     // b7 = the b0 value captured above, used for the final branch
+       br.ret.sptk.many b6             // br.ret to the helper; NOTE(review): presumably used so r40-r42 below name the frame being returned to - confirm
+
+_mcount_ret_helper:
+       mov b0 = r42                    // b0 = r42 (ftrace_caller stores the incoming b0 in out2, which is r42 with 8 in and 0 local)
+       mov r1 = r41                    // r1 = r41 (ftrace_caller stores the incoming r1 in out1)
+       mov ar.pfs = r40                // ar.pfs = r40 (ftrace_caller stores the previous ar.pfs in out0)
+       br b7                           // continue at the address saved in b7
+END(ftrace_stub)
+
+#endif /* CONFIG_FUNCTION_TRACER */
+
        .rodata
        .align 8
        .globl sys_call_table
@@ -1361,7 +1503,7 @@ sys_call_table:
        data8 sys_mkdir                         // 1055
        data8 sys_rmdir
        data8 sys_dup
-       data8 sys_pipe
+       data8 sys_ia64_pipe
        data8 sys_times
        data8 ia64_brk                          // 1060
        data8 sys_setgid
@@ -1571,17 +1713,70 @@ sys_call_table:
        data8 sys_mq_timedreceive               // 1265
        data8 sys_mq_notify
        data8 sys_mq_getsetattr
-       data8 sys_ni_syscall                    // reserved for kexec_load
+       data8 sys_kexec_load
        data8 sys_ni_syscall                    // reserved for vserver
        data8 sys_waitid                        // 1270
        data8 sys_add_key
        data8 sys_request_key
        data8 sys_keyctl
+       data8 sys_ioprio_set
+       data8 sys_ioprio_get                    // 1275
+       data8 sys_move_pages
+       data8 sys_inotify_init
+       data8 sys_inotify_add_watch
+       data8 sys_inotify_rm_watch
+       data8 sys_migrate_pages                 // 1280
+       data8 sys_openat
+       data8 sys_mkdirat
+       data8 sys_mknodat
+       data8 sys_fchownat
+       data8 sys_futimesat                     // 1285
+       data8 sys_newfstatat
+       data8 sys_unlinkat
+       data8 sys_renameat
+       data8 sys_linkat
+       data8 sys_symlinkat                     // 1290
+       data8 sys_readlinkat
+       data8 sys_fchmodat
+       data8 sys_faccessat
+       data8 sys_pselect6
+       data8 sys_ppoll                         // 1295
+       data8 sys_unshare
+       data8 sys_splice
+       data8 sys_set_robust_list
+       data8 sys_get_robust_list
+       data8 sys_sync_file_range               // 1300
+       data8 sys_tee
+       data8 sys_vmsplice
+       data8 sys_fallocate
+       data8 sys_getcpu
+       data8 sys_epoll_pwait                   // 1305
+       data8 sys_utimensat
+       data8 sys_signalfd
        data8 sys_ni_syscall
-       data8 sys_ni_syscall                    // 1275
-       data8 sys_ni_syscall
-       data8 sys_ni_syscall
-       data8 sys_ni_syscall
-       data8 sys_ni_syscall
+       data8 sys_eventfd
+       data8 sys_timerfd_create                // 1310
+       data8 sys_timerfd_settime
+       data8 sys_timerfd_gettime
+       data8 sys_signalfd4
+       data8 sys_eventfd2
+       data8 sys_epoll_create1                 // 1315
+       data8 sys_dup3
+       data8 sys_pipe2
+       data8 sys_inotify_init1
+       data8 sys_preadv
+       data8 sys_pwritev                       // 1320
+       data8 sys_rt_tgsigqueueinfo
+       data8 sys_recvmmsg
+       data8 sys_fanotify_init
+       data8 sys_fanotify_mark
+       data8 sys_prlimit64                     // 1325
+       data8 sys_name_to_handle_at
+       data8 sys_open_by_handle_at
+       data8 sys_clock_adjtime
+       data8 sys_syncfs
+       data8 sys_setns                         // 1330
+       data8 sys_sendmmsg
 
        .org sys_call_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */