ARM: move signal handlers into a vdso-like page

[linux-3.10.git] / arch / arm / kernel / entry-armv.S
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S

index 85298c093256180917de4eac5b6188803802ad6f..d43c7e54ec6cafd568eb48c337d96c5bc4afa714 100644 (file)
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -15,15 +15,19 @@
   *  that causes it to save wrong values...  Be aware!
   */
  
+#include <asm/assembler.h>
  #include <asm/memory.h>
  #include <asm/glue-df.h>
  #include <asm/glue-pf.h>
  #include <asm/vfpmacros.h>
+#ifndef CONFIG_MULTI_IRQ_HANDLER
  #include <mach/entry-macro.S>
+#endif
  #include <asm/thread_notify.h>
  #include <asm/unwind.h>
  #include <asm/unistd.h>
  #include <asm/tls.h>
+#include <asm/system_info.h>
  
  #include "entry-header.S"
  #include <asm/entry-macro-multi.S>
@@ -35,12 +39,11 @@
  #ifdef CONFIG_MULTI_IRQ_HANDLER
         ldr     r1, =handle_arch_irq
         mov     r0, sp
-       ldr     r1, [r1]
         adr     lr, BSYM(9997f)
-       teq     r1, #0
-       movne   pc, r1
-#endif
+       ldr     pc, [r1]
+#else
         arch_irq_handler_default
+#endif
  9997:
         .endm
  
@@ -60,6 +63,7 @@
         @
         @ Call the processor-specific abort handler:
         @
+       @  r2 - pt_regs
         @  r4 - aborted context pc
         @  r5 - aborted context psr
         @
@@ -186,29 +190,8 @@ ENDPROC(__und_invalid)
         .align  5
  __dabt_svc:
         svc_entry
-       dabt_helper
-
-       @
-       @ call main handler
-       @
         mov     r2, sp
-       bl      do_DataAbort
-
-       @
-       @ IRQs off again before pulling preserved data off the stack
-       @
-       disable_irq_notrace
-
-       @
-       @ restore SPSR and restart the instruction
-       @
-       ldr     r5, [sp, #S_PSR]
-#ifdef CONFIG_TRACE_IRQFLAGS
-       tst     r5, #PSR_I_BIT
-       bleq    trace_hardirqs_on
-       tst     r5, #PSR_I_BIT
-       blne    trace_hardirqs_off
-#endif
+       dabt_helper
         svc_exit r5                             @ return from exception
   UNWIND(.fnend         )
  ENDPROC(__dabt_svc)
@@ -227,13 +210,8 @@ __irq_svc:
         tst     r0, #_TIF_NEED_RESCHED
         blne    svc_preempt
  #endif
-       ldr     r5, [sp, #S_PSR]
-#ifdef CONFIG_TRACE_IRQFLAGS
-       @ The parent context IRQs must have been enabled to get here in
-       @ the first place, so there's no point checking the PSR I bit.
-       bl      trace_hardirqs_on
-#endif
-       svc_exit r5                             @ return from exception
+
+       svc_exit r5, irq = 1                    @ return from exception
   UNWIND(.fnend         )
  ENDPROC(__irq_svc)
  
@@ -249,6 +227,19 @@ svc_preempt:
         b       1b
  #endif
  
+__und_fault:
+       @ Correct the PC such that it is pointing at the instruction
+       @ which caused the fault.  If the faulting instruction was ARM
+       @ the PC will be pointing at the next instruction, and we have to
+       @ subtract 4.  Otherwise, it is Thumb, and the PC will be
+       @ pointing at the second half of the Thumb instruction.  We
+       @ have to subtract 2.
+       ldr     r2, [r0, #S_PC]
+       sub     r2, r2, r1
+       str     r2, [r0, #S_PC]
+       b       do_undefinstr
+ENDPROC(__und_fault)
+
         .align  5
  __und_svc:
  #ifdef CONFIG_KPROBES
@@ -266,37 +257,29 @@ __und_svc:
         @
         @  r0 - instruction
         @
-#ifndef        CONFIG_THUMB2_KERNEL
+#ifndef CONFIG_THUMB2_KERNEL
         ldr     r0, [r4, #-4]
  #else
+       mov     r1, #2
         ldrh    r0, [r4, #-2]                   @ Thumb instruction at LR - 2
-       and     r9, r0, #0xf800
-       cmp     r9, #0xe800                     @ 32-bit instruction if xx >= 0
-       ldrhhs  r9, [r4]                        @ bottom 16 bits
-       orrhs   r0, r9, r0, lsl #16
+       cmp     r0, #0xe800                     @ 32-bit if halfword >= 0xe800 (0b111xx, xx != 0)
+       blo     __und_svc_fault
+       ldrh    r9, [r4]                        @ bottom 16 bits
+       add     r4, r4, #2
+       str     r4, [sp, #S_PC]
+       orr     r0, r9, r0, lsl #16
  #endif
-       adr     r9, BSYM(1f)
+       adr     r9, BSYM(__und_svc_finish)
         mov     r2, r4
         bl      call_fpe
  
+       mov     r1, #4                          @ PC correction to apply
+__und_svc_fault:
         mov     r0, sp                          @ struct pt_regs *regs
-       bl      do_undefinstr
-
-       @
-       @ IRQs off again before pulling preserved data off the stack
-       @
-1:     disable_irq_notrace
+       bl      __und_fault
  
-       @
-       @ restore SPSR and restart the instruction
-       @
+__und_svc_finish:
         ldr     r5, [sp, #S_PSR]                @ Get SVC cpsr
-#ifdef CONFIG_TRACE_IRQFLAGS
-       tst     r5, #PSR_I_BIT
-       bleq    trace_hardirqs_on
-       tst     r5, #PSR_I_BIT
-       blne    trace_hardirqs_off
-#endif
         svc_exit r5                             @ return from exception
   UNWIND(.fnend         )
  ENDPROC(__und_svc)
@@ -306,22 +289,6 @@ __pabt_svc:
         svc_entry
         mov     r2, sp                          @ regs
         pabt_helper
-
-       @
-       @ IRQs off again before pulling preserved data off the stack
-       @
-       disable_irq_notrace
-
-       @
-       @ restore SPSR and restart the instruction
-       @
-       ldr     r5, [sp, #S_PSR]
-#ifdef CONFIG_TRACE_IRQFLAGS
-       tst     r5, #PSR_I_BIT
-       bleq    trace_hardirqs_on
-       tst     r5, #PSR_I_BIT
-       blne    trace_hardirqs_off
-#endif
         svc_exit r5                             @ return from exception
   UNWIND(.fnend         )
  ENDPROC(__pabt_svc)
@@ -386,10 +353,11 @@ ENDPROC(__pabt_svc)
  #ifdef CONFIG_IRQSOFF_TRACER
         bl      trace_hardirqs_off
  #endif
+       ct_user_exit save = 0
         .endm
  
         .macro  kuser_cmpxchg_check
-#if __LINUX_ARM_ARCH__ < 6 && !defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG)
+#if !defined(CONFIG_CPU_32v6K) && !defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG)
  #ifndef CONFIG_MMU
  #warning "NPTL on non MMU needs fixing"
  #else
@@ -398,7 +366,7 @@ ENDPROC(__pabt_svc)
         @ perform a quick test inline since it should be false
         @ 99.9999% of the time.  The rest is done out of line.
         cmp     r4, #TASK_SIZE
-       blhs    kuser_cmpxchg_fixup
+       blhs    kuser_cmpxchg64_fixup
  #endif
  #endif
         .endm
@@ -407,11 +375,9 @@ ENDPROC(__pabt_svc)
  __dabt_usr:
         usr_entry
         kuser_cmpxchg_check
-       dabt_helper
-
         mov     r2, sp
-       adr     lr, BSYM(ret_from_exception)
-       b       do_DataAbort
+       dabt_helper
+       b       ret_from_exception
   UNWIND(.fnend         )
  ENDPROC(__dabt_usr)
  
@@ -435,55 +401,91 @@ __und_usr:
         mov     r2, r4
         mov     r3, r5
  
+       @ r2 = regs->ARM_pc, which is either 2 or 4 bytes ahead of the
+       @      faulting instruction depending on Thumb mode.
+       @ r3 = regs->ARM_cpsr
         @
-       @ fall through to the emulation code, which returns using r9 if
-       @ it has emulated the instruction, or the more conventional lr
-       @ if we are to treat this as a real undefined instruction
-       @
-       @  r0 - instruction
+       @ The emulation code returns using r9 if it has emulated the
+       @ instruction, or the more conventional lr if we are to treat
+       @ this as a real undefined instruction
         @
         adr     r9, BSYM(ret_from_exception)
-       adr     lr, BSYM(__und_usr_unknown)
+
         tst     r3, #PSR_T_BIT                  @ Thumb mode?
-       itet    eq                              @ explicit IT needed for the 1f label
-       subeq   r4, r2, #4                      @ ARM instr at LR - 4
-       subne   r4, r2, #2                      @ Thumb instr at LR - 2
-1:     ldreqt  r0, [r4]
+       bne     __und_usr_thumb
+       sub     r4, r2, #4                      @ ARM instr at LR - 4
+1:     ldrt    r0, [r4]
  #ifdef CONFIG_CPU_ENDIAN_BE8
-       reveq   r0, r0                          @ little endian instruction
+       rev     r0, r0                          @ little endian instruction
  #endif
-       beq     call_fpe
+       @ r0 = 32-bit ARM instruction which caused the exception
+       @ r2 = PC value for the following instruction (:= regs->ARM_pc)
+       @ r4 = PC value for the faulting instruction
+       @ lr = 32-bit undefined instruction function
+       adr     lr, BSYM(__und_usr_fault_32)
+       b       call_fpe
+
+__und_usr_thumb:
         @ Thumb instruction
-#if __LINUX_ARM_ARCH__ >= 7
-2:
- ARM(  ldrht   r5, [r4], #2    )
- THUMB(        ldrht   r5, [r4]        )
- THUMB(        add     r4, r4, #2      )
-       and     r0, r5, #0xf800                 @ mask bits 111x x... .... ....
-       cmp     r0, #0xe800                     @ 32bit instruction if xx != 0
-       blo     __und_usr_unknown
-3:     ldrht   r0, [r4]
+       sub     r4, r2, #2                      @ First half of thumb instr at LR - 2
+#if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7
+/*
+ * Thumb-2 instruction handling.  Note that because pre-v6 and >= v6 platforms
+ * can never be supported in a single kernel, this code is not applicable at
+ * all when __LINUX_ARM_ARCH__ < 6.  This allows simplifying assumptions to be
+ * made about .arch directives.
+ */
+#if __LINUX_ARM_ARCH__ < 7
+/* If the target CPU may not be Thumb-2-capable, a run-time check is needed: */
+#define NEED_CPU_ARCHITECTURE
+       ldr     r5, .LCcpu_architecture
+       ldr     r5, [r5]
+       cmp     r5, #CPU_ARCH_ARMv7
+       blo     __und_usr_fault_16              @ 16bit undefined instruction
+/*
+ * The following code won't get run unless the running CPU really is v7, so
+ * coding round the lack of ldrht on older arches is pointless.  Temporarily
+ * override the assembler target arch with the minimum required instead:
+ */
+       .arch   armv6t2
+#endif
+2:     ldrht   r5, [r4]
+       cmp     r5, #0xe800                     @ 32bit if halfword >= 0xe800 (0b111xx, xx != 0)
+       blo     __und_usr_fault_16              @ 16bit undefined instruction
+3:     ldrht   r0, [r2]
         add     r2, r2, #2                      @ r2 is PC + 2, make it PC + 4
+       str     r2, [sp, #S_PC]                 @ it's a 2x16bit instr, update
         orr     r0, r0, r5, lsl #16
+       adr     lr, BSYM(__und_usr_fault_32)
+       @ r0 = the two 16-bit Thumb instructions which caused the exception
+       @ r2 = PC value for the following Thumb instruction (:= regs->ARM_pc)
+       @ r4 = PC value for the first 16-bit Thumb instruction
+       @ lr = 32bit undefined instruction function
+
+#if __LINUX_ARM_ARCH__ < 7
+/* If the target arch was overridden, change it back: */
+#ifdef CONFIG_CPU_32v6K
+       .arch   armv6k
  #else
-       b       __und_usr_unknown
+       .arch   armv6
  #endif
- UNWIND(.fnend         )
+#endif /* __LINUX_ARM_ARCH__ < 7 */
+#else /* !(CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7) */
+       b       __und_usr_fault_16
+#endif
+ UNWIND(.fnend)
  ENDPROC(__und_usr)
  
-       @
-       @ fallthrough to call_fpe
-       @
-
  /*
- * The out of line fixup for the ldrt above.
+ * The out of line fixup for the ldrt instructions above.
   */
         .pushsection .fixup, "ax"
+       .align  2
  4:     mov     pc, r9
         .popsection
         .pushsection __ex_table,"a"
         .long   1b, 4b
-#if __LINUX_ARM_ARCH__ >= 7
+#if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7
         .long   2b, 4b
         .long   3b, 4b
  #endif
@@ -507,31 +509,32 @@ ENDPROC(__und_usr)
   * NEON handler code.
   *
   * Emulators may wish to make use of the following registers:
- *  r0  = instruction opcode.
- *  r2  = PC+4
+ *  r0  = instruction opcode (32-bit ARM or two 16-bit Thumb)
+ *  r2  = PC value to resume execution after successful emulation
   *  r9  = normal "successful" return address
- *  r10 = this threads thread_info structure.
+ *  r10 = this thread's thread_info structure
   *  lr  = unrecognised instruction return address
+ * IRQs disabled, FIQs enabled.
   */
         @
         @ Fall-through from Thumb-2 __und_usr
         @
  #ifdef CONFIG_NEON
+       get_thread_info r10                     @ get current thread
         adr     r6, .LCneon_thumb_opcodes
         b       2f
  #endif
  call_fpe:
+       get_thread_info r10                     @ get current thread
  #ifdef CONFIG_NEON
         adr     r6, .LCneon_arm_opcodes
-2:
-       ldr     r7, [r6], #4                    @ mask value
-       cmp     r7, #0                          @ end mask?
-       beq     1f
-       and     r8, r0, r7
+2:     ldr     r5, [r6], #4                    @ mask value
         ldr     r7, [r6], #4                    @ opcode bits matching in mask
+       cmp     r5, #0                          @ end mask?
+       beq     1f
+       and     r8, r0, r5
         cmp     r8, r7                          @ NEON instruction?
         bne     2b
-       get_thread_info r10
         mov     r7, #1
         strb    r7, [r10, #TI_USED_CP + 10]     @ mark CP#10 as used
         strb    r7, [r10, #TI_USED_CP + 11]     @ mark CP#11 as used
@@ -540,12 +543,7 @@ call_fpe:
  #endif
         tst     r0, #0x08000000                 @ only CDP/CPRT/LDC/STC have bit 27
         tstne   r0, #0x04000000                 @ bit 26 set on both ARM and Thumb-2
-#if defined(CONFIG_CPU_ARM610) || defined(CONFIG_CPU_ARM710)
-       and     r8, r0, #0x0f000000             @ mask out op-code bits
-       teqne   r8, #0x0f000000                 @ SWI (ARM6/7 bug)?
-#endif
         moveq   pc, lr
-       get_thread_info r10                     @ get current thread
         and     r8, r0, #0x00000f00             @ mask out CP number
   THUMB(        lsr     r8, r8, #8              )
         mov     r7, #1
@@ -592,6 +590,12 @@ call_fpe:
         movw_pc lr                              @ CP#14 (Debug)
         movw_pc lr                              @ CP#15 (Control)
  
+#ifdef NEED_CPU_ARCHITECTURE
+       .align  2
+.LCcpu_architecture:
+       .word   __cpu_architecture
+#endif
+
  #ifdef CONFIG_NEON
         .align  6
  
@@ -640,12 +644,17 @@ ENTRY(no_fp)
         mov     pc, lr
  ENDPROC(no_fp)
  
-__und_usr_unknown:
-       enable_irq
+__und_usr_fault_32:
+       mov     r1, #4
+       b       1f
+__und_usr_fault_16:
+       mov     r1, #2
+1:     enable_irq
         mov     r0, sp
         adr     lr, BSYM(ret_from_exception)
-       b       do_undefinstr
-ENDPROC(__und_usr_unknown)
+       b       __und_fault
+ENDPROC(__und_usr_fault_32)
+ENDPROC(__und_usr_fault_16)
  
         .align  5
  __pabt_usr:
@@ -715,31 +724,12 @@ ENDPROC(__switch_to)
  /*
   * User helpers.
   *
- * These are segment of kernel provided user code reachable from user space
- * at a fixed address in kernel memory.  This is used to provide user space
- * with some operations which require kernel help because of unimplemented
- * native feature and/or instructions in many ARM CPUs. The idea is for
- * this code to be executed directly in user mode for best efficiency but
- * which is too intimate with the kernel counter part to be left to user
- * libraries.  In fact this code might even differ from one CPU to another
- * depending on the available  instruction set and restrictions like on
- * SMP systems.  In other words, the kernel reserves the right to change
- * this code as needed without warning. Only the entry points and their
- * results are guaranteed to be stable.
- *
   * Each segment is 32-byte aligned and will be moved to the top of the high
   * vector page.  New segments (if ever needed) must be added in front of
   * existing ones.  This mechanism should be used only for things that are
   * really small and justified, and not be abused freely.
   *
- * User space is expected to implement those things inline when optimizing
- * for a processor that has the necessary native support, but only if such
- * resulting binaries are already to be incompatible with earlier ARM
- * processors due to the use of unsupported instructions other than what
- * is provided here.  In other words don't make binaries unable to run on
- * earlier processors just for the sake of not using these kernel helpers
- * if your compiled code is not going to use the new instructions for other
- * purpose.
+ * See Documentation/arm/kernel_user_helpers.txt for formal definitions.
   */
   THUMB(        .arm    )
  
@@ -751,101 +741,115 @@ ENDPROC(__switch_to)
  #endif
         .endm
  
+       .macro  kuser_pad, sym, size
+       .if     (. - \sym) & 3
+       .rept   4 - (. - \sym) & 3
+       .byte   0
+       .endr
+       .endif
+       .rept   (\size - (. - \sym)) / 4
+       .word   0xe7fddef1
+       .endr
+       .endm
+
+#ifdef CONFIG_KUSER_HELPERS
         .align  5
         .globl  __kuser_helper_start
  __kuser_helper_start:
  
  /*
- * Reference prototype:
- *
- *     void __kernel_memory_barrier(void)
- *
- * Input:
- *
- *     lr = return address
- *
- * Output:
- *
- *     none
- *
- * Clobbered:
- *
- *     none
- *
- * Definition and user space usage example:
- *
- *     typedef void (__kernel_dmb_t)(void);
- *     #define __kernel_dmb (*(__kernel_dmb_t *)0xffff0fa0)
- *
- * Apply any needed memory barrier to preserve consistency with data modified
- * manually and __kuser_cmpxchg usage.
- *
- * This could be used as follows:
- *
- * #define __kernel_dmb() \
- *         asm volatile ( "mov r0, #0xffff0fff; mov lr, pc; sub pc, r0, #95" \
- *             : : : "r0", "lr","cc" )
+ * Due to the length of some sequences, __kuser_cmpxchg64 spans 2 regular
+ * kuser "slots", therefore 0xffff0f80 is not used as a valid entry point.
   */
  
-__kuser_memory_barrier:                                @ 0xffff0fa0
+__kuser_cmpxchg64:                             @ 0xffff0f60
+
+#if defined(CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG)
+
+       /*
+        * Poor you.  No fast solution possible...
+        * The kernel itself must perform the operation.
+        * A special ghost syscall is used for that (see traps.c).
+        */
+       stmfd   sp!, {r7, lr}
+       ldr     r7, 1f                  @ it's 20 bits
+       swi     __ARM_NR_cmpxchg64
+       ldmfd   sp!, {r7, pc}
+1:     .word   __ARM_NR_cmpxchg64
+
+#elif defined(CONFIG_CPU_32v6K)
+
+       stmfd   sp!, {r4, r5, r6, r7}
+       ldrd    r4, r5, [r0]                    @ load old val
+       ldrd    r6, r7, [r1]                    @ load new val
+       smp_dmb arm
+1:     ldrexd  r0, r1, [r2]                    @ load current val
+       eors    r3, r0, r4                      @ compare with oldval (1)
+       eoreqs  r3, r1, r5                      @ compare with oldval (2)
+       strexdeq r3, r6, r7, [r2]               @ store newval if eq
+       teqeq   r3, #1                          @ success?
+       beq     1b                              @ if no then retry
         smp_dmb arm
+       rsbs    r0, r3, #0                      @ set returned val and C flag
+       ldmfd   sp!, {r4, r5, r6, r7}
         usr_ret lr
  
-       .align  5
+#elif !defined(CONFIG_SMP)
  
-/*
- * Reference prototype:
- *
- *     int __kernel_cmpxchg(int oldval, int newval, int *ptr)
- *
- * Input:
- *
- *     r0 = oldval
- *     r1 = newval
- *     r2 = ptr
- *     lr = return address
- *
- * Output:
- *
- *     r0 = returned value (zero or non-zero)
- *     C flag = set if r0 == 0, clear if r0 != 0
- *
- * Clobbered:
- *
- *     r3, ip, flags
- *
- * Definition and user space usage example:
- *
- *     typedef int (__kernel_cmpxchg_t)(int oldval, int newval, int *ptr);
- *     #define __kernel_cmpxchg (*(__kernel_cmpxchg_t *)0xffff0fc0)
- *
- * Atomically store newval in *ptr if *ptr is equal to oldval for user space.
- * Return zero if *ptr was changed or non-zero if no exchange happened.
- * The C flag is also set if *ptr was changed to allow for assembly
- * optimization in the calling code.
- *
- * Notes:
- *
- *    - This routine already includes memory barriers as needed.
- *
- * For example, a user space atomic_add implementation could look like this:
- *
- * #define atomic_add(ptr, val) \
- *     ({ register unsigned int *__ptr asm("r2") = (ptr); \
- *        register unsigned int __result asm("r1"); \
- *        asm volatile ( \
- *            "1: @ atomic_add\n\t" \
- *            "ldr     r0, [r2]\n\t" \
- *            "mov     r3, #0xffff0fff\n\t" \
- *            "add     lr, pc, #4\n\t" \
- *            "add     r1, r0, %2\n\t" \
- *            "add     pc, r3, #(0xffff0fc0 - 0xffff0fff)\n\t" \
- *            "bcc     1b" \
- *            : "=&r" (__result) \
- *            : "r" (__ptr), "rIL" (val) \
- *            : "r0","r3","ip","lr","cc","memory" ); \
- *        __result; })
- */
+#ifdef CONFIG_MMU
+
+       /*
+        * The only thing that can break atomicity in this cmpxchg64
+        * implementation is either an IRQ or a data abort exception
+        * causing another process/thread to be scheduled in the middle of
+        * the critical sequence.  The same strategy as for cmpxchg is used.
+        */
+       stmfd   sp!, {r4, r5, r6, lr}
+       ldmia   r0, {r4, r5}                    @ load old val
+       ldmia   r1, {r6, lr}                    @ load new val
+1:     ldmia   r2, {r0, r1}                    @ load current val
+       eors    r3, r0, r4                      @ compare with oldval (1)
+       eoreqs  r3, r1, r5                      @ compare with oldval (2)
+2:     stmeqia r2, {r6, lr}                    @ store newval if eq
+       rsbs    r0, r3, #0                      @ set return val and C flag
+       ldmfd   sp!, {r4, r5, r6, pc}
+
+       .text
+kuser_cmpxchg64_fixup:
+       @ Called from kuser_cmpxchg_check macro.
+       @ r4 = address of interrupted insn (must be preserved).
+       @ sp = saved regs. r7 and r8 are clobbered.
+       @ 1b = first critical insn, 2b = last critical insn.
+       @ If r4 >= 1b and r4 <= 2b then saved pc_usr is set to 1b.
+       mov     r7, #0xffff0fff
+       sub     r7, r7, #(0xffff0fff - (0xffff0f60 + (1b - __kuser_cmpxchg64)))
+       subs    r8, r4, r7
+       rsbcss  r8, r8, #(2b - 1b)
+       strcs   r7, [sp, #S_PC]
+#if __LINUX_ARM_ARCH__ < 6
+       bcc     kuser_cmpxchg32_fixup
+#endif
+       mov     pc, lr
+       .previous
+
+#else
+#warning "NPTL on non MMU needs fixing"
+       mov     r0, #-1
+       adds    r0, r0, #0
+       usr_ret lr
+#endif
+
+#else
+#error "incoherent kernel configuration"
+#endif
+
+       kuser_pad __kuser_cmpxchg64, 64
+
+__kuser_memory_barrier:                                @ 0xffff0fa0
+       smp_dmb arm
+       usr_ret lr
+
+       kuser_pad __kuser_memory_barrier, 32
  
  __kuser_cmpxchg:                               @ 0xffff0fc0
  
@@ -882,7 +886,7 @@ __kuser_cmpxchg:                            @ 0xffff0fc0
         usr_ret lr
  
         .text
-kuser_cmpxchg_fixup:
+kuser_cmpxchg32_fixup:
         @ Called from kuser_cmpxchg_check macro.
         @ r4 = address of interrupted insn (must be preserved).
         @ sp = saved regs. r7 and r8 are clobbered.
@@ -918,76 +922,33 @@ kuser_cmpxchg_fixup:
  
  #endif
  
-       .align  5
-
-/*
- * Reference prototype:
- *
- *     int __kernel_get_tls(void)
- *
- * Input:
- *
- *     lr = return address
- *
- * Output:
- *
- *     r0 = TLS value
- *
- * Clobbered:
- *
- *     none
- *
- * Definition and user space usage example:
- *
- *     typedef int (__kernel_get_tls_t)(void);
- *     #define __kernel_get_tls (*(__kernel_get_tls_t *)0xffff0fe0)
- *
- * Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
- *
- * This could be used as follows:
- *
- * #define __kernel_get_tls() \
- *     ({ register unsigned int __val asm("r0"); \
- *         asm( "mov r0, #0xffff0fff; mov lr, pc; sub pc, r0, #31" \
- *             : "=r" (__val) : : "lr","cc" ); \
- *        __val; })
- */
+       kuser_pad __kuser_cmpxchg, 32
  
  __kuser_get_tls:                               @ 0xffff0fe0
         ldr     r0, [pc, #(16 - 8)]     @ read TLS, set in kuser_get_tls_init
         usr_ret lr
         mrc     p15, 0, r0, c13, c0, 3  @ 0xffff0fe8 hardware TLS code
-       .rep    4
+       kuser_pad __kuser_get_tls, 16
+       .rep    3
         .word   0                       @ 0xffff0ff0 software TLS value, then
         .endr                           @ pad up to __kuser_helper_version
  
-/*
- * Reference declaration:
- *
- *     extern unsigned int __kernel_helper_version;
- *
- * Definition and user space usage example:
- *
- *     #define __kernel_helper_version (*(unsigned int *)0xffff0ffc)
- *
- * User space may read this to determine the curent number of helpers
- * available.
- */
-
  __kuser_helper_version:                                @ 0xffff0ffc
         .word   ((__kuser_helper_end - __kuser_helper_start) >> 5)
  
         .globl  __kuser_helper_end
  __kuser_helper_end:
  
+#endif
+
   THUMB(        .thumb  )
  
  /*
   * Vector stubs.
   *
- * This code is copied to 0xffff0200 so we can use branches in the
- * vectors, rather than ldr's.  Note that this code must not
- * exceed 0x300 bytes.
+ * This code is copied to 0xffff1000 so we can use branches in the
+ * vectors, rather than ldr's.  Note that this code must not exceed
+ * a page size.
   *
   * Common stub entry macro:
   *   Enter in IRQ mode, spsr = SVC/USR CPSR, lr = SVC/USR PC
@@ -1034,8 +995,17 @@ ENDPROC(vector_\name)
  1:
         .endm
  
-       .globl  __stubs_start
+       .section .stubs, "ax", %progbits
  __stubs_start:
+       @ This must be the first word
+       .word   vector_swi
+
+vector_rst:
+ ARM(  swi     SYS_ERROR0      )
+ THUMB(        svc     #0              )
+ THUMB(        nop                     )
+       b       vector_und
+
  /*
   * Interrupt dispatcher
   */
@@ -1129,6 +1099,16 @@ __stubs_start:
  
         .align  5
  
+/*=============================================================================
+ * Address exception handler
+ *-----------------------------------------------------------------------------
+ * These aren't too critical.
+ * (they're not supposed to happen, and won't happen in 32-bit data mode).
+ */
+
+vector_addrexcptn:
+       b       vector_addrexcptn
+
  /*=============================================================================
   * Undefined FIQs
   *-----------------------------------------------------------------------------
@@ -1140,48 +1120,21 @@ __stubs_start:
   * get out of that mode without clobbering one register.
   */
  vector_fiq:
-       disable_fiq
         subs    pc, lr, #4
  
-/*=============================================================================
- * Address exception handler
- *-----------------------------------------------------------------------------
- * These aren't too critical.
- * (they're not supposed to happen, and won't happen in 32-bit data mode).
- */
-
-vector_addrexcptn:
-       b       vector_addrexcptn
+       .globl  vector_fiq_offset
+       .equ    vector_fiq_offset, vector_fiq
  
-/*
- * We group all the following data together to optimise
- * for CPUs with separate I & D caches.
- */
-       .align  5
-
-.LCvswi:
-       .word   vector_swi
-
-       .globl  __stubs_end
-__stubs_end:
-
-       .equ    stubs_offset, __vectors_start + 0x200 - __stubs_start
-
-       .globl  __vectors_start
+       .section .vectors, "ax", %progbits
  __vectors_start:
- ARM(  swi     SYS_ERROR0      )
- THUMB(        svc     #0              )
- THUMB(        nop                     )
-       W(b)    vector_und + stubs_offset
-       W(ldr)  pc, .LCvswi + stubs_offset
-       W(b)    vector_pabt + stubs_offset
-       W(b)    vector_dabt + stubs_offset
-       W(b)    vector_addrexcptn + stubs_offset
-       W(b)    vector_irq + stubs_offset
-       W(b)    vector_fiq + stubs_offset
-
-       .globl  __vectors_end
-__vectors_end:
+       W(b)    vector_rst
+       W(b)    vector_und
+       W(ldr)  pc, __vectors_start + 0x1000
+       W(b)    vector_pabt
+       W(b)    vector_dabt
+       W(b)    vector_addrexcptn
+       W(b)    vector_irq
+       W(b)    vector_fiq
  
         .data