sh: TLB miss fast-path optimizations.
Stuart Menefy [Fri, 24 Nov 2006 02:42:24 +0000 (11:42 +0900)]
Handle simple TLB miss faults which can be resolved completely
from the page table in assembler.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

arch/sh/Kconfig
arch/sh/kernel/cpu/sh3/entry.S
arch/sh/kernel/cpu/sh4/probe.c
arch/sh/mm/Kconfig
arch/sh/mm/fault.c
include/asm-sh/pgtable.h

index a03f155..48308dc 100644 (file)
@@ -379,6 +379,9 @@ config CPU_HAS_SR_RB
          See <file:Documentation/sh/register-banks.txt> for further
          information on SR.RB and register banking in the kernel in general.
 
+config CPU_HAS_PTEA
+       bool
+
 endmenu
 
 menu "Timer support"
index 869d56f..5de99b4 100644 (file)
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpu/mmu_context.h>
 #include <asm/unistd.h>
+#include <asm/cpu/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
 
 ! NOTE:
 ! GNU as (as of 2.9.1) changes bf/s into bt/s and bra, when the address
@@ -136,29 +138,14 @@ ENTRY(tlb_protection_violation_store)
 
 call_dpf:
        mov.l   1f, r0
-       mov     r5, r8
-       mov.l   @r0, r6
-       mov     r6, r9
-       mov.l   2f, r0
-       sts     pr, r10
-       jsr     @r0
-        mov    r15, r4
-       !
-       tst     r0, r0
-       bf/s    0f
-        lds    r10, pr
-       rts
-        nop
-0:     sti
+       mov.l   @r0, r6         ! address
        mov.l   3f, r0
-       mov     r9, r6
-       mov     r8, r5
+       sti
        jmp     @r0
-        mov    r15, r4
+        mov    r15, r4         ! regs
 
        .align 2
 1:     .long   MMU_TEA
-2:     .long   __do_page_fault
 3:     .long   do_page_fault
 
        .align  2
@@ -344,9 +331,176 @@ general_exception:
 2:     .long   ret_from_exception
 !
 !
+
+/* This code makes some assumptions to improve performance.
+ * Make sure they are still true. */
+#if PTRS_PER_PGD != PTRS_PER_PTE
+#error PDG and PTE sizes don't match
+#endif
+
+/* gas doesn't flag impossible values for mov #immediate as an error */
+#if (_PAGE_PRESENT >> 2) > 0x7f
+#error cannot load PAGE_PRESENT as an immediate
+#endif
+#if _PAGE_DIRTY > 0x7f
+#error cannot load PAGE_DIRTY as an immediate
+#endif
+#if (_PAGE_PRESENT << 2) != _PAGE_ACCESSED
+#error cannot derive PAGE_ACCESSED from PAGE_PRESENT
+#endif
+
+#if defined(CONFIG_CPU_SH4)
+#define ldmmupteh(r)   mov.l   8f, r
+#else
+#define ldmmupteh(r)   mov     #MMU_PTEH, r
+#endif
+
        .balign         1024,0,1024
 tlb_miss:
-       mov.l   1f, k2
+#ifdef COUNT_EXCEPTIONS
+       ! Increment the counts
+       mov.l   9f, k1
+       mov.l   @k1, k2
+       add     #1, k2
+       mov.l   k2, @k1
+#endif
+
+       ! k0 scratch
+       ! k1 pgd and pte pointers
+       ! k2 faulting address
+       ! k3 pgd and pte index masks
+       ! k4 shift
+
+       ! Load up the pgd entry (k1)
+
+       ldmmupteh(k0)                   !  9 LS (latency=2)     MMU_PTEH
+
+       mov.w   4f, k3                  !  8 LS (latency=2)     (PTRS_PER_PGD-1) << 2
+       mov     #-(PGDIR_SHIFT-2), k4   !  6 EX
+
+       mov.l   @(MMU_TEA-MMU_PTEH,k0), k2      ! 18 LS (latency=2)
+
+       mov.l   @(MMU_TTB-MMU_PTEH,k0), k1      ! 18 LS (latency=2)
+
+       mov     k2, k0                  !   5 MT (latency=0)
+       shld    k4, k0                  !  99 EX
+
+       and     k3, k0                  !  78 EX
+
+       mov.l   @(k0, k1), k1           !  21 LS (latency=2)
+       mov     #-(PAGE_SHIFT-2), k4    !   6 EX
+
+       ! Load up the pte entry (k2)
+
+       mov     k2, k0                  !   5 MT (latency=0)
+       shld    k4, k0                  !  99 EX
+
+       tst     k1, k1                  !  86 MT
+
+       bt      20f                     ! 110 BR
+
+       and     k3, k0                  !  78 EX
+       mov.w   5f, k4                  !   8 LS (latency=2)    _PAGE_PRESENT
+
+       mov.l   @(k0, k1), k2           !  21 LS (latency=2)
+       add     k0, k1                  !  49 EX
+
+#ifdef CONFIG_CPU_HAS_PTEA
+       ! Test the entry for present and _PAGE_ACCESSED
+
+       mov     #-28, k3                !   6 EX
+       mov     k2, k0                  !   5 MT (latency=0)
+
+       tst     k4, k2                  !  68 MT
+       shld    k3, k0                  !  99 EX
+
+       bt      20f                     ! 110 BR
+
+       ! Set PTEA register
+       ! MMU_PTEA = ((pteval >> 28) & 0xe) | (pteval & 0x1)
+       !
+       ! k0=pte>>28, k1=pte*, k2=pte, k3=<unused>, k4=_PAGE_PRESENT
+
+       and     #0xe, k0                !  79 EX
+
+       mov     k0, k3                  !   5 MT (latency=0)
+       mov     k2, k0                  !   5 MT (latency=0)
+
+       and     #1, k0                  !  79 EX
+
+       or      k0, k3                  !  82 EX
+
+       ldmmupteh(k0)                   !   9 LS (latency=2)
+       shll2   k4                      ! 101 EX                _PAGE_ACCESSED
+
+       tst     k4, k2                  !  68 MT
+
+       mov.l   k3, @(MMU_PTEA-MMU_PTEH,k0)     ! 27 LS
+
+       mov.l   7f, k3                  !   9 LS (latency=2)    _PAGE_FLAGS_HARDWARE_MASK
+
+       ! k0=MMU_PTEH, k1=pte*, k2=pte, k3=_PAGE_FLAGS_HARDWARE, k4=_PAGE_ACCESSED
+#else
+
+       ! Test the entry for present and _PAGE_ACCESSED
+
+       mov.l   7f, k3                  !   9 LS (latency=2)    _PAGE_FLAGS_HARDWARE_MASK
+       tst     k4, k2                  !  68 MT
+
+       shll2   k4                      ! 101 EX                _PAGE_ACCESSED
+       ldmmupteh(k0)                   !   9 LS (latency=2)
+
+       bt      20f                     ! 110 BR
+       tst     k4, k2                  !  68 MT
+
+       ! k0=MMU_PTEH, k1=pte*, k2=pte, k3=_PAGE_FLAGS_HARDWARE, k4=_PAGE_ACCESSED
+
+#endif
+
+       ! Set up the entry
+
+       and     k2, k3                  !  78 EX
+       bt/s    10f                     ! 108 BR
+
+        mov.l  k3, @(MMU_PTEL-MMU_PTEH,k0)     ! 27 LS
+
+       ldtlb                           ! 128 CO
+
+       ! At least one instruction between ldtlb and rte
+       nop                             ! 119 NOP
+
+       rte                             ! 126 CO
+
+        nop                            ! 119 NOP
+
+
+10:    or      k4, k2                  !  82 EX
+
+       ldtlb                           ! 128 CO
+
+       ! At least one instruction between ldtlb and rte
+       mov.l   k2, @k1                 !  27 LS
+
+       rte                             ! 126 CO
+
+       ! Note we cannot execute the mov in the rte delay slot, because the slot
+       ! is executed after SSR has been restored, so it would run in user space.
+        nop                            ! 119 NOP
+
+
+       .align 5
+       ! One cache line if possible...
+1:     .long   swapper_pg_dir
+4:     .short  (PTRS_PER_PGD-1) << 2
+5:     .short  _PAGE_PRESENT
+7:     .long   _PAGE_FLAGS_HARDWARE_MASK
+8:     .long   MMU_PTEH
+#ifdef COUNT_EXCEPTIONS
+9:     .long   exception_count_miss
+#endif
+
+       ! Either pgd or pte not present
+20:    mov.l   1f, k2
        mov.l   4f, k3
        bra     handle_exception
         mov.l  @k2, k2
@@ -496,6 +650,15 @@ skip_save:
        bf      interrupt_exception
        shlr2   r8
        shlr    r8
+
+#ifdef COUNT_EXCEPTIONS
+       mov.l   5f, r9
+       add     r8, r9
+       mov.l   @r9, r10
+       add     #1, r10
+       mov.l   r10, @r9
+#endif
+
        mov.l   4f, r9
        add     r8, r9
        mov.l   @r9, r9
@@ -509,6 +672,9 @@ skip_save:
 2:     .long   0x000080f0      ! FD=1, IMASK=15
 3:     .long   0xcfffffff      ! RB=0, BL=0
 4:     .long   exception_handling_table
+#ifdef COUNT_EXCEPTIONS
+5:     .long   exception_count_table
+#endif
 
 interrupt_exception:
        mov.l   1f, r9
index c294de1..afe0f1b 100644 (file)
@@ -79,16 +79,16 @@ int __init detect_cpu_and_cache_system(void)
        case 0x205:
                cpu_data->type = CPU_SH7750;
                cpu_data->flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-                                  CPU_HAS_PERF_COUNTER | CPU_HAS_PTEA;
+                                  CPU_HAS_PERF_COUNTER;
                break;
        case 0x206:
                cpu_data->type = CPU_SH7750S;
                cpu_data->flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-                                  CPU_HAS_PERF_COUNTER | CPU_HAS_PTEA;
+                                  CPU_HAS_PERF_COUNTER;
                break;
        case 0x1100:
                cpu_data->type = CPU_SH7751;
-               cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+               cpu_data->flags |= CPU_HAS_FPU;
                break;
        case 0x2000:
                cpu_data->type = CPU_SH73180;
@@ -126,23 +126,22 @@ int __init detect_cpu_and_cache_system(void)
                break;
        case 0x8000:
                cpu_data->type = CPU_ST40RA;
-               cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+               cpu_data->flags |= CPU_HAS_FPU;
                break;
        case 0x8100:
                cpu_data->type = CPU_ST40GX1;
-               cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+               cpu_data->flags |= CPU_HAS_FPU;
                break;
        case 0x700:
                cpu_data->type = CPU_SH4_501;
                cpu_data->icache.ways = 2;
                cpu_data->dcache.ways = 2;
-               cpu_data->flags |= CPU_HAS_PTEA;
                break;
        case 0x600:
                cpu_data->type = CPU_SH4_202;
                cpu_data->icache.ways = 2;
                cpu_data->dcache.ways = 2;
-               cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+               cpu_data->flags |= CPU_HAS_FPU;
                break;
        case 0x500 ... 0x501:
                switch (prr) {
@@ -160,7 +159,7 @@ int __init detect_cpu_and_cache_system(void)
                cpu_data->icache.ways = 2;
                cpu_data->dcache.ways = 2;
 
-               cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+               cpu_data->flags |= CPU_HAS_FPU;
 
                break;
        default:
@@ -173,6 +172,10 @@ int __init detect_cpu_and_cache_system(void)
        cpu_data->dcache.ways = 1;
 #endif
 
+#ifdef CONFIG_CPU_HAS_PTEA
+       cpu_data->flags |= CPU_HAS_PTEA;
+#endif
+
        /*
         * On anything that's not a direct-mapped cache, look to the CVR
         * for I/D-cache specifics.
index 88e9663..6cd6d00 100644 (file)
@@ -20,6 +20,7 @@ config CPU_SH4
        bool
        select CPU_HAS_INTEVT
        select CPU_HAS_SR_RB
+       select CPU_HAS_PTEA if !CPU_SUBTYPE_ST40
 
 config CPU_SH4A
        bool
index 128907e..123fb80 100644 (file)
@@ -223,89 +223,3 @@ do_sigbus:
        if (!user_mode(regs))
                goto no_context;
 }
-
-#ifdef CONFIG_SH_STORE_QUEUES
-/*
- * This is a special case for the SH-4 store queues, as pages for this
- * space still need to be faulted in before it's possible to flush the
- * store queue cache for writeout to the remapped region.
- */
-#define P3_ADDR_MAX            (P4SEG_STORE_QUE + 0x04000000)
-#else
-#define P3_ADDR_MAX            P4SEG
-#endif
-
-/*
- * Called with interrupts disabled.
- */
-asmlinkage int __kprobes __do_page_fault(struct pt_regs *regs,
-                                        unsigned long writeaccess,
-                                        unsigned long address)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       pte_t entry;
-       struct mm_struct *mm = current->mm;
-       spinlock_t *ptl;
-       int ret = 1;
-
-#ifdef CONFIG_SH_KGDB
-       if (kgdb_nofault && kgdb_bus_err_hook)
-               kgdb_bus_err_hook();
-#endif
-
-       /*
-        * We don't take page faults for P1, P2, and parts of P4, these
-        * are always mapped, whether it be due to legacy behaviour in
-        * 29-bit mode, or due to PMB configuration in 32-bit mode.
-        */
-       if (address >= P3SEG && address < P3_ADDR_MAX) {
-               pgd = pgd_offset_k(address);
-               mm = NULL;
-       } else {
-               if (unlikely(address >= TASK_SIZE || !mm))
-                       return 1;
-
-               pgd = pgd_offset(mm, address);
-       }
-
-       pud = pud_offset(pgd, address);
-       if (pud_none_or_clear_bad(pud))
-               return 1;
-       pmd = pmd_offset(pud, address);
-       if (pmd_none_or_clear_bad(pmd))
-               return 1;
-
-       if (mm)
-               pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-       else
-               pte = pte_offset_kernel(pmd, address);
-
-       entry = *pte;
-       if (unlikely(pte_none(entry) || pte_not_present(entry)))
-               goto unlock;
-       if (unlikely(writeaccess && !pte_write(entry)))
-               goto unlock;
-
-       if (writeaccess)
-               entry = pte_mkdirty(entry);
-       entry = pte_mkyoung(entry);
-
-#ifdef CONFIG_CPU_SH4
-       /*
-        * ITLB is not affected by "ldtlb" instruction.
-        * So, we need to flush the entry by ourselves.
-        */
-       __flush_tlb_page(get_asid(), address & PAGE_MASK);
-#endif
-
-       set_pte(pte, entry);
-       update_mmu_cache(NULL, address, entry);
-       ret = 0;
-unlock:
-       if (mm)
-               pte_unmap_unlock(pte, ptl);
-       return ret;
-}
index b1f21e7..fa62524 100644 (file)
@@ -43,12 +43,12 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 /* PGD bits */
 #define PGDIR_SHIFT    (PTE_SHIFT + PTE_BITS)
 #define PGDIR_BITS     (32 - PGDIR_SHIFT)
-#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
+#define PGDIR_SIZE     (1 << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
 /* Entries per level */
-#define PTRS_PER_PTE   (1UL << PTE_BITS)
-#define PTRS_PER_PGD   (1UL << PGDIR_BITS)
+#define PTRS_PER_PTE   (1 << PTE_BITS)
+#define PTRS_PER_PGD   (1 << PGDIR_BITS)
 
 #define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
 #define FIRST_USER_ADDRESS     0