target/linux/orion/patches/002-feroceon__speed_up_flushing_of_the_entire_cache.patch

   1 Flushing the L1 D cache with a test/clean/invalidate loop is very
   2 easy in software, but it is not the quickest way of doing it, as
   3 there is a lot of overhead involved in re-scanning the cache from
   4 the beginning every time we hit a dirty line.
   5
   6 This patch makes proc-feroceon.S use "clean+invalidate by set/way"
   7 loops that match the possible cache configurations of Feroceon CPUs
   8 (either direct-mapped or 4-way set associative).
   9
  10 [nico: optimized the assembly a bit]
  11
  12 Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
  13 Signed-off-by: Nicolas Pitre <nico@marvell.com>
  14 ---
  15  arch/arm/mm/proc-feroceon.S |   53 ++++++++++++++++++++++++++++++++++---------
  16  1 files changed, 42 insertions(+), 11 deletions(-)
  17
  18 --- a/arch/arm/mm/proc-feroceon.S
  19 +++ b/arch/arm/mm/proc-feroceon.S
  20 @@ -44,11 +44,31 @@
  21   */
  22  #define CACHE_DLINESIZE        32
  23
  24 +       .bss
  25 +       .align 3
  26 +__cache_params_loc:
  27 +       .space  8                               @ 2 words: set start value, way bits
  28 +
  29         .text
  30 +__cache_params:
  31 +       .word   __cache_params_loc              @ address of the .bss slot above
  32 +
  33  /*
  34   * cpu_feroceon_proc_init()
  35   */
  36  ENTRY(cpu_feroceon_proc_init)
  37 +       mrc     p15, 0, r0, c0, c0, 1           @ read cache type register
  38 +       ldr     r1, __cache_params              @ r1 = &__cache_params_loc
  39 +       mov     r2, #(16 << 5)                  @ 512, scaled by the size order below
  40 +       tst     r0, #(1 << 16)                  @ get way
  41 +       mov     r0, r0, lsr #18                 @ get cache size order
  42 +       movne   r3, #((4 - 1) << 30)            @ 4-way
  43 +       and     r0, r0, #0xf                    @ keep the 4-bit order; tst flags untouched
  44 +       moveq   r3, #0                          @ 1-way
  45 +       mov     r2, r2, lsl r0                  @ actual cache size
  46 +       movne   r2, r2, lsr #2                  @ turned into # of sets
  47 +       sub     r2, r2, #(1 << 5)               @ start value for the set countdown
  48 +       stmia   r1, {r2, r3}                    @ save set start value and way bits
  49         mov     pc, lr
  50
  51  /*
  52 @@ -117,11 +137,19 @@
  53   */
  54  ENTRY(feroceon_flush_kern_cache_all)
  55         mov     r2, #VM_EXEC
  56 -       mov     ip, #0
  57 +
  58  __flush_whole_cache:
  59 -1:     mrc     p15, 0, r15, c7, c14, 3         @ test,clean,invalidate
  60 -       bne     1b
  61 +       ldr     r1, __cache_params              @ r1 = &__cache_params_loc
  62 +       ldmia   r1, {r1, r3}                    @ r1 = set start value, r3 = way bits
  63 +1:     orr     ip, r1, r3                      @ current set, starting at the top way
  64 +2:     mcr     p15, 0, ip, c7, c14, 2          @ clean + invalidate D set/way
  65 +       subs    ip, ip, #(1 << 30)              @ next way
  66 +       bcs     2b                              @ carry set = no borrow, ways remain
  67 +       subs    r1, r1, #(1 << 5)               @ next set
  68 +       bcs     1b                              @ carry set = no borrow, sets remain
  69 +
  70         tst     r2, #VM_EXEC
  71 +       mov     ip, #0                          @ zero for the MCRs; no S, flags kept
  72         mcrne   p15, 0, ip, c7, c5, 0           @ invalidate I cache
  73         mcrne   p15, 0, ip, c7, c10, 4          @ drain WB
  74         mov     pc, lr
  75 @@ -138,7 +166,6 @@
  76   */
  77         .align  5
  78  ENTRY(feroceon_flush_user_cache_range)
  79 -       mov     ip, #0
  80         sub     r3, r1, r0                      @ calculate total size
  81         cmp     r3, #CACHE_DLIMIT
  82         bgt     __flush_whole_cache
  83 @@ -152,6 +179,7 @@
  84         cmp     r0, r1
  85         blo     1b
  86         tst     r2, #VM_EXEC
  87 +       mov     ip, #0
  88         mcrne   p15, 0, ip, c7, c10, 4          @ drain WB
  89         mov     pc, lr
  90
  91 @@ -306,16 +334,19 @@
  92         .align  5
  93  ENTRY(cpu_feroceon_switch_mm)
  94  #ifdef CONFIG_MMU
  95 -       mov     ip, #0
  96 -@ && 'Clean & Invalidate whole DCache'
  97 -1:     mrc     p15, 0, r15, c7, c14, 3         @ test,clean,invalidate
  98 -       bne     1b
  99 -       mcr     p15, 0, ip, c7, c5, 0           @ invalidate I cache
 100 -       mcr     p15, 0, ip, c7, c10, 4          @ drain WB
 101 +       mov     r2, lr                          @ abuse r2 to preserve lr
 102 +       bl      __flush_whole_cache             @ overwrites lr; comes back with ip = 0
 103 +       @ if r2 contains the VM_EXEC bit then the next 2 ops are done already
 104 +       tst     r2, #VM_EXEC                    @ r2 is the saved lr at this point
 105 +       mcreq   p15, 0, ip, c7, c5, 0           @ invalidate I cache
 106 +       mcreq   p15, 0, ip, c7, c10, 4          @ drain WB
 107 +
 108         mcr     p15, 0, r0, c2, c0, 0           @ load page table pointer
 109         mcr     p15, 0, ip, c8, c7, 0           @ invalidate I & D TLBs
 110 -#endif
 111 +       mov     pc, r2                          @ return through the saved lr
 112 +#else
 113         mov     pc, lr
 114 +#endif
 115
 116  /*
 117   * cpu_feroceon_set_pte_ext(ptep, pte, ext)