kernel: unroll MIPS r4k cache blast function
author Felix Fietkau <nbd@nbd.name>
Mon, 4 Dec 2017 21:44:33 +0000 (22:44 +0100)
committer Felix Fietkau <nbd@nbd.name>
Sat, 10 Mar 2018 10:58:19 +0000 (11:58 +0100)
Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

On ar71xx, I measured a routing throughput increase of ~8%.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Rosen Penev <rosenp@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch
target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch [new file with mode: 0644]
target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch [new file with mode: 0644]

index 36d39fa..3102923 100644 (file)
  
  #define __BUILD_BLAST_USER_CACHE(pfx, desc, indexop, hitop, lsize) \
  static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \
-@@ -660,17 +744,19 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
+@@ -660,53 +744,23 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
  __BUILD_BLAST_USER_CACHE(i, icache, Index_Invalidate_I, Hit_Invalidate_I, 64)
  
  /* build blast_xxx_range, protected_blast_xxx_range */
                                                    unsigned long end)  \
  {                                                                     \
        unsigned long lsize = cpu_##desc##_line_size();                 \
+-      unsigned long lsize_2 = lsize * 2;                              \
+-      unsigned long lsize_3 = lsize * 3;                              \
+-      unsigned long lsize_4 = lsize * 4;                              \
+-      unsigned long lsize_5 = lsize * 5;                              \
+-      unsigned long lsize_6 = lsize * 6;                              \
+-      unsigned long lsize_7 = lsize * 7;                              \
+-      unsigned long lsize_8 = lsize * 8;                              \
        unsigned long addr = start & ~(lsize - 1);                      \
-       unsigned long aend = (end - 1) & ~(lsize - 1);                  \
+-      unsigned long aend = (end + lsize - 1) & ~(lsize - 1);          \
+-      int lines = (aend - addr) / lsize;                              \
++      unsigned long aend = (end - 1) & ~(lsize - 1);                  \
 +      war                                                             \
                                                                        \
        __##pfx##flush_prologue                                         \
                                                                        \
-       while (1) {                                                     \
+-      while (lines >= 8) {                                            \
+-              prot##cache_op(hitop, addr);                            \
+-              prot##cache_op(hitop, addr + lsize);                    \
+-              prot##cache_op(hitop, addr + lsize_2);                  \
+-              prot##cache_op(hitop, addr + lsize_3);                  \
+-              prot##cache_op(hitop, addr + lsize_4);                  \
+-              prot##cache_op(hitop, addr + lsize_5);                  \
+-              prot##cache_op(hitop, addr + lsize_6);                  \
+-              prot##cache_op(hitop, addr + lsize_7);                  \
+-              addr += lsize_8;                                        \
+-              lines -= 8;                                             \
+-      }                                                               \
+-                                                                      \
+-      if (lines & 0x4) {                                              \
+-              prot##cache_op(hitop, addr);                            \
+-              prot##cache_op(hitop, addr + lsize);                    \
+-              prot##cache_op(hitop, addr + lsize_2);                  \
+-              prot##cache_op(hitop, addr + lsize_3);                  \
+-              addr += lsize_4;                                        \
+-      }                                                               \
+-                                                                      \
+-      if (lines & 0x2) {                                              \
+-              prot##cache_op(hitop, addr);                            \
+-              prot##cache_op(hitop, addr + lsize);                    \
+-              addr += lsize_2;                                        \
+-      }                                                               \
+-                                                                      \
+-      if (lines & 0x1) {                                              \
++      while (1) {                                                     \
 +              war2                                                    \
                prot##cache_op(hitop, addr);                            \
-               if (addr == aend)                                       \
-                       break;                                          \
-@@ -682,8 +768,8 @@ static inline void prot##extra##blast_##
++              if (addr == aend)                                       \
++                      break;                                          \
++              addr += lsize;                                          \
+       }                                                               \
+                                                                       \
+       __##pfx##flush_epilogue                                         \
+@@ -714,8 +768,8 @@ static inline void prot##extra##blast_##
  
  #ifndef CONFIG_EVA
  
  
  #else
  
-@@ -720,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
+@@ -752,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
  __BUILD_PROT_BLAST_CACHE_RANGE(i, icache, Hit_Invalidate_I)
  
  #endif
diff --git a/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch
new file mode 100644 (file)
index 0000000..860a7e0
--- /dev/null
@@ -0,0 +1,66 @@
+From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
+Date: Fri, 7 Jun 2013 18:35:22 -0500
+Subject: MIPS: r4k_cache: use more efficient cache blast
+
+Optimize the compiler output for larger cache blast cases that are
+common for DMA-based networking.
+
+Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
+Signed-off-by: Felix Fietkau <nbd@nbd.name>
+---
+--- a/arch/mips/include/asm/r4kcache.h
++++ b/arch/mips/include/asm/r4kcache.h
+@@ -682,16 +682,48 @@ static inline void prot##extra##blast_##
+                                                   unsigned long end)  \
+ {                                                                     \
+       unsigned long lsize = cpu_##desc##_line_size();                 \
++      unsigned long lsize_2 = lsize * 2;                              \
++      unsigned long lsize_3 = lsize * 3;                              \
++      unsigned long lsize_4 = lsize * 4;                              \
++      unsigned long lsize_5 = lsize * 5;                              \
++      unsigned long lsize_6 = lsize * 6;                              \
++      unsigned long lsize_7 = lsize * 7;                              \
++      unsigned long lsize_8 = lsize * 8;                              \
+       unsigned long addr = start & ~(lsize - 1);                      \
+-      unsigned long aend = (end - 1) & ~(lsize - 1);                  \
++      unsigned long aend = (end + lsize - 1) & ~(lsize - 1);          \
++      int lines = (aend - addr) / lsize;                              \
+                                                                       \
+       __##pfx##flush_prologue                                         \
+                                                                       \
+-      while (1) {                                                     \
++      while (lines >= 8) {                                            \
++              prot##cache_op(hitop, addr);                            \
++              prot##cache_op(hitop, addr + lsize);                    \
++              prot##cache_op(hitop, addr + lsize_2);                  \
++              prot##cache_op(hitop, addr + lsize_3);                  \
++              prot##cache_op(hitop, addr + lsize_4);                  \
++              prot##cache_op(hitop, addr + lsize_5);                  \
++              prot##cache_op(hitop, addr + lsize_6);                  \
++              prot##cache_op(hitop, addr + lsize_7);                  \
++              addr += lsize_8;                                        \
++              lines -= 8;                                             \
++      }                                                               \
++                                                                      \
++      if (lines & 0x4) {                                              \
++              prot##cache_op(hitop, addr);                            \
++              prot##cache_op(hitop, addr + lsize);                    \
++              prot##cache_op(hitop, addr + lsize_2);                  \
++              prot##cache_op(hitop, addr + lsize_3);                  \
++              addr += lsize_4;                                        \
++      }                                                               \
++                                                                      \
++      if (lines & 0x2) {                                              \
++              prot##cache_op(hitop, addr);                            \
++              prot##cache_op(hitop, addr + lsize);                    \
++              addr += lsize_2;                                        \
++      }                                                               \
++                                                                      \
++      if (lines & 0x1) {                                              \
+               prot##cache_op(hitop, addr);                            \
+-              if (addr == aend)                                       \
+-                      break;                                          \
+-              addr += lsize;                                          \
+       }                                                               \
+                                                                       \
+       __##pfx##flush_epilogue                                         \
diff --git a/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch
new file mode 100644 (file)
index 0000000..ce7901a
--- /dev/null
@@ -0,0 +1,66 @@
+From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
+Date: Fri, 7 Jun 2013 18:35:22 -0500
+Subject: MIPS: r4k_cache: use more efficient cache blast
+
+Optimize the compiler output for larger cache blast cases that are
+common for DMA-based networking.
+
+Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
+Signed-off-by: Felix Fietkau <nbd@nbd.name>
+---
+--- a/arch/mips/include/asm/r4kcache.h
++++ b/arch/mips/include/asm/r4kcache.h
+@@ -665,16 +665,48 @@ static inline void prot##extra##blast_##pfx##cache##_range(unsigned long start,
+                                                   unsigned long end)  \
+ {                                                                     \
+       unsigned long lsize = cpu_##desc##_line_size();                 \
++      unsigned long lsize_2 = lsize * 2;                              \
++      unsigned long lsize_3 = lsize * 3;                              \
++      unsigned long lsize_4 = lsize * 4;                              \
++      unsigned long lsize_5 = lsize * 5;                              \
++      unsigned long lsize_6 = lsize * 6;                              \
++      unsigned long lsize_7 = lsize * 7;                              \
++      unsigned long lsize_8 = lsize * 8;                              \
+       unsigned long addr = start & ~(lsize - 1);                      \
+-      unsigned long aend = (end - 1) & ~(lsize - 1);                  \
++      unsigned long aend = (end + lsize - 1) & ~(lsize - 1);          \
++      int lines = (aend - addr) / lsize;                              \
+                                                                       \
+       __##pfx##flush_prologue                                         \
+                                                                       \
+-      while (1) {                                                     \
++      while (lines >= 8) {                                            \
++              prot##cache_op(hitop, addr);                            \
++              prot##cache_op(hitop, addr + lsize);                    \
++              prot##cache_op(hitop, addr + lsize_2);                  \
++              prot##cache_op(hitop, addr + lsize_3);                  \
++              prot##cache_op(hitop, addr + lsize_4);                  \
++              prot##cache_op(hitop, addr + lsize_5);                  \
++              prot##cache_op(hitop, addr + lsize_6);                  \
++              prot##cache_op(hitop, addr + lsize_7);                  \
++              addr += lsize_8;                                        \
++              lines -= 8;                                             \
++      }                                                               \
++                                                                      \
++      if (lines & 0x4) {                                              \
++              prot##cache_op(hitop, addr);                            \
++              prot##cache_op(hitop, addr + lsize);                    \
++              prot##cache_op(hitop, addr + lsize_2);                  \
++              prot##cache_op(hitop, addr + lsize_3);                  \
++              addr += lsize_4;                                        \
++      }                                                               \
++                                                                      \
++      if (lines & 0x2) {                                              \
++              prot##cache_op(hitop, addr);                            \
++              prot##cache_op(hitop, addr + lsize);                    \
++              addr += lsize_2;                                        \
++      }                                                               \
++                                                                      \
++      if (lines & 0x1) {                                              \
+               prot##cache_op(hitop, addr);                            \
+-              if (addr == aend)                                       \
+-                      break;                                          \
+-              addr += lsize;                                          \
+       }                                                               \
+                                                                       \
+       __##pfx##flush_epilogue                                         \