target/linux/generic/backport-5.4/080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch

   1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
   2 From: Ard Biesheuvel <ardb@kernel.org>
   3 Date: Tue, 3 Nov 2020 17:28:09 +0100
   4 Subject: [PATCH] crypto: arm/chacha-neon - optimize for non-block size
   5  multiples
   6
   7 commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
   8
   9 The current NEON based ChaCha implementation for ARM is optimized for
  10 multiples of 4x the ChaCha block size (64 bytes). This makes sense for
  11 block encryption, but given that ChaCha is also often used in the
  12 context of networking, it makes sense to consider arbitrary length
  13 inputs as well.
  14
  15 For example, WireGuard typically uses 1420 byte packets, and performing
  16 ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
  17 and 3 invocations of chacha_block_xor_neon(), where the last one also
  18 involves a memcpy() using a buffer on the stack to process the final
  19 chunk of 1420 % 64 == 12 bytes.
  20
  21 Let's optimize for this case as well, by letting chacha_4block_xor_neon()
  22 deal with any input size between 64 and 256 bytes, using NEON permutation
  23 instructions and overlapping loads and stores. This way, the 140 byte
  24 tail of a 1420 byte input buffer can simply be processed in one go.
  25
  26 This results in the following performance improvements for 1420 byte
  27 blocks, without significant impact on power-of-2 input sizes. (Note
  28 that Raspberry Pi is widely used in combination with a 32-bit kernel,
  29 even though the core is 64-bit capable)
  30
  31    Cortex-A8  (BeagleBone)       :   7%
  32    Cortex-A15 (Calxeda Midway)   :  21%
  33    Cortex-A53 (Raspberry Pi 3)   :   3%
  34    Cortex-A72 (Raspberry Pi 4)   :  19%
  35
  36 Cc: Eric Biggers <ebiggers@google.com>
  37 Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
  38 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
  39 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
  40 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
  41 ---
  42  arch/arm/crypto/chacha-glue.c      | 34 +++++------
  43  arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
  44  2 files changed, 107 insertions(+), 24 deletions(-)
  45
  46 --- a/arch/arm/crypto/chacha-glue.c
  47 +++ b/arch/arm/crypto/chacha-glue.c
  48 @@ -23,7 +23,7 @@
  49  asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
  50                                       int nrounds);
  51  asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
  52 -                                      int nrounds);
  53 +                                      int nrounds, unsigned int nbytes);
  54  asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
  55  asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
  56
  57 @@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
  58  {
  59         u8 buf[CHACHA_BLOCK_SIZE];
  60
  61 -       while (bytes >= CHACHA_BLOCK_SIZE * 4) {
  62 -               chacha_4block_xor_neon(state, dst, src, nrounds);
  63 -               bytes -= CHACHA_BLOCK_SIZE * 4;
  64 -               src += CHACHA_BLOCK_SIZE * 4;
  65 -               dst += CHACHA_BLOCK_SIZE * 4;
  66 -               state[12] += 4;
  67 -       }
  68 -       while (bytes >= CHACHA_BLOCK_SIZE) {
  69 -               chacha_block_xor_neon(state, dst, src, nrounds);
  70 -               bytes -= CHACHA_BLOCK_SIZE;
  71 -               src += CHACHA_BLOCK_SIZE;
  72 -               dst += CHACHA_BLOCK_SIZE;
  73 -               state[12]++;
  74 +       while (bytes > CHACHA_BLOCK_SIZE) {
  75 +               unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
  76 +
  77 +               chacha_4block_xor_neon(state, dst, src, nrounds, l);
  78 +               bytes -= l;
  79 +               src += l;
  80 +               dst += l;
  81 +               state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
  82         }
  83         if (bytes) {
  84 -               memcpy(buf, src, bytes);
  85 -               chacha_block_xor_neon(state, buf, buf, nrounds);
  86 -               memcpy(dst, buf, bytes);
  87 +               const u8 *s = src;
  88 +               u8 *d = dst;
  89 +
  90 +               if (bytes != CHACHA_BLOCK_SIZE)
  91 +                       s = d = memcpy(buf, src, bytes);
  92 +               chacha_block_xor_neon(state, d, s, nrounds);
  93 +               if (d != dst)
  94 +                       memcpy(dst, buf, bytes);
  95         }
  96  }
  97
  98 --- a/arch/arm/crypto/chacha-neon-core.S
  99 +++ b/arch/arm/crypto/chacha-neon-core.S
 100 @@ -47,6 +47,7 @@
 101    */
 102
 103  #include <linux/linkage.h>
 104 +#include <asm/cache.h>
 105
 106         .text
 107         .fpu            neon
 108 @@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
 109
 110         .align          5
 111  ENTRY(chacha_4block_xor_neon)
 112 -       push            {r4-r5}
 113 +       push            {r4, lr}
 114         mov             r4, sp                  // preserve the stack pointer
 115         sub             ip, sp, #0x20           // allocate a 32 byte buffer
 116         bic             ip, ip, #0x1f           // aligned to 32 bytes
 117 @@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
 118         vld1.32         {q0-q1}, [r0]
 119         vld1.32         {q2-q3}, [ip]
 120
 121 -       adr             r5, .Lctrinc
 122 +       adr             lr, .Lctrinc
 123         vdup.32         q15, d7[1]
 124         vdup.32         q14, d7[0]
 125 -       vld1.32         {q4}, [r5, :128]
 126 +       vld1.32         {q4}, [lr, :128]
 127         vdup.32         q13, d6[1]
 128         vdup.32         q12, d6[0]
 129         vdup.32         q11, d5[1]
 130 @@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
 131
 132         // Re-interleave the words in the first two rows of each block (x0..7).
 133         // Also add the counter values 0-3 to x12[0-3].
 134 -         vld1.32       {q8}, [r5, :128]        // load counter values 0-3
 135 +         vld1.32       {q8}, [lr, :128]        // load counter values 0-3
 136         vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
 137         vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
 138         vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
 139 @@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
 140
 141         // Re-interleave the words in the last two rows of each block (x8..15).
 142         vld1.32         {q8-q9}, [sp, :256]
 143 +         mov           sp, r4          // restore original stack pointer
 144 +         ldr           r4, [r4, #8]    // load number of bytes
 145         vzip.32         q12, q13        // => (12 13 12 13) (12 13 12 13)
 146         vzip.32         q14, q15        // => (14 15 14 15) (14 15 14 15)
 147         vzip.32         q8, q9          // => (8 9 8 9) (8 9 8 9)
 148 @@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
 149         // XOR the rest of the data with the keystream
 150
 151         vld1.8          {q0-q1}, [r2]!
 152 +       subs            r4, r4, #96
 153         veor            q0, q0, q8
 154         veor            q1, q1, q12
 155 +       ble             .Lle96
 156         vst1.8          {q0-q1}, [r1]!
 157
 158         vld1.8          {q0-q1}, [r2]!
 159 +       subs            r4, r4, #32
 160         veor            q0, q0, q2
 161         veor            q1, q1, q6
 162 +       ble             .Lle128
 163         vst1.8          {q0-q1}, [r1]!
 164
 165         vld1.8          {q0-q1}, [r2]!
 166 +       subs            r4, r4, #32
 167         veor            q0, q0, q10
 168         veor            q1, q1, q14
 169 +       ble             .Lle160
 170         vst1.8          {q0-q1}, [r1]!
 171
 172         vld1.8          {q0-q1}, [r2]!
 173 +       subs            r4, r4, #32
 174         veor            q0, q0, q4
 175         veor            q1, q1, q5
 176 +       ble             .Lle192
 177         vst1.8          {q0-q1}, [r1]!
 178
 179         vld1.8          {q0-q1}, [r2]!
 180 +       subs            r4, r4, #32
 181         veor            q0, q0, q9
 182         veor            q1, q1, q13
 183 +       ble             .Lle224
 184         vst1.8          {q0-q1}, [r1]!
 185
 186         vld1.8          {q0-q1}, [r2]!
 187 +       subs            r4, r4, #32
 188         veor            q0, q0, q3
 189         veor            q1, q1, q7
 190 +       blt             .Llt256
 191 +.Lout:
 192         vst1.8          {q0-q1}, [r1]!
 193
 194         vld1.8          {q0-q1}, [r2]
 195 -         mov           sp, r4          // restore original stack pointer
 196         veor            q0, q0, q11
 197         veor            q1, q1, q15
 198         vst1.8          {q0-q1}, [r1]
 199
 200 -       pop             {r4-r5}
 201 -       bx              lr
 202 +       pop             {r4, pc}
 203 +
 204 +.Lle192:
 205 +       vmov            q4, q9
 206 +       vmov            q5, q13
 207 +
 208 +.Lle160:
 209 +       // nothing to do
 210 +
 211 +.Lfinalblock:
 212 +       // Process the final block if processing less than 4 full blocks.
 213 +       // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
 214 +       // previous 32 byte output block that still needs to be written at
 215 +       // [r1] in q0-q1.
 216 +       beq             .Lfullblock
 217 +
 218 +.Lpartialblock:
 219 +       adr             lr, .Lpermute + 32
 220 +       add             r2, r2, r4
 221 +       add             lr, lr, r4
 222 +       add             r4, r4, r1
 223 +
 224 +       vld1.8          {q2-q3}, [lr]
 225 +       vld1.8          {q6-q7}, [r2]
 226 +
 227 +       add             r4, r4, #32
 228 +
 229 +       vtbl.8          d4, {q4-q5}, d4
 230 +       vtbl.8          d5, {q4-q5}, d5
 231 +       vtbl.8          d6, {q4-q5}, d6
 232 +       vtbl.8          d7, {q4-q5}, d7
 233 +
 234 +       veor            q6, q6, q2
 235 +       veor            q7, q7, q3
 236 +
 237 +       vst1.8          {q6-q7}, [r4]   // overlapping stores
 238 +       vst1.8          {q0-q1}, [r1]
 239 +       pop             {r4, pc}
 240 +
 241 +.Lfullblock:
 242 +       vmov            q11, q4
 243 +       vmov            q15, q5
 244 +       b               .Lout
 245 +.Lle96:
 246 +       vmov            q4, q2
 247 +       vmov            q5, q6
 248 +       b               .Lfinalblock
 249 +.Lle128:
 250 +       vmov            q4, q10
 251 +       vmov            q5, q14
 252 +       b               .Lfinalblock
 253 +.Lle224:
 254 +       vmov            q4, q3
 255 +       vmov            q5, q7
 256 +       b               .Lfinalblock
 257 +.Llt256:
 258 +       vmov            q4, q11
 259 +       vmov            q5, q15
 260 +       b               .Lpartialblock
 261  ENDPROC(chacha_4block_xor_neon)
 262 +
 263 +       .align          L1_CACHE_SHIFT
 264 +.Lpermute:
 265 +       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
 266 +       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
 267 +       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
 268 +       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
 269 +       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
 270 +       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
 271 +       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
 272 +       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f