kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0068-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Tue, 3 Nov 2020 17:28:09 +0100
4 Subject: [PATCH] crypto: arm/chacha-neon - optimize for non-block size
5 multiples
6
7 commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
8
9 The current NEON based ChaCha implementation for ARM is optimized for
10 multiples of 4x the ChaCha block size (64 bytes). This makes sense for
11 block encryption, but given that ChaCha is also often used in the
12 context of networking, it makes sense to consider arbitrary length
13 inputs as well.
14
15 For example, WireGuard typically uses 1420 byte packets, and performing
16 ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
17 and 3 invocations of chacha_block_xor_neon(), where the last one also
18 involves a memcpy() using a buffer on the stack to process the final
19 chunk of 1420 % 64 == 12 bytes.
20
21 Let's optimize for this case as well, by letting chacha_4block_xor_neon()
22 deal with any input size between 64 and 256 bytes, using NEON permutation
23 instructions and overlapping loads and stores. This way, the 140 byte
24 tail of a 1420 byte input buffer can simply be processed in one go.
25
26 This results in the following performance improvements for 1420 byte
27 blocks, without significant impact on power-of-2 input sizes. (Note
28 that Raspberry Pi is widely used in combination with a 32-bit kernel,
29 even though the core is 64-bit capable.)
30
31 Cortex-A8 (BeagleBone) : 7%
32 Cortex-A15 (Calxeda Midway) : 21%
33 Cortex-A53 (Raspberry Pi 3) : 3%
34 Cortex-A72 (Raspberry Pi 4) : 19%
35
36 Cc: Eric Biggers <ebiggers@google.com>
37 Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
38 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
39 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
40 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
41 ---
42 arch/arm/crypto/chacha-glue.c | 34 +++++------
43 arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
44 2 files changed, 107 insertions(+), 24 deletions(-)
45
46 --- a/arch/arm/crypto/chacha-glue.c
47 +++ b/arch/arm/crypto/chacha-glue.c
48 @@ -23,7 +23,7 @@
49 asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
50 int nrounds);
51 asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
52 - int nrounds);
53 + int nrounds, unsigned int nbytes);
54 asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
55 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
56
57 @@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
58 {
59 u8 buf[CHACHA_BLOCK_SIZE];
60
61 - while (bytes >= CHACHA_BLOCK_SIZE * 4) {
62 - chacha_4block_xor_neon(state, dst, src, nrounds);
63 - bytes -= CHACHA_BLOCK_SIZE * 4;
64 - src += CHACHA_BLOCK_SIZE * 4;
65 - dst += CHACHA_BLOCK_SIZE * 4;
66 - state[12] += 4;
67 - }
68 - while (bytes >= CHACHA_BLOCK_SIZE) {
69 - chacha_block_xor_neon(state, dst, src, nrounds);
70 - bytes -= CHACHA_BLOCK_SIZE;
71 - src += CHACHA_BLOCK_SIZE;
72 - dst += CHACHA_BLOCK_SIZE;
73 - state[12]++;
74 + while (bytes > CHACHA_BLOCK_SIZE) {
75 + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
76 +
77 + chacha_4block_xor_neon(state, dst, src, nrounds, l);
78 + bytes -= l;
79 + src += l;
80 + dst += l;
81 + state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
82 }
83 if (bytes) {
84 - memcpy(buf, src, bytes);
85 - chacha_block_xor_neon(state, buf, buf, nrounds);
86 - memcpy(dst, buf, bytes);
87 + const u8 *s = src;
88 + u8 *d = dst;
89 +
90 + if (bytes != CHACHA_BLOCK_SIZE)
91 + s = d = memcpy(buf, src, bytes);
92 + chacha_block_xor_neon(state, d, s, nrounds);
93 + if (d != dst)
94 + memcpy(dst, buf, bytes);
95 }
96 }
97
98 --- a/arch/arm/crypto/chacha-neon-core.S
99 +++ b/arch/arm/crypto/chacha-neon-core.S
100 @@ -47,6 +47,7 @@
101 */
102
103 #include <linux/linkage.h>
104 +#include <asm/cache.h>
105
106 .text
107 .fpu neon
108 @@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
109
110 .align 5
111 ENTRY(chacha_4block_xor_neon)
112 - push {r4-r5}
113 + push {r4, lr}
114 mov r4, sp // preserve the stack pointer
115 sub ip, sp, #0x20 // allocate a 32 byte buffer
116 bic ip, ip, #0x1f // aligned to 32 bytes
117 @@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
118 vld1.32 {q0-q1}, [r0]
119 vld1.32 {q2-q3}, [ip]
120
121 - adr r5, .Lctrinc
122 + adr lr, .Lctrinc
123 vdup.32 q15, d7[1]
124 vdup.32 q14, d7[0]
125 - vld1.32 {q4}, [r5, :128]
126 + vld1.32 {q4}, [lr, :128]
127 vdup.32 q13, d6[1]
128 vdup.32 q12, d6[0]
129 vdup.32 q11, d5[1]
130 @@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
131
132 // Re-interleave the words in the first two rows of each block (x0..7).
133 // Also add the counter values 0-3 to x12[0-3].
134 - vld1.32 {q8}, [r5, :128] // load counter values 0-3
135 + vld1.32 {q8}, [lr, :128] // load counter values 0-3
136 vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
137 vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
138 vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
139 @@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
140
141 // Re-interleave the words in the last two rows of each block (x8..15).
142 vld1.32 {q8-q9}, [sp, :256]
143 + mov sp, r4 // restore original stack pointer
144 + ldr r4, [r4, #8] // load number of bytes
145 vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
146 vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
147 vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
148 @@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
149 // XOR the rest of the data with the keystream
150
151 vld1.8 {q0-q1}, [r2]!
152 + subs r4, r4, #96
153 veor q0, q0, q8
154 veor q1, q1, q12
155 + ble .Lle96
156 vst1.8 {q0-q1}, [r1]!
157
158 vld1.8 {q0-q1}, [r2]!
159 + subs r4, r4, #32
160 veor q0, q0, q2
161 veor q1, q1, q6
162 + ble .Lle128
163 vst1.8 {q0-q1}, [r1]!
164
165 vld1.8 {q0-q1}, [r2]!
166 + subs r4, r4, #32
167 veor q0, q0, q10
168 veor q1, q1, q14
169 + ble .Lle160
170 vst1.8 {q0-q1}, [r1]!
171
172 vld1.8 {q0-q1}, [r2]!
173 + subs r4, r4, #32
174 veor q0, q0, q4
175 veor q1, q1, q5
176 + ble .Lle192
177 vst1.8 {q0-q1}, [r1]!
178
179 vld1.8 {q0-q1}, [r2]!
180 + subs r4, r4, #32
181 veor q0, q0, q9
182 veor q1, q1, q13
183 + ble .Lle224
184 vst1.8 {q0-q1}, [r1]!
185
186 vld1.8 {q0-q1}, [r2]!
187 + subs r4, r4, #32
188 veor q0, q0, q3
189 veor q1, q1, q7
190 + blt .Llt256
191 +.Lout:
192 vst1.8 {q0-q1}, [r1]!
193
194 vld1.8 {q0-q1}, [r2]
195 - mov sp, r4 // restore original stack pointer
196 veor q0, q0, q11
197 veor q1, q1, q15
198 vst1.8 {q0-q1}, [r1]
199
200 - pop {r4-r5}
201 - bx lr
202 + pop {r4, pc}
203 +
204 +.Lle192:
205 + vmov q4, q9
206 + vmov q5, q13
207 +
208 +.Lle160:
209 + // nothing to do: q4-q5 already hold the next 32 bytes of keystream
210 +
211 +.Lfinalblock:
212 + // Process the final block if processing less than 4 full blocks.
213 + // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
214 + // previous 32 byte output block that still needs to be written at
215 + // [r1] in q0-q1.
216 + beq .Lfullblock
217 +
218 +.Lpartialblock:
219 + adr lr, .Lpermute + 32
220 + add r2, r2, r4
221 + add lr, lr, r4
222 + add r4, r4, r1
223 +
224 + vld1.8 {q2-q3}, [lr]
225 + vld1.8 {q6-q7}, [r2]
226 +
227 + add r4, r4, #32
228 +
229 + vtbl.8 d4, {q4-q5}, d4
230 + vtbl.8 d5, {q4-q5}, d5
231 + vtbl.8 d6, {q4-q5}, d6
232 + vtbl.8 d7, {q4-q5}, d7
233 +
234 + veor q6, q6, q2
235 + veor q7, q7, q3
236 +
237 + vst1.8 {q6-q7}, [r4] // overlapping stores
238 + vst1.8 {q0-q1}, [r1]
239 + pop {r4, pc}
240 +
241 +.Lfullblock:
242 + vmov q11, q4
243 + vmov q15, q5
244 + b .Lout
245 +.Lle96:
246 + vmov q4, q2
247 + vmov q5, q6
248 + b .Lfinalblock
249 +.Lle128:
250 + vmov q4, q10
251 + vmov q5, q14
252 + b .Lfinalblock
253 +.Lle224:
254 + vmov q4, q3
255 + vmov q5, q7
256 + b .Lfinalblock
257 +.Llt256:
258 + vmov q4, q11
259 + vmov q5, q15
260 + b .Lpartialblock
261 ENDPROC(chacha_4block_xor_neon)
262 +
263 + .align L1_CACHE_SHIFT
264 +.Lpermute:
265 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
266 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
267 + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
268 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
269 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
270 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
271 + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
272 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f