kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Fri, 8 Nov 2019 13:22:16 +0100
4 Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
10
11 This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
12 Zinc patch set.
13
14 Co-developed-by: René van Dorst <opensource@vdorst.com>
15 Signed-off-by: René van Dorst <opensource@vdorst.com>
16 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
17 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
18 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
19 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
20 ---
21 arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
22 1 file changed, 424 insertions(+)
23 create mode 100644 arch/mips/crypto/chacha-core.S
24
25 --- /dev/null
26 +++ b/arch/mips/crypto/chacha-core.S
27 @@ -0,0 +1,424 @@
28 +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
29 +/*
30 + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
31 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
32 + */
33 +
34 +#define MASK_U32 0x3c /* keeps the whole-word part of a byte count below 64 */
35 +#define CHACHA20_BLOCK_SIZE 64
36 +#define STACK_SIZE 32 /* eight saved registers (s0-s7), 4 bytes each */
37 +
38 +#define X0 $t0
39 +#define X1 $t1
40 +#define X2 $t2
41 +#define X3 $t3
42 +#define X4 $t4
43 +#define X5 $t5
44 +#define X6 $t6
45 +#define X7 $t7
46 +#define X8 $t8
47 +#define X9 $t9
48 +#define X10 $v1
49 +#define X11 $s6
50 +#define X12 $s5
51 +#define X13 $s4
52 +#define X14 $s3
53 +#define X15 $s2
54 +/* Tx use regs which are restored from the stack on exit so we don't leak clear data. */
55 +#define T0 $s1
56 +#define T1 $s0
57 +#define T(n) T ## n
58 +#define X(n) X ## n
59 +
60 +/* Input arguments */
61 +#define STATE $a0
62 +#define OUT $a1
63 +#define IN $a2
64 +#define BYTES $a3
65 +
66 +/* Output argument */
67 +/* NONCE[0] (the state word at offset 48) is kept in a register while looping.
68 + * The value in memory is left untouched until it is written back on exit.
69 + * Must be incremented every loop iteration.
70 + */
71 +#define NONCE_0 $v0
72 +
73 +/* SAVED_CA is loaded from STATE before the jump table; SAVED_X is set in it.
74 + * Use regs which are overwritten on exit so we don't leak clear data.
75 + * They are used to handle the last bytes which are not a multiple of 4.
76 + */
77 +#define SAVED_X X15
78 +#define SAVED_CA $s7
79 +
80 +#define IS_UNALIGNED $s7 /* same register as SAVED_CA */
81 +
82 +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
83 +#define MSB 0
84 +#define LSB 3
85 +#define ROTx rotl
86 +#define ROTR(n) rotr n, 24
87 +#define CPU_TO_LE32(n) \
88 + wsbh n; \
89 + rotr n, 16;
90 +#else
91 +#define MSB 3
92 +#define LSB 0
93 +#define ROTx rotr
94 +#define CPU_TO_LE32(n)
95 +#define ROTR(n)
96 +#endif
97 +
98 +#define FOR_EACH_WORD(x) /* expands x(0) up to x(15), ascending */ \
99 + x( 0); \
100 + x( 1); \
101 + x( 2); \
102 + x( 3); \
103 + x( 4); \
104 + x( 5); \
105 + x( 6); \
106 + x( 7); \
107 + x( 8); \
108 + x( 9); \
109 + x(10); \
110 + x(11); \
111 + x(12); \
112 + x(13); \
113 + x(14); \
114 + x(15);
115 +
116 +#define FOR_EACH_WORD_REV(x) /* expands x(15) down to x(0), descending */ \
117 + x(15); \
118 + x(14); \
119 + x(13); \
120 + x(12); \
121 + x(11); \
122 + x(10); \
123 + x( 9); \
124 + x( 8); \
125 + x( 7); \
126 + x( 6); \
127 + x( 5); \
128 + x( 4); \
129 + x( 3); \
130 + x( 2); \
131 + x( 1); \
132 + x( 0);
133 +
134 +#define PLUS_ONE_0 1
135 +#define PLUS_ONE_1 2
136 +#define PLUS_ONE_2 3
137 +#define PLUS_ONE_3 4
138 +#define PLUS_ONE_4 5
139 +#define PLUS_ONE_5 6
140 +#define PLUS_ONE_6 7
141 +#define PLUS_ONE_7 8
142 +#define PLUS_ONE_8 9
143 +#define PLUS_ONE_9 10
144 +#define PLUS_ONE_10 11
145 +#define PLUS_ONE_11 12
146 +#define PLUS_ONE_12 13
147 +#define PLUS_ONE_13 14
148 +#define PLUS_ONE_14 15
149 +#define PLUS_ONE_15 16
150 +#define PLUS_ONE(x) PLUS_ONE_ ## x /* x + 1 as a single token, via the table above */
151 +#define _CONCAT3(a,b,c) a ## b ## c
152 +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) /* extra level so arguments expand before pasting */
153 +
154 +#define STORE_UNALIGNED(x) /* word x: add input state, xor with IN, store to OUT via lwl/lwr and swl/swr */ \
155 +CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) /* label is numbered x + 1 */ \
156 + .if (x != 12); \
157 + lw T0, (x*4)(STATE); \
158 + .endif; \
159 + lwl T1, (x*4)+MSB ## (IN); \
160 + lwr T1, (x*4)+LSB ## (IN); \
161 + .if (x == 12); /* word 12 is added from NONCE_0 instead of memory */ \
162 + addu X ## x, NONCE_0; \
163 + .else; \
164 + addu X ## x, T0; \
165 + .endif; \
166 + CPU_TO_LE32(X ## x); \
167 + xor X ## x, T1; \
168 + swl X ## x, (x*4)+MSB ## (OUT); \
169 + swr X ## x, (x*4)+LSB ## (OUT);
170 +
171 +#define STORE_ALIGNED(x) /* word x: add input state, xor with IN, store to OUT via plain lw/sw */ \
172 +CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) /* label is numbered x + 1 */ \
173 + .if (x != 12); \
174 + lw T0, (x*4)(STATE); \
175 + .endif; \
176 + lw T1, (x*4) ## (IN); \
177 + .if (x == 12); /* word 12 is added from NONCE_0 instead of memory */ \
178 + addu X ## x, NONCE_0; \
179 + .else; \
180 + addu X ## x, T0; \
181 + .endif; \
182 + CPU_TO_LE32(X ## x); \
183 + xor X ## x, T1; \
184 + sw X ## x, (x*4) ## (OUT);
185 +
186 +/* Jump table macro.
187 + * Used for setup and handling the last bytes, which are not a multiple of 4.
188 + * X15 is free to store Xn (SAVED_X is X15).
189 + * Every jump table entry must be equal in size (a branch plus its delay slot).
190 + */
191 +#define JMPTBL_ALIGNED(x) \
192 +.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
193 + .set noreorder; \
194 + b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
195 + .if (x == 12); /* delay slot: SAVED_X = Xx + matching input state word */ \
196 + addu SAVED_X, X ## x, NONCE_0; \
197 + .else; \
198 + addu SAVED_X, X ## x, SAVED_CA; \
199 + .endif; \
200 + .set reorder
201 +
202 +#define JMPTBL_UNALIGNED(x) \
203 +.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
204 + .set noreorder; \
205 + b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
206 + .if (x == 12); /* delay slot: SAVED_X = Xx + matching input state word */ \
207 + addu SAVED_X, X ## x, NONCE_0; \
208 + .else; \
209 + addu SAVED_X, X ## x, SAVED_CA; \
210 + .endif; \
211 + .set reorder
212 +
213 +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) /* add, xor, rotate left by S on four lanes */ \
214 + addu X(A), X(K); \
215 + addu X(B), X(L); \
216 + addu X(C), X(M); \
217 + addu X(D), X(N); \
218 + xor X(V), X(A); \
219 + xor X(W), X(B); \
220 + xor X(Y), X(C); \
221 + xor X(Z), X(D); \
222 + rotl X(V), S; \
223 + rotl X(W), S; \
224 + rotl X(Y), S; \
225 + rotl X(Z), S;
226 +
227 +.text
228 +.set reorder
229 +.set noat
230 +.globl chacha20_mips
231 +.ent chacha20_mips
232 +chacha20_mips:
233 + .frame $sp, STACK_SIZE, $ra
234 +
235 + addiu $sp, -STACK_SIZE
236 +
237 + /* Nothing to do when BYTES == 0. */
238 + beqz BYTES, .Lchacha20_mips_end
239 +
240 + lw NONCE_0, 48(STATE)
241 +
242 + /* Save s0-s7 */
243 + sw $s0, 0($sp)
244 + sw $s1, 4($sp)
245 + sw $s2, 8($sp)
246 + sw $s3, 12($sp)
247 + sw $s4, 16($sp)
248 + sw $s5, 20($sp)
249 + sw $s6, 24($sp)
250 + sw $s7, 28($sp)
251 +
252 + /* Test IN or OUT is unaligned.
253 + * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
254 + */
255 + or IS_UNALIGNED, IN, OUT
256 + andi IS_UNALIGNED, 0x3
257 +
258 + /* Set number of rounds */
259 + li $at, 20
260 +
261 + b .Lchacha20_rounds_start
262 +
263 +.align 4
264 +.Loop_chacha20_rounds:
265 + addiu IN, CHACHA20_BLOCK_SIZE
266 + addiu OUT, CHACHA20_BLOCK_SIZE
267 + addiu NONCE_0, 1
268 +
269 +.Lchacha20_rounds_start:
270 + lw X0, 0(STATE)
271 + lw X1, 4(STATE)
272 + lw X2, 8(STATE)
273 + lw X3, 12(STATE)
274 +
275 + lw X4, 16(STATE)
276 + lw X5, 20(STATE)
277 + lw X6, 24(STATE)
278 + lw X7, 28(STATE)
279 + lw X8, 32(STATE)
280 + lw X9, 36(STATE)
281 + lw X10, 40(STATE)
282 + lw X11, 44(STATE)
283 +
284 + move X12, NONCE_0
285 + lw X13, 52(STATE)
286 + lw X14, 56(STATE)
287 + lw X15, 60(STATE)
288 +
289 +.Loop_chacha20_xor_rounds:
290 + addiu $at, -2 /* the round counter drops by two per pass */
291 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
292 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
293 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
294 + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
295 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
296 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
297 + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
298 + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
299 + bnez $at, .Loop_chacha20_xor_rounds
300 +
301 + addiu BYTES, -(CHACHA20_BLOCK_SIZE)
302 +
303 + /* Is data src/dst unaligned? Jump */
304 + bnez IS_UNALIGNED, .Loop_chacha20_unaligned
305 +
306 + /* Set number of rounds here to fill the delay slot. */
307 + li $at, 20
308 +
309 + /* BYTES < 0, it has no full block. */
310 + bltz BYTES, .Lchacha20_mips_no_full_block_aligned
311 +
312 + FOR_EACH_WORD_REV(STORE_ALIGNED)
313 +
314 + /* BYTES > 0? Loop again. */
315 + bgtz BYTES, .Loop_chacha20_rounds
316 +
317 + /* Place this here to fill delay slot */
318 + addiu NONCE_0, 1
319 +
320 + /* BYTES < 0? Handle last bytes */
321 + bltz BYTES, .Lchacha20_mips_xor_bytes
322 +
323 +.Lchacha20_mips_xor_done:
324 + /* Restore used registers */
325 + lw $s0, 0($sp)
326 + lw $s1, 4($sp)
327 + lw $s2, 8($sp)
328 + lw $s3, 12($sp)
329 + lw $s4, 16($sp)
330 + lw $s5, 20($sp)
331 + lw $s6, 24($sp)
332 + lw $s7, 28($sp)
333 +
334 + /* Write NONCE_0 back to right location in state */
335 + sw NONCE_0, 48(STATE)
336 +
337 +.Lchacha20_mips_end:
338 + addiu $sp, STACK_SIZE
339 + jr $ra
340 +
341 +.Lchacha20_mips_no_full_block_aligned:
342 + /* Restore the offset on BYTES */
343 + addiu BYTES, CHACHA20_BLOCK_SIZE
344 +
345 + /* Get number of full WORDS */
346 + andi $at, BYTES, MASK_U32
347 +
348 + /* Load upper half of jump table addr */
349 + lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
350 +
351 + /* Calculate lower half jump table offset */
352 + ins T0, $at, 1, 6
353 +
354 + /* Add offset to STATE */
355 + addu T1, STATE, $at
356 +
357 + /* Add lower half jump table addr */
358 + addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
359 +
360 + /* Read value from STATE */
361 + lw SAVED_CA, 0(T1)
362 +
363 + /* Store remaining bytecounter as negative value */
364 + subu BYTES, $at, BYTES
365 +
366 + jr T0
367 +
368 + /* Jump table */
369 + FOR_EACH_WORD(JMPTBL_ALIGNED)
370 +
371 +
372 +.Loop_chacha20_unaligned:
373 + /* Set number of rounds here to fill the delay slot. */
374 + li $at, 20
375 +
376 + /* BYTES < 0, it has no full block. */
377 + bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
378 +
379 + FOR_EACH_WORD_REV(STORE_UNALIGNED)
380 +
381 + /* BYTES > 0? Loop again. */
382 + bgtz BYTES, .Loop_chacha20_rounds
383 +
384 + /* Write NONCE_0 back to right location in state */
385 + sw NONCE_0, 48(STATE)
386 +
387 + .set noreorder
388 + /* Fall through to byte handling */
389 + bgez BYTES, .Lchacha20_mips_xor_done
390 +.Lchacha20_mips_xor_unaligned_0_b:
391 +.Lchacha20_mips_xor_aligned_0_b:
392 + /* Place this here to fill delay slot */
393 + addiu NONCE_0, 1
394 + .set reorder
395 +
396 +.Lchacha20_mips_xor_bytes:
397 + addu IN, $at
398 + addu OUT, $at
399 + /* First byte */
400 + lbu T1, 0(IN)
401 + addiu $at, BYTES, 1
402 + CPU_TO_LE32(SAVED_X)
403 + ROTR(SAVED_X)
404 + xor T1, SAVED_X
405 + sb T1, 0(OUT)
406 + beqz $at, .Lchacha20_mips_xor_done
407 + /* Second byte */
408 + lbu T1, 1(IN)
409 + addiu $at, BYTES, 2
410 + ROTx SAVED_X, 8
411 + xor T1, SAVED_X
412 + sb T1, 1(OUT)
413 + beqz $at, .Lchacha20_mips_xor_done
414 + /* Third byte */
415 + lbu T1, 2(IN)
416 + ROTx SAVED_X, 8
417 + xor T1, SAVED_X
418 + sb T1, 2(OUT)
419 + b .Lchacha20_mips_xor_done
420 +
421 +.Lchacha20_mips_no_full_block_unaligned:
422 + /* Restore the offset on BYTES */
423 + addiu BYTES, CHACHA20_BLOCK_SIZE
424 +
425 + /* Get number of full WORDS */
426 + andi $at, BYTES, MASK_U32
427 +
428 + /* Load upper half of jump table addr */
429 + lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
430 +
431 + /* Calculate lower half jump table offset */
432 + ins T0, $at, 1, 6
433 +
434 + /* Add offset to STATE */
435 + addu T1, STATE, $at
436 +
437 + /* Add lower half jump table addr */
438 + addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
439 +
440 + /* Read value from STATE */
441 + lw SAVED_CA, 0(T1)
442 +
443 + /* Store remaining bytecounter as negative value */
444 + subu BYTES, $at, BYTES
445 +
446 + jr T0
447 +
448 + /* Jump table */
449 + FOR_EACH_WORD(JMPTBL_UNALIGNED)
450 +.end chacha20_mips
451 +.set at