kernel: 5.4: import wireguard backport
openwrt/openwrt.git: target/linux/generic/backport-5.4/080-wireguard-0019-crypto-arm-poly1305-incorporate-OpenSSL-CRYPTOGAMS-N.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:25 +0100
4 Subject: [PATCH] crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON
5 implementation
6
7 commit a6b803b3ddc793d6db0c16f12fc12d30d20fa9cc upstream.
8
9 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation
10 for NEON authored by Andy Polyakov, and contributed by him to the OpenSSL
11 project. The file 'poly1305-armv4.pl' is taken straight from this upstream
12 GitHub repository [0] at commit ec55a08dc0244ce570c4fc7cade330c60798952f,
13 and already contains all the changes required to build it as part of a
14 Linux kernel module.
15
16 [0] https://github.com/dot-asm/cryptogams
17
18 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
19 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
20 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
21 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
22 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
23 ---
24 arch/arm/crypto/Kconfig | 5 +
25 arch/arm/crypto/Makefile | 12 +-
26 arch/arm/crypto/poly1305-armv4.pl | 1236 +++++++++++++++++++++++
27 arch/arm/crypto/poly1305-core.S_shipped | 1158 +++++++++++++++++++++
28 arch/arm/crypto/poly1305-glue.c | 276 +++++
29 lib/crypto/Kconfig | 2 +-
30 6 files changed, 2687 insertions(+), 2 deletions(-)
31 create mode 100644 arch/arm/crypto/poly1305-armv4.pl
32 create mode 100644 arch/arm/crypto/poly1305-core.S_shipped
33 create mode 100644 arch/arm/crypto/poly1305-glue.c
34
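The imported assembly keeps the 130-bit Poly1305 accumulator in one of two radixes: base 2^32 (four 32-bit words plus a small top word) on the scalar integer path, and base 2^26 (five 26-bit limbs) on the NEON path, converting at the points marked "base 2^32 -> base 2^26" in the code below. As an illustration of that conversion, here is a C sketch written for this note -- it is not code taken from the patch:

#include <stdint.h>

/* Split h[0..3] (32-bit words) plus the small top word h[4] into five
 * 26-bit limbs, exactly as the "base 2^32 -> base 2^26" sequences do. */
static void base2_32_to_2_26(uint32_t l[5], const uint32_t h[5])
{
	l[0] =   h[0]                        & 0x03ffffff;
	l[1] = ((h[0] >> 26) | (h[1] <<  6)) & 0x03ffffff;
	l[2] = ((h[1] >> 20) | (h[2] << 12)) & 0x03ffffff;
	l[3] = ((h[2] >> 14) | (h[3] << 18)) & 0x03ffffff;
	l[4] =  (h[3] >>  8) | (h[4] << 24);
}

/* Inverse conversion, as done on entry to the scalar poly1305_blocks when
 * the state was last written in base 2^26 by the NEON code. */
static void base2_26_to_2_32(uint32_t h[5], const uint32_t l[5])
{
	uint64_t t;

	t = (uint64_t)l[0] + ((uint64_t)l[1] << 26); h[0] = (uint32_t)t;
	t = (t >> 32)      + ((uint64_t)l[2] << 20); h[1] = (uint32_t)t;
	t = (t >> 32)      + ((uint64_t)l[3] << 14); h[2] = (uint32_t)t;
	t = (t >> 32)      + ((uint64_t)l[4] <<  8); h[3] = (uint32_t)t;
	h[4] = (uint32_t)(t >> 32);
}

Splitting the state into 26-bit limbs leaves enough headroom that the NEON 32x32->64 multiply-accumulate chains (vmlal.u32) cannot overflow 64 bits before the lazy reduction runs.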
35 --- a/arch/arm/crypto/Kconfig
36 +++ b/arch/arm/crypto/Kconfig
37 @@ -131,6 +131,11 @@ config CRYPTO_CHACHA20_NEON
38 select CRYPTO_BLKCIPHER
39 select CRYPTO_ARCH_HAVE_LIB_CHACHA
40
41 +config CRYPTO_POLY1305_ARM
42 + tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
43 + select CRYPTO_HASH
44 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
45 +
46 config CRYPTO_NHPOLY1305_NEON
47 tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
48 depends on KERNEL_MODE_NEON
49 --- a/arch/arm/crypto/Makefile
50 +++ b/arch/arm/crypto/Makefile
51 @@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sh
52 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
53 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
54 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
55 +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
56 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
57
58 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
59 @@ -55,12 +56,16 @@ crct10dif-arm-ce-y := crct10dif-ce-core.
60 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
61 chacha-neon-y := chacha-scalar-core.o chacha-glue.o
62 chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
63 +poly1305-arm-y := poly1305-core.o poly1305-glue.o
64 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
65
66 ifdef REGENERATE_ARM_CRYPTO
67 quiet_cmd_perl = PERL $@
68 cmd_perl = $(PERL) $(<) > $(@)
69
70 +$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
71 + $(call cmd,perl)
72 +
73 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
74 $(call cmd,perl)
75
76 @@ -68,4 +73,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha
77 $(call cmd,perl)
78 endif
79
80 -clean-files += sha256-core.S sha512-core.S
81 +clean-files += poly1305-core.S sha256-core.S sha512-core.S
82 +
83 +# massage the perlasm code a bit so we only get the NEON routine if we need it
84 +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
85 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
86 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
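The AFLAGS override above works because, when built for the kernel, the perlasm defines __ARM_MAX_ARCH__ as __LINUX_ARM_ARCH__ and wraps its entire NEON half in "#if __ARM_MAX_ARCH__>=7"; forcing the macro to 7 only when CONFIG_KERNEL_MODE_NEON is enabled therefore assembles or drops the NEON routines from the one shared source. A stand-alone C illustration of the same preprocessor gating (written for this note, not part of the patch):

#include <stdio.h>

#ifndef __LINUX_ARM_ARCH__
#define __LINUX_ARM_ARCH__ 5                /* override with -D__LINUX_ARM_ARCH__=7 */
#endif
#define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ /* mirrors the #define in the perlasm */

int main(void)
{
#if __ARM_MAX_ARCH__ >= 7
	puts("NEON section kept (the CONFIG_KERNEL_MODE_NEON case)");
#else
	puts("NEON section preprocessed away (scalar-only build)");
#endif
	return 0;
}

Compiling with -D__LINUX_ARM_ARCH__=7 versus =5 flips which branch survives preprocessing, just as the assembler flags above do for poly1305-core.o.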
87 --- /dev/null
88 +++ b/arch/arm/crypto/poly1305-armv4.pl
89 @@ -0,0 +1,1236 @@
90 +#!/usr/bin/env perl
91 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
92 +#
93 +# ====================================================================
94 +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
95 +# project.
96 +# ====================================================================
97 +#
98 +# IALU(*)/gcc-4.4 NEON
99 +#
100 +# ARM11xx(ARMv6) 7.78/+100% -
101 +# Cortex-A5 6.35/+130% 3.00
102 +# Cortex-A8 6.25/+115% 2.36
103 +# Cortex-A9 5.10/+95% 2.55
104 +# Cortex-A15 3.85/+85% 1.25(**)
105 +# Snapdragon S4 5.70/+100% 1.48(**)
106 +#
107 +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
108 +# (**) these are trade-off results, they can be improved by ~8% but at
109 +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
110 +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
111 +
112 +$flavour = shift;
113 +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
114 +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
115 +
116 +if ($flavour && $flavour ne "void") {
117 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
118 + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
119 + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
120 + die "can't locate arm-xlate.pl";
121 +
122 + open STDOUT,"| \"$^X\" $xlate $flavour $output";
123 +} else {
124 + open STDOUT,">$output";
125 +}
126 +
127 +($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
128 +
129 +$code.=<<___;
130 +#ifndef __KERNEL__
131 +# include "arm_arch.h"
132 +#else
133 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
134 +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
135 +# define poly1305_init poly1305_init_arm
136 +# define poly1305_blocks poly1305_blocks_arm
137 +# define poly1305_emit poly1305_emit_arm
138 +.globl poly1305_blocks_neon
139 +#endif
140 +
141 +#if defined(__thumb2__)
142 +.syntax unified
143 +.thumb
144 +#else
145 +.code 32
146 +#endif
147 +
148 +.text
149 +
150 +.globl poly1305_emit
151 +.globl poly1305_blocks
152 +.globl poly1305_init
153 +.type poly1305_init,%function
154 +.align 5
155 +poly1305_init:
156 +.Lpoly1305_init:
157 + stmdb sp!,{r4-r11}
158 +
159 + eor r3,r3,r3
160 + cmp $inp,#0
161 + str r3,[$ctx,#0] @ zero hash value
162 + str r3,[$ctx,#4]
163 + str r3,[$ctx,#8]
164 + str r3,[$ctx,#12]
165 + str r3,[$ctx,#16]
166 + str r3,[$ctx,#36] @ clear is_base2_26
167 + add $ctx,$ctx,#20
168 +
169 +#ifdef __thumb2__
170 + it eq
171 +#endif
172 + moveq r0,#0
173 + beq .Lno_key
174 +
175 +#if __ARM_MAX_ARCH__>=7
176 + mov r3,#-1
177 + str r3,[$ctx,#28] @ impossible key power value
178 +# ifndef __KERNEL__
179 + adr r11,.Lpoly1305_init
180 + ldr r12,.LOPENSSL_armcap
181 +# endif
182 +#endif
183 + ldrb r4,[$inp,#0]
184 + mov r10,#0x0fffffff
185 + ldrb r5,[$inp,#1]
186 + and r3,r10,#-4 @ 0x0ffffffc
187 + ldrb r6,[$inp,#2]
188 + ldrb r7,[$inp,#3]
189 + orr r4,r4,r5,lsl#8
190 + ldrb r5,[$inp,#4]
191 + orr r4,r4,r6,lsl#16
192 + ldrb r6,[$inp,#5]
193 + orr r4,r4,r7,lsl#24
194 + ldrb r7,[$inp,#6]
195 + and r4,r4,r10
196 +
197 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
198 +# if !defined(_WIN32)
199 + ldr r12,[r11,r12] @ OPENSSL_armcap_P
200 +# endif
201 +# if defined(__APPLE__) || defined(_WIN32)
202 + ldr r12,[r12]
203 +# endif
204 +#endif
205 + ldrb r8,[$inp,#7]
206 + orr r5,r5,r6,lsl#8
207 + ldrb r6,[$inp,#8]
208 + orr r5,r5,r7,lsl#16
209 + ldrb r7,[$inp,#9]
210 + orr r5,r5,r8,lsl#24
211 + ldrb r8,[$inp,#10]
212 + and r5,r5,r3
213 +
214 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
215 + tst r12,#ARMV7_NEON @ check for NEON
216 +# ifdef __thumb2__
217 + adr r9,.Lpoly1305_blocks_neon
218 + adr r11,.Lpoly1305_blocks
219 + it ne
220 + movne r11,r9
221 + adr r12,.Lpoly1305_emit
222 + orr r11,r11,#1 @ thumb-ify addresses
223 + orr r12,r12,#1
224 +# else
225 + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
226 + ite eq
227 + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
228 + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
229 +# endif
230 +#endif
231 + ldrb r9,[$inp,#11]
232 + orr r6,r6,r7,lsl#8
233 + ldrb r7,[$inp,#12]
234 + orr r6,r6,r8,lsl#16
235 + ldrb r8,[$inp,#13]
236 + orr r6,r6,r9,lsl#24
237 + ldrb r9,[$inp,#14]
238 + and r6,r6,r3
239 +
240 + ldrb r10,[$inp,#15]
241 + orr r7,r7,r8,lsl#8
242 + str r4,[$ctx,#0]
243 + orr r7,r7,r9,lsl#16
244 + str r5,[$ctx,#4]
245 + orr r7,r7,r10,lsl#24
246 + str r6,[$ctx,#8]
247 + and r7,r7,r3
248 + str r7,[$ctx,#12]
249 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
250 + stmia r2,{r11,r12} @ fill functions table
251 + mov r0,#1
252 +#else
253 + mov r0,#0
254 +#endif
255 +.Lno_key:
256 + ldmia sp!,{r4-r11}
257 +#if __ARM_ARCH__>=5
258 + ret @ bx lr
259 +#else
260 + tst lr,#1
261 + moveq pc,lr @ be binary compatible with V4, yet
262 + bx lr @ interoperable with Thumb ISA:-)
263 +#endif
264 +.size poly1305_init,.-poly1305_init
265 +___
266 +{
267 +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
268 +my ($s1,$s2,$s3)=($r1,$r2,$r3);
269 +
270 +$code.=<<___;
271 +.type poly1305_blocks,%function
272 +.align 5
273 +poly1305_blocks:
274 +.Lpoly1305_blocks:
275 + stmdb sp!,{r3-r11,lr}
276 +
277 + ands $len,$len,#-16
278 + beq .Lno_data
279 +
280 + add $len,$len,$inp @ end pointer
281 + sub sp,sp,#32
282 +
283 +#if __ARM_ARCH__<7
284 + ldmia $ctx,{$h0-$r3} @ load context
285 + add $ctx,$ctx,#20
286 + str $len,[sp,#16] @ offload stuff
287 + str $ctx,[sp,#12]
288 +#else
289 + ldr lr,[$ctx,#36] @ is_base2_26
290 + ldmia $ctx!,{$h0-$h4} @ load hash value
291 + str $len,[sp,#16] @ offload stuff
292 + str $ctx,[sp,#12]
293 +
294 + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
295 + mov $r1,$h1,lsr#6
296 + adcs $r1,$r1,$h2,lsl#20
297 + mov $r2,$h2,lsr#12
298 + adcs $r2,$r2,$h3,lsl#14
299 + mov $r3,$h3,lsr#18
300 + adcs $r3,$r3,$h4,lsl#8
301 + mov $len,#0
302 + teq lr,#0
303 + str $len,[$ctx,#16] @ clear is_base2_26
304 + adc $len,$len,$h4,lsr#24
305 +
306 + itttt ne
307 + movne $h0,$r0 @ choose between radixes
308 + movne $h1,$r1
309 + movne $h2,$r2
310 + movne $h3,$r3
311 + ldmia $ctx,{$r0-$r3} @ load key
312 + it ne
313 + movne $h4,$len
314 +#endif
315 +
316 + mov lr,$inp
317 + cmp $padbit,#0
318 + str $r1,[sp,#20]
319 + str $r2,[sp,#24]
320 + str $r3,[sp,#28]
321 + b .Loop
322 +
323 +.align 4
324 +.Loop:
325 +#if __ARM_ARCH__<7
326 + ldrb r0,[lr],#16 @ load input
327 +# ifdef __thumb2__
328 + it hi
329 +# endif
330 + addhi $h4,$h4,#1 @ 1<<128
331 + ldrb r1,[lr,#-15]
332 + ldrb r2,[lr,#-14]
333 + ldrb r3,[lr,#-13]
334 + orr r1,r0,r1,lsl#8
335 + ldrb r0,[lr,#-12]
336 + orr r2,r1,r2,lsl#16
337 + ldrb r1,[lr,#-11]
338 + orr r3,r2,r3,lsl#24
339 + ldrb r2,[lr,#-10]
340 + adds $h0,$h0,r3 @ accumulate input
341 +
342 + ldrb r3,[lr,#-9]
343 + orr r1,r0,r1,lsl#8
344 + ldrb r0,[lr,#-8]
345 + orr r2,r1,r2,lsl#16
346 + ldrb r1,[lr,#-7]
347 + orr r3,r2,r3,lsl#24
348 + ldrb r2,[lr,#-6]
349 + adcs $h1,$h1,r3
350 +
351 + ldrb r3,[lr,#-5]
352 + orr r1,r0,r1,lsl#8
353 + ldrb r0,[lr,#-4]
354 + orr r2,r1,r2,lsl#16
355 + ldrb r1,[lr,#-3]
356 + orr r3,r2,r3,lsl#24
357 + ldrb r2,[lr,#-2]
358 + adcs $h2,$h2,r3
359 +
360 + ldrb r3,[lr,#-1]
361 + orr r1,r0,r1,lsl#8
362 + str lr,[sp,#8] @ offload input pointer
363 + orr r2,r1,r2,lsl#16
364 + add $s1,$r1,$r1,lsr#2
365 + orr r3,r2,r3,lsl#24
366 +#else
367 + ldr r0,[lr],#16 @ load input
368 + it hi
369 + addhi $h4,$h4,#1 @ padbit
370 + ldr r1,[lr,#-12]
371 + ldr r2,[lr,#-8]
372 + ldr r3,[lr,#-4]
373 +# ifdef __ARMEB__
374 + rev r0,r0
375 + rev r1,r1
376 + rev r2,r2
377 + rev r3,r3
378 +# endif
379 + adds $h0,$h0,r0 @ accumulate input
380 + str lr,[sp,#8] @ offload input pointer
381 + adcs $h1,$h1,r1
382 + add $s1,$r1,$r1,lsr#2
383 + adcs $h2,$h2,r2
384 +#endif
385 + add $s2,$r2,$r2,lsr#2
386 + adcs $h3,$h3,r3
387 + add $s3,$r3,$r3,lsr#2
388 +
389 + umull r2,r3,$h1,$r0
390 + adc $h4,$h4,#0
391 + umull r0,r1,$h0,$r0
392 + umlal r2,r3,$h4,$s1
393 + umlal r0,r1,$h3,$s1
394 + ldr $r1,[sp,#20] @ reload $r1
395 + umlal r2,r3,$h2,$s3
396 + umlal r0,r1,$h1,$s3
397 + umlal r2,r3,$h3,$s2
398 + umlal r0,r1,$h2,$s2
399 + umlal r2,r3,$h0,$r1
400 + str r0,[sp,#0] @ future $h0
401 + mul r0,$s2,$h4
402 + ldr $r2,[sp,#24] @ reload $r2
403 + adds r2,r2,r1 @ d1+=d0>>32
404 + eor r1,r1,r1
405 + adc lr,r3,#0 @ future $h2
406 + str r2,[sp,#4] @ future $h1
407 +
408 + mul r2,$s3,$h4
409 + eor r3,r3,r3
410 + umlal r0,r1,$h3,$s3
411 + ldr $r3,[sp,#28] @ reload $r3
412 + umlal r2,r3,$h3,$r0
413 + umlal r0,r1,$h2,$r0
414 + umlal r2,r3,$h2,$r1
415 + umlal r0,r1,$h1,$r1
416 + umlal r2,r3,$h1,$r2
417 + umlal r0,r1,$h0,$r2
418 + umlal r2,r3,$h0,$r3
419 + ldr $h0,[sp,#0]
420 + mul $h4,$r0,$h4
421 + ldr $h1,[sp,#4]
422 +
423 + adds $h2,lr,r0 @ d2+=d1>>32
424 + ldr lr,[sp,#8] @ reload input pointer
425 + adc r1,r1,#0
426 + adds $h3,r2,r1 @ d3+=d2>>32
427 + ldr r0,[sp,#16] @ reload end pointer
428 + adc r3,r3,#0
429 + add $h4,$h4,r3 @ h4+=d3>>32
430 +
431 + and r1,$h4,#-4
432 + and $h4,$h4,#3
433 + add r1,r1,r1,lsr#2 @ *=5
434 + adds $h0,$h0,r1
435 + adcs $h1,$h1,#0
436 + adcs $h2,$h2,#0
437 + adcs $h3,$h3,#0
438 + adc $h4,$h4,#0
439 +
440 + cmp r0,lr @ done yet?
441 + bhi .Loop
442 +
443 + ldr $ctx,[sp,#12]
444 + add sp,sp,#32
445 + stmdb $ctx,{$h0-$h4} @ store the result
446 +
447 +.Lno_data:
448 +#if __ARM_ARCH__>=5
449 + ldmia sp!,{r3-r11,pc}
450 +#else
451 + ldmia sp!,{r3-r11,lr}
452 + tst lr,#1
453 + moveq pc,lr @ be binary compatible with V4, yet
454 + bx lr @ interoperable with Thumb ISA:-)
455 +#endif
456 +.size poly1305_blocks,.-poly1305_blocks
457 +___
458 +}
459 +{
460 +my ($ctx,$mac,$nonce)=map("r$_",(0..2));
461 +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
462 +my $g4=$ctx;
463 +
464 +$code.=<<___;
465 +.type poly1305_emit,%function
466 +.align 5
467 +poly1305_emit:
468 +.Lpoly1305_emit:
469 + stmdb sp!,{r4-r11}
470 +
471 + ldmia $ctx,{$h0-$h4}
472 +
473 +#if __ARM_ARCH__>=7
474 + ldr ip,[$ctx,#36] @ is_base2_26
475 +
476 + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
477 + mov $g1,$h1,lsr#6
478 + adcs $g1,$g1,$h2,lsl#20
479 + mov $g2,$h2,lsr#12
480 + adcs $g2,$g2,$h3,lsl#14
481 + mov $g3,$h3,lsr#18
482 + adcs $g3,$g3,$h4,lsl#8
483 + mov $g4,#0
484 + adc $g4,$g4,$h4,lsr#24
485 +
486 + tst ip,ip
487 + itttt ne
488 + movne $h0,$g0
489 + movne $h1,$g1
490 + movne $h2,$g2
491 + movne $h3,$g3
492 + it ne
493 + movne $h4,$g4
494 +#endif
495 +
496 + adds $g0,$h0,#5 @ compare to modulus
497 + adcs $g1,$h1,#0
498 + adcs $g2,$h2,#0
499 + adcs $g3,$h3,#0
500 + adc $g4,$h4,#0
501 + tst $g4,#4 @ did it carry/borrow?
502 +
503 +#ifdef __thumb2__
504 + it ne
505 +#endif
506 + movne $h0,$g0
507 + ldr $g0,[$nonce,#0]
508 +#ifdef __thumb2__
509 + it ne
510 +#endif
511 + movne $h1,$g1
512 + ldr $g1,[$nonce,#4]
513 +#ifdef __thumb2__
514 + it ne
515 +#endif
516 + movne $h2,$g2
517 + ldr $g2,[$nonce,#8]
518 +#ifdef __thumb2__
519 + it ne
520 +#endif
521 + movne $h3,$g3
522 + ldr $g3,[$nonce,#12]
523 +
524 + adds $h0,$h0,$g0
525 + adcs $h1,$h1,$g1
526 + adcs $h2,$h2,$g2
527 + adc $h3,$h3,$g3
528 +
529 +#if __ARM_ARCH__>=7
530 +# ifdef __ARMEB__
531 + rev $h0,$h0
532 + rev $h1,$h1
533 + rev $h2,$h2
534 + rev $h3,$h3
535 +# endif
536 + str $h0,[$mac,#0]
537 + str $h1,[$mac,#4]
538 + str $h2,[$mac,#8]
539 + str $h3,[$mac,#12]
540 +#else
541 + strb $h0,[$mac,#0]
542 + mov $h0,$h0,lsr#8
543 + strb $h1,[$mac,#4]
544 + mov $h1,$h1,lsr#8
545 + strb $h2,[$mac,#8]
546 + mov $h2,$h2,lsr#8
547 + strb $h3,[$mac,#12]
548 + mov $h3,$h3,lsr#8
549 +
550 + strb $h0,[$mac,#1]
551 + mov $h0,$h0,lsr#8
552 + strb $h1,[$mac,#5]
553 + mov $h1,$h1,lsr#8
554 + strb $h2,[$mac,#9]
555 + mov $h2,$h2,lsr#8
556 + strb $h3,[$mac,#13]
557 + mov $h3,$h3,lsr#8
558 +
559 + strb $h0,[$mac,#2]
560 + mov $h0,$h0,lsr#8
561 + strb $h1,[$mac,#6]
562 + mov $h1,$h1,lsr#8
563 + strb $h2,[$mac,#10]
564 + mov $h2,$h2,lsr#8
565 + strb $h3,[$mac,#14]
566 + mov $h3,$h3,lsr#8
567 +
568 + strb $h0,[$mac,#3]
569 + strb $h1,[$mac,#7]
570 + strb $h2,[$mac,#11]
571 + strb $h3,[$mac,#15]
572 +#endif
573 + ldmia sp!,{r4-r11}
574 +#if __ARM_ARCH__>=5
575 + ret @ bx lr
576 +#else
577 + tst lr,#1
578 + moveq pc,lr @ be binary compatible with V4, yet
579 + bx lr @ interoperable with Thumb ISA:-)
580 +#endif
581 +.size poly1305_emit,.-poly1305_emit
582 +___
583 +{
584 +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
585 +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
586 +my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
587 +
588 +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
589 +
590 +$code.=<<___;
591 +#if __ARM_MAX_ARCH__>=7
592 +.fpu neon
593 +
594 +.type poly1305_init_neon,%function
595 +.align 5
596 +poly1305_init_neon:
597 +.Lpoly1305_init_neon:
598 + ldr r3,[$ctx,#48] @ first table element
599 + cmp r3,#-1 @ is value impossible?
600 + bne .Lno_init_neon
601 +
602 + ldr r4,[$ctx,#20] @ load key base 2^32
603 + ldr r5,[$ctx,#24]
604 + ldr r6,[$ctx,#28]
605 + ldr r7,[$ctx,#32]
606 +
607 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
608 + mov r3,r4,lsr#26
609 + mov r4,r5,lsr#20
610 + orr r3,r3,r5,lsl#6
611 + mov r5,r6,lsr#14
612 + orr r4,r4,r6,lsl#12
613 + mov r6,r7,lsr#8
614 + orr r5,r5,r7,lsl#18
615 + and r3,r3,#0x03ffffff
616 + and r4,r4,#0x03ffffff
617 + and r5,r5,#0x03ffffff
618 +
619 + vdup.32 $R0,r2 @ r^1 in both lanes
620 + add r2,r3,r3,lsl#2 @ *5
621 + vdup.32 $R1,r3
622 + add r3,r4,r4,lsl#2
623 + vdup.32 $S1,r2
624 + vdup.32 $R2,r4
625 + add r4,r5,r5,lsl#2
626 + vdup.32 $S2,r3
627 + vdup.32 $R3,r5
628 + add r5,r6,r6,lsl#2
629 + vdup.32 $S3,r4
630 + vdup.32 $R4,r6
631 + vdup.32 $S4,r5
632 +
633 + mov $zeros,#2 @ counter
634 +
635 +.Lsquare_neon:
636 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
637 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
638 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
639 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
640 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
641 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
642 +
643 + vmull.u32 $D0,$R0,${R0}[1]
644 + vmull.u32 $D1,$R1,${R0}[1]
645 + vmull.u32 $D2,$R2,${R0}[1]
646 + vmull.u32 $D3,$R3,${R0}[1]
647 + vmull.u32 $D4,$R4,${R0}[1]
648 +
649 + vmlal.u32 $D0,$R4,${S1}[1]
650 + vmlal.u32 $D1,$R0,${R1}[1]
651 + vmlal.u32 $D2,$R1,${R1}[1]
652 + vmlal.u32 $D3,$R2,${R1}[1]
653 + vmlal.u32 $D4,$R3,${R1}[1]
654 +
655 + vmlal.u32 $D0,$R3,${S2}[1]
656 + vmlal.u32 $D1,$R4,${S2}[1]
657 + vmlal.u32 $D3,$R1,${R2}[1]
658 + vmlal.u32 $D2,$R0,${R2}[1]
659 + vmlal.u32 $D4,$R2,${R2}[1]
660 +
661 + vmlal.u32 $D0,$R2,${S3}[1]
662 + vmlal.u32 $D3,$R0,${R3}[1]
663 + vmlal.u32 $D1,$R3,${S3}[1]
664 + vmlal.u32 $D2,$R4,${S3}[1]
665 + vmlal.u32 $D4,$R1,${R3}[1]
666 +
667 + vmlal.u32 $D3,$R4,${S4}[1]
668 + vmlal.u32 $D0,$R1,${S4}[1]
669 + vmlal.u32 $D1,$R2,${S4}[1]
670 + vmlal.u32 $D2,$R3,${S4}[1]
671 + vmlal.u32 $D4,$R0,${R4}[1]
672 +
673 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
674 + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
675 + @ and P. Schwabe
676 + @
677 + @ H0>>+H1>>+H2>>+H3>>+H4
678 + @ H3>>+H4>>*5+H0>>+H1
679 + @
680 + @ Trivia.
681 + @
682 + @ Result of multiplication of n-bit number by m-bit number is
683 + @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
684 + @ m-bit number multiplied by 2^n is still n+m bits wide.
685 + @
686 + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
687 + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
688 + @ one is n+1 bits wide.
689 + @
690 + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
691 + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
692 + @ can be 27. However! In cases when their width exceeds 26 bits
693 + @ they are limited by 2^26+2^6. This in turn means that *sum*
694 + @ of the products with these values can still be viewed as sum
695 + @ of 52-bit numbers as long as the amount of addends is not a
696 + @ power of 2. For example,
697 + @
698 + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
699 + @
700 + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
701 + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
702 + @ 8 * (2^52) or 2^55. However, the value is then multiplied by
703 + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
704 + @ which is less than 32 * (2^52) or 2^57. And when processing
705 + @ data we are looking at triple as many addends...
706 + @
707 + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
708 + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
709 + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
710 + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
711 + @ instruction accepts 2x32-bit input and writes 2x64-bit result.
712 + @ This means that result of reduction have to be compressed upon
713 + @ loop wrap-around. This can be done in the process of reduction
714 + @ to minimize amount of instructions [as well as amount of
715 + @ 128-bit instructions, which benefits low-end processors], but
716 + @ one has to watch for H2 (which is narrower than H0) and 5*H4
717 + @ not being wider than 58 bits, so that result of right shift
718 + @ by 26 bits fits in 32 bits. This is also useful on x86,
719 + @ because it allows to use paddd in place for paddq, which
720 + @ benefits Atom, where paddq is ridiculously slow.
721 +
722 + vshr.u64 $T0,$D3,#26
723 + vmovn.i64 $D3#lo,$D3
724 + vshr.u64 $T1,$D0,#26
725 + vmovn.i64 $D0#lo,$D0
726 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4
727 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
728 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1
729 + vbic.i32 $D0#lo,#0xfc000000
730 +
731 + vshrn.u64 $T0#lo,$D4,#26
732 + vmovn.i64 $D4#lo,$D4
733 + vshr.u64 $T1,$D1,#26
734 + vmovn.i64 $D1#lo,$D1
735 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2
736 + vbic.i32 $D4#lo,#0xfc000000
737 + vbic.i32 $D1#lo,#0xfc000000
738 +
739 + vadd.i32 $D0#lo,$D0#lo,$T0#lo
740 + vshl.u32 $T0#lo,$T0#lo,#2
741 + vshrn.u64 $T1#lo,$D2,#26
742 + vmovn.i64 $D2#lo,$D2
743 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
744 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
745 + vbic.i32 $D2#lo,#0xfc000000
746 +
747 + vshr.u32 $T0#lo,$D0#lo,#26
748 + vbic.i32 $D0#lo,#0xfc000000
749 + vshr.u32 $T1#lo,$D3#lo,#26
750 + vbic.i32 $D3#lo,#0xfc000000
751 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
752 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
753 +
754 + subs $zeros,$zeros,#1
755 + beq .Lsquare_break_neon
756 +
757 + add $tbl0,$ctx,#(48+0*9*4)
758 + add $tbl1,$ctx,#(48+1*9*4)
759 +
760 + vtrn.32 $R0,$D0#lo @ r^2:r^1
761 + vtrn.32 $R2,$D2#lo
762 + vtrn.32 $R3,$D3#lo
763 + vtrn.32 $R1,$D1#lo
764 + vtrn.32 $R4,$D4#lo
765 +
766 + vshl.u32 $S2,$R2,#2 @ *5
767 + vshl.u32 $S3,$R3,#2
768 + vshl.u32 $S1,$R1,#2
769 + vshl.u32 $S4,$R4,#2
770 + vadd.i32 $S2,$S2,$R2
771 + vadd.i32 $S1,$S1,$R1
772 + vadd.i32 $S3,$S3,$R3
773 + vadd.i32 $S4,$S4,$R4
774 +
775 + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
776 + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
777 + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
778 + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
779 + vst1.32 {${S4}[0]},[$tbl0,:32]
780 + vst1.32 {${S4}[1]},[$tbl1,:32]
781 +
782 + b .Lsquare_neon
783 +
784 +.align 4
785 +.Lsquare_break_neon:
786 + add $tbl0,$ctx,#(48+2*4*9)
787 + add $tbl1,$ctx,#(48+3*4*9)
788 +
789 + vmov $R0,$D0#lo @ r^4:r^3
790 + vshl.u32 $S1,$D1#lo,#2 @ *5
791 + vmov $R1,$D1#lo
792 + vshl.u32 $S2,$D2#lo,#2
793 + vmov $R2,$D2#lo
794 + vshl.u32 $S3,$D3#lo,#2
795 + vmov $R3,$D3#lo
796 + vshl.u32 $S4,$D4#lo,#2
797 + vmov $R4,$D4#lo
798 + vadd.i32 $S1,$S1,$D1#lo
799 + vadd.i32 $S2,$S2,$D2#lo
800 + vadd.i32 $S3,$S3,$D3#lo
801 + vadd.i32 $S4,$S4,$D4#lo
802 +
803 + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
804 + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
805 + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
806 + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
807 + vst1.32 {${S4}[0]},[$tbl0]
808 + vst1.32 {${S4}[1]},[$tbl1]
809 +
810 +.Lno_init_neon:
811 + ret @ bx lr
812 +.size poly1305_init_neon,.-poly1305_init_neon
813 +
814 +.type poly1305_blocks_neon,%function
815 +.align 5
816 +poly1305_blocks_neon:
817 +.Lpoly1305_blocks_neon:
818 + ldr ip,[$ctx,#36] @ is_base2_26
819 +
820 + cmp $len,#64
821 + blo .Lpoly1305_blocks
822 +
823 + stmdb sp!,{r4-r7}
824 + vstmdb sp!,{d8-d15} @ ABI specification says so
825 +
826 + tst ip,ip @ is_base2_26?
827 + bne .Lbase2_26_neon
828 +
829 + stmdb sp!,{r1-r3,lr}
830 + bl .Lpoly1305_init_neon
831 +
832 + ldr r4,[$ctx,#0] @ load hash value base 2^32
833 + ldr r5,[$ctx,#4]
834 + ldr r6,[$ctx,#8]
835 + ldr r7,[$ctx,#12]
836 + ldr ip,[$ctx,#16]
837 +
838 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
839 + mov r3,r4,lsr#26
840 + veor $D0#lo,$D0#lo,$D0#lo
841 + mov r4,r5,lsr#20
842 + orr r3,r3,r5,lsl#6
843 + veor $D1#lo,$D1#lo,$D1#lo
844 + mov r5,r6,lsr#14
845 + orr r4,r4,r6,lsl#12
846 + veor $D2#lo,$D2#lo,$D2#lo
847 + mov r6,r7,lsr#8
848 + orr r5,r5,r7,lsl#18
849 + veor $D3#lo,$D3#lo,$D3#lo
850 + and r3,r3,#0x03ffffff
851 + orr r6,r6,ip,lsl#24
852 + veor $D4#lo,$D4#lo,$D4#lo
853 + and r4,r4,#0x03ffffff
854 + mov r1,#1
855 + and r5,r5,#0x03ffffff
856 + str r1,[$ctx,#36] @ set is_base2_26
857 +
858 + vmov.32 $D0#lo[0],r2
859 + vmov.32 $D1#lo[0],r3
860 + vmov.32 $D2#lo[0],r4
861 + vmov.32 $D3#lo[0],r5
862 + vmov.32 $D4#lo[0],r6
863 + adr $zeros,.Lzeros
864 +
865 + ldmia sp!,{r1-r3,lr}
866 + b .Lhash_loaded
867 +
868 +.align 4
869 +.Lbase2_26_neon:
870 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
871 + @ load hash value
872 +
873 + veor $D0#lo,$D0#lo,$D0#lo
874 + veor $D1#lo,$D1#lo,$D1#lo
875 + veor $D2#lo,$D2#lo,$D2#lo
876 + veor $D3#lo,$D3#lo,$D3#lo
877 + veor $D4#lo,$D4#lo,$D4#lo
878 + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
879 + adr $zeros,.Lzeros
880 + vld1.32 {$D4#lo[0]},[$ctx]
881 + sub $ctx,$ctx,#16 @ rewind
882 +
883 +.Lhash_loaded:
884 + add $in2,$inp,#32
885 + mov $padbit,$padbit,lsl#24
886 + tst $len,#31
887 + beq .Leven
888 +
889 + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
890 + vmov.32 $H4#lo[0],$padbit
891 + sub $len,$len,#16
892 + add $in2,$inp,#32
893 +
894 +# ifdef __ARMEB__
895 + vrev32.8 $H0,$H0
896 + vrev32.8 $H3,$H3
897 + vrev32.8 $H1,$H1
898 + vrev32.8 $H2,$H2
899 +# endif
900 + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
901 + vshl.u32 $H3#lo,$H3#lo,#18
902 +
903 + vsri.u32 $H3#lo,$H2#lo,#14
904 + vshl.u32 $H2#lo,$H2#lo,#12
905 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
906 +
907 + vbic.i32 $H3#lo,#0xfc000000
908 + vsri.u32 $H2#lo,$H1#lo,#20
909 + vshl.u32 $H1#lo,$H1#lo,#6
910 +
911 + vbic.i32 $H2#lo,#0xfc000000
912 + vsri.u32 $H1#lo,$H0#lo,#26
913 + vadd.i32 $H3#hi,$H3#lo,$D3#lo
914 +
915 + vbic.i32 $H0#lo,#0xfc000000
916 + vbic.i32 $H1#lo,#0xfc000000
917 + vadd.i32 $H2#hi,$H2#lo,$D2#lo
918 +
919 + vadd.i32 $H0#hi,$H0#lo,$D0#lo
920 + vadd.i32 $H1#hi,$H1#lo,$D1#lo
921 +
922 + mov $tbl1,$zeros
923 + add $tbl0,$ctx,#48
924 +
925 + cmp $len,$len
926 + b .Long_tail
927 +
928 +.align 4
929 +.Leven:
930 + subs $len,$len,#64
931 + it lo
932 + movlo $in2,$zeros
933 +
934 + vmov.i32 $H4,#1<<24 @ padbit, yes, always
935 + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
936 + add $inp,$inp,#64
937 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
938 + add $in2,$in2,#64
939 + itt hi
940 + addhi $tbl1,$ctx,#(48+1*9*4)
941 + addhi $tbl0,$ctx,#(48+3*9*4)
942 +
943 +# ifdef __ARMEB__
944 + vrev32.8 $H0,$H0
945 + vrev32.8 $H3,$H3
946 + vrev32.8 $H1,$H1
947 + vrev32.8 $H2,$H2
948 +# endif
949 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
950 + vshl.u32 $H3,$H3,#18
951 +
952 + vsri.u32 $H3,$H2,#14
953 + vshl.u32 $H2,$H2,#12
954 +
955 + vbic.i32 $H3,#0xfc000000
956 + vsri.u32 $H2,$H1,#20
957 + vshl.u32 $H1,$H1,#6
958 +
959 + vbic.i32 $H2,#0xfc000000
960 + vsri.u32 $H1,$H0,#26
961 +
962 + vbic.i32 $H0,#0xfc000000
963 + vbic.i32 $H1,#0xfc000000
964 +
965 + bls .Lskip_loop
966 +
967 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
968 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
969 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
970 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
971 + b .Loop_neon
972 +
973 +.align 5
974 +.Loop_neon:
975 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
976 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
977 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
978 + @ \___________________/
979 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
980 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
981 + @ \___________________/ \____________________/
982 + @
983 + @ Note that we start with inp[2:3]*r^2. This is because it
984 + @ doesn't depend on reduction in previous iteration.
985 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
986 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
987 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
988 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
989 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
990 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
991 +
992 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
993 + @ inp[2:3]*r^2
994 +
995 + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
996 + vmull.u32 $D2,$H2#hi,${R0}[1]
997 + vadd.i32 $H0#lo,$H0#lo,$D0#lo
998 + vmull.u32 $D0,$H0#hi,${R0}[1]
999 + vadd.i32 $H3#lo,$H3#lo,$D3#lo
1000 + vmull.u32 $D3,$H3#hi,${R0}[1]
1001 + vmlal.u32 $D2,$H1#hi,${R1}[1]
1002 + vadd.i32 $H1#lo,$H1#lo,$D1#lo
1003 + vmull.u32 $D1,$H1#hi,${R0}[1]
1004 +
1005 + vadd.i32 $H4#lo,$H4#lo,$D4#lo
1006 + vmull.u32 $D4,$H4#hi,${R0}[1]
1007 + subs $len,$len,#64
1008 + vmlal.u32 $D0,$H4#hi,${S1}[1]
1009 + it lo
1010 + movlo $in2,$zeros
1011 + vmlal.u32 $D3,$H2#hi,${R1}[1]
1012 + vld1.32 ${S4}[1],[$tbl1,:32]
1013 + vmlal.u32 $D1,$H0#hi,${R1}[1]
1014 + vmlal.u32 $D4,$H3#hi,${R1}[1]
1015 +
1016 + vmlal.u32 $D0,$H3#hi,${S2}[1]
1017 + vmlal.u32 $D3,$H1#hi,${R2}[1]
1018 + vmlal.u32 $D4,$H2#hi,${R2}[1]
1019 + vmlal.u32 $D1,$H4#hi,${S2}[1]
1020 + vmlal.u32 $D2,$H0#hi,${R2}[1]
1021 +
1022 + vmlal.u32 $D3,$H0#hi,${R3}[1]
1023 + vmlal.u32 $D0,$H2#hi,${S3}[1]
1024 + vmlal.u32 $D4,$H1#hi,${R3}[1]
1025 + vmlal.u32 $D1,$H3#hi,${S3}[1]
1026 + vmlal.u32 $D2,$H4#hi,${S3}[1]
1027 +
1028 + vmlal.u32 $D3,$H4#hi,${S4}[1]
1029 + vmlal.u32 $D0,$H1#hi,${S4}[1]
1030 + vmlal.u32 $D4,$H0#hi,${R4}[1]
1031 + vmlal.u32 $D1,$H2#hi,${S4}[1]
1032 + vmlal.u32 $D2,$H3#hi,${S4}[1]
1033 +
1034 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
1035 + add $in2,$in2,#64
1036 +
1037 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1038 + @ (hash+inp[0:1])*r^4 and accumulate
1039 +
1040 + vmlal.u32 $D3,$H3#lo,${R0}[0]
1041 + vmlal.u32 $D0,$H0#lo,${R0}[0]
1042 + vmlal.u32 $D4,$H4#lo,${R0}[0]
1043 + vmlal.u32 $D1,$H1#lo,${R0}[0]
1044 + vmlal.u32 $D2,$H2#lo,${R0}[0]
1045 + vld1.32 ${S4}[0],[$tbl0,:32]
1046 +
1047 + vmlal.u32 $D3,$H2#lo,${R1}[0]
1048 + vmlal.u32 $D0,$H4#lo,${S1}[0]
1049 + vmlal.u32 $D4,$H3#lo,${R1}[0]
1050 + vmlal.u32 $D1,$H0#lo,${R1}[0]
1051 + vmlal.u32 $D2,$H1#lo,${R1}[0]
1052 +
1053 + vmlal.u32 $D3,$H1#lo,${R2}[0]
1054 + vmlal.u32 $D0,$H3#lo,${S2}[0]
1055 + vmlal.u32 $D4,$H2#lo,${R2}[0]
1056 + vmlal.u32 $D1,$H4#lo,${S2}[0]
1057 + vmlal.u32 $D2,$H0#lo,${R2}[0]
1058 +
1059 + vmlal.u32 $D3,$H0#lo,${R3}[0]
1060 + vmlal.u32 $D0,$H2#lo,${S3}[0]
1061 + vmlal.u32 $D4,$H1#lo,${R3}[0]
1062 + vmlal.u32 $D1,$H3#lo,${S3}[0]
1063 + vmlal.u32 $D3,$H4#lo,${S4}[0]
1064 +
1065 + vmlal.u32 $D2,$H4#lo,${S3}[0]
1066 + vmlal.u32 $D0,$H1#lo,${S4}[0]
1067 + vmlal.u32 $D4,$H0#lo,${R4}[0]
1068 + vmov.i32 $H4,#1<<24 @ padbit, yes, always
1069 + vmlal.u32 $D1,$H2#lo,${S4}[0]
1070 + vmlal.u32 $D2,$H3#lo,${S4}[0]
1071 +
1072 + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
1073 + add $inp,$inp,#64
1074 +# ifdef __ARMEB__
1075 + vrev32.8 $H0,$H0
1076 + vrev32.8 $H1,$H1
1077 + vrev32.8 $H2,$H2
1078 + vrev32.8 $H3,$H3
1079 +# endif
1080 +
1081 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1082 + @ lazy reduction interleaved with base 2^32 -> base 2^26 of
1083 + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
1084 +
1085 + vshr.u64 $T0,$D3,#26
1086 + vmovn.i64 $D3#lo,$D3
1087 + vshr.u64 $T1,$D0,#26
1088 + vmovn.i64 $D0#lo,$D0
1089 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1090 + vbic.i32 $D3#lo,#0xfc000000
1091 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
1092 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1093 + vshl.u32 $H3,$H3,#18
1094 + vbic.i32 $D0#lo,#0xfc000000
1095 +
1096 + vshrn.u64 $T0#lo,$D4,#26
1097 + vmovn.i64 $D4#lo,$D4
1098 + vshr.u64 $T1,$D1,#26
1099 + vmovn.i64 $D1#lo,$D1
1100 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1101 + vsri.u32 $H3,$H2,#14
1102 + vbic.i32 $D4#lo,#0xfc000000
1103 + vshl.u32 $H2,$H2,#12
1104 + vbic.i32 $D1#lo,#0xfc000000
1105 +
1106 + vadd.i32 $D0#lo,$D0#lo,$T0#lo
1107 + vshl.u32 $T0#lo,$T0#lo,#2
1108 + vbic.i32 $H3,#0xfc000000
1109 + vshrn.u64 $T1#lo,$D2,#26
1110 + vmovn.i64 $D2#lo,$D2
1111 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
1112 + vsri.u32 $H2,$H1,#20
1113 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
1114 + vshl.u32 $H1,$H1,#6
1115 + vbic.i32 $D2#lo,#0xfc000000
1116 + vbic.i32 $H2,#0xfc000000
1117 +
1118 + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
1119 + vmovn.i64 $D0#lo,$D0
1120 + vsri.u32 $H1,$H0,#26
1121 + vbic.i32 $H0,#0xfc000000
1122 + vshr.u32 $T1#lo,$D3#lo,#26
1123 + vbic.i32 $D3#lo,#0xfc000000
1124 + vbic.i32 $D0#lo,#0xfc000000
1125 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
1126 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
1127 + vbic.i32 $H1,#0xfc000000
1128 +
1129 + bhi .Loop_neon
1130 +
1131 +.Lskip_loop:
1132 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1133 + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1134 +
1135 + add $tbl1,$ctx,#(48+0*9*4)
1136 + add $tbl0,$ctx,#(48+1*9*4)
1137 + adds $len,$len,#32
1138 + it ne
1139 + movne $len,#0
1140 + bne .Long_tail
1141 +
1142 + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
1143 + vadd.i32 $H0#hi,$H0#lo,$D0#lo
1144 + vadd.i32 $H3#hi,$H3#lo,$D3#lo
1145 + vadd.i32 $H1#hi,$H1#lo,$D1#lo
1146 + vadd.i32 $H4#hi,$H4#lo,$D4#lo
1147 +
1148 +.Long_tail:
1149 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
1150 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
1151 +
1152 + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
1153 + vmull.u32 $D2,$H2#hi,$R0
1154 + vadd.i32 $H0#lo,$H0#lo,$D0#lo
1155 + vmull.u32 $D0,$H0#hi,$R0
1156 + vadd.i32 $H3#lo,$H3#lo,$D3#lo
1157 + vmull.u32 $D3,$H3#hi,$R0
1158 + vadd.i32 $H1#lo,$H1#lo,$D1#lo
1159 + vmull.u32 $D1,$H1#hi,$R0
1160 + vadd.i32 $H4#lo,$H4#lo,$D4#lo
1161 + vmull.u32 $D4,$H4#hi,$R0
1162 +
1163 + vmlal.u32 $D0,$H4#hi,$S1
1164 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1165 + vmlal.u32 $D3,$H2#hi,$R1
1166 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1167 + vmlal.u32 $D1,$H0#hi,$R1
1168 + vmlal.u32 $D4,$H3#hi,$R1
1169 + vmlal.u32 $D2,$H1#hi,$R1
1170 +
1171 + vmlal.u32 $D3,$H1#hi,$R2
1172 + vld1.32 ${S4}[1],[$tbl1,:32]
1173 + vmlal.u32 $D0,$H3#hi,$S2
1174 + vld1.32 ${S4}[0],[$tbl0,:32]
1175 + vmlal.u32 $D4,$H2#hi,$R2
1176 + vmlal.u32 $D1,$H4#hi,$S2
1177 + vmlal.u32 $D2,$H0#hi,$R2
1178 +
1179 + vmlal.u32 $D3,$H0#hi,$R3
1180 + it ne
1181 + addne $tbl1,$ctx,#(48+2*9*4)
1182 + vmlal.u32 $D0,$H2#hi,$S3
1183 + it ne
1184 + addne $tbl0,$ctx,#(48+3*9*4)
1185 + vmlal.u32 $D4,$H1#hi,$R3
1186 + vmlal.u32 $D1,$H3#hi,$S3
1187 + vmlal.u32 $D2,$H4#hi,$S3
1188 +
1189 + vmlal.u32 $D3,$H4#hi,$S4
1190 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
1191 + vmlal.u32 $D0,$H1#hi,$S4
1192 + vshr.u64 $MASK,$MASK,#38
1193 + vmlal.u32 $D4,$H0#hi,$R4
1194 + vmlal.u32 $D1,$H2#hi,$S4
1195 + vmlal.u32 $D2,$H3#hi,$S4
1196 +
1197 + beq .Lshort_tail
1198 +
1199 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1200 + @ (hash+inp[0:1])*r^4:r^3 and accumulate
1201 +
1202 + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
1203 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
1204 +
1205 + vmlal.u32 $D2,$H2#lo,$R0
1206 + vmlal.u32 $D0,$H0#lo,$R0
1207 + vmlal.u32 $D3,$H3#lo,$R0
1208 + vmlal.u32 $D1,$H1#lo,$R0
1209 + vmlal.u32 $D4,$H4#lo,$R0
1210 +
1211 + vmlal.u32 $D0,$H4#lo,$S1
1212 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
1213 + vmlal.u32 $D3,$H2#lo,$R1
1214 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
1215 + vmlal.u32 $D1,$H0#lo,$R1
1216 + vmlal.u32 $D4,$H3#lo,$R1
1217 + vmlal.u32 $D2,$H1#lo,$R1
1218 +
1219 + vmlal.u32 $D3,$H1#lo,$R2
1220 + vld1.32 ${S4}[1],[$tbl1,:32]
1221 + vmlal.u32 $D0,$H3#lo,$S2
1222 + vld1.32 ${S4}[0],[$tbl0,:32]
1223 + vmlal.u32 $D4,$H2#lo,$R2
1224 + vmlal.u32 $D1,$H4#lo,$S2
1225 + vmlal.u32 $D2,$H0#lo,$R2
1226 +
1227 + vmlal.u32 $D3,$H0#lo,$R3
1228 + vmlal.u32 $D0,$H2#lo,$S3
1229 + vmlal.u32 $D4,$H1#lo,$R3
1230 + vmlal.u32 $D1,$H3#lo,$S3
1231 + vmlal.u32 $D2,$H4#lo,$S3
1232 +
1233 + vmlal.u32 $D3,$H4#lo,$S4
1234 + vorn $MASK,$MASK,$MASK @ all-ones
1235 + vmlal.u32 $D0,$H1#lo,$S4
1236 + vshr.u64 $MASK,$MASK,#38
1237 + vmlal.u32 $D4,$H0#lo,$R4
1238 + vmlal.u32 $D1,$H2#lo,$S4
1239 + vmlal.u32 $D2,$H3#lo,$S4
1240 +
1241 +.Lshort_tail:
1242 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1243 + @ horizontal addition
1244 +
1245 + vadd.i64 $D3#lo,$D3#lo,$D3#hi
1246 + vadd.i64 $D0#lo,$D0#lo,$D0#hi
1247 + vadd.i64 $D4#lo,$D4#lo,$D4#hi
1248 + vadd.i64 $D1#lo,$D1#lo,$D1#hi
1249 + vadd.i64 $D2#lo,$D2#lo,$D2#hi
1250 +
1251 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1252 + @ lazy reduction, but without narrowing
1253 +
1254 + vshr.u64 $T0,$D3,#26
1255 + vand.i64 $D3,$D3,$MASK
1256 + vshr.u64 $T1,$D0,#26
1257 + vand.i64 $D0,$D0,$MASK
1258 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4
1259 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1
1260 +
1261 + vshr.u64 $T0,$D4,#26
1262 + vand.i64 $D4,$D4,$MASK
1263 + vshr.u64 $T1,$D1,#26
1264 + vand.i64 $D1,$D1,$MASK
1265 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2
1266 +
1267 + vadd.i64 $D0,$D0,$T0
1268 + vshl.u64 $T0,$T0,#2
1269 + vshr.u64 $T1,$D2,#26
1270 + vand.i64 $D2,$D2,$MASK
1271 + vadd.i64 $D0,$D0,$T0 @ h4 -> h0
1272 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3
1273 +
1274 + vshr.u64 $T0,$D0,#26
1275 + vand.i64 $D0,$D0,$MASK
1276 + vshr.u64 $T1,$D3,#26
1277 + vand.i64 $D3,$D3,$MASK
1278 + vadd.i64 $D1,$D1,$T0 @ h0 -> h1
1279 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4
1280 +
1281 + cmp $len,#0
1282 + bne .Leven
1283 +
1284 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1285 + @ store hash value
1286 +
1287 + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
1288 + vst1.32 {$D4#lo[0]},[$ctx]
1289 +
1290 + vldmia sp!,{d8-d15} @ epilogue
1291 + ldmia sp!,{r4-r7}
1292 + ret @ bx lr
1293 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
1294 +
1295 +.align 5
1296 +.Lzeros:
1297 +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1298 +#ifndef __KERNEL__
1299 +.LOPENSSL_armcap:
1300 +# ifdef _WIN32
1301 +.word OPENSSL_armcap_P
1302 +# else
1303 +.word OPENSSL_armcap_P-.Lpoly1305_init
1304 +# endif
1305 +.comm OPENSSL_armcap_P,4,4
1306 +.hidden OPENSSL_armcap_P
1307 +#endif
1308 +#endif
1309 +___
1310 +} }
1311 +$code.=<<___;
1312 +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
1313 +.align 2
1314 +___
1315 +
1316 +foreach (split("\n",$code)) {
1317 + s/\`([^\`]*)\`/eval $1/geo;
1318 +
1319 + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
1320 + s/\bret\b/bx lr/go or
1321 + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
1322 +
1323 + print $_,"\n";
1324 +}
1325 +close STDOUT; # enforce flush
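The long comment block in the NEON section above describes the multiply-and-lazy-reduce scheme: limb products that wrap past 2^130 are folded back with a factor of 5 (since 2^130 = 5 mod 2^130-5), and carries are propagated only far enough to keep each limb close to 26 bits. The following scalar C sketch applies the same d0..d4 formulas and the same carry order for a single 16-byte block; it is illustrative only and is not the glue code added by this patch:

#include <stdint.h>

#define MASK26 0x03ffffffU

/* h: accumulator, r: clamped key, m: message block -- all as five 26-bit
 * limbs; the 2^128 pad bit is assumed to be already added into m[4]. */
static void poly1305_block_base26(uint32_t h[5], const uint32_t r[5],
				  const uint32_t m[5])
{
	const uint32_t s1 = r[1] * 5, s2 = r[2] * 5,
		       s3 = r[3] * 5, s4 = r[4] * 5;
	uint64_t d0, d1, d2, d3, d4, c;

	/* accumulate the block */
	h[0] += m[0]; h[1] += m[1]; h[2] += m[2]; h[3] += m[3]; h[4] += m[4];

	/* d[i] = sum of h[j]*r[i-j mod 5], wrapped terms multiplied by 5 */
	d0 = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s4 + (uint64_t)h[2]*s3 +
	     (uint64_t)h[3]*s2 + (uint64_t)h[4]*s1;
	d1 = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + (uint64_t)h[2]*s4 +
	     (uint64_t)h[3]*s3 + (uint64_t)h[4]*s2;
	d2 = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] + (uint64_t)h[2]*r[0] +
	     (uint64_t)h[3]*s4 + (uint64_t)h[4]*s3;
	d3 = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] + (uint64_t)h[2]*r[1] +
	     (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s4;
	d4 = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] + (uint64_t)h[2]*r[2] +
	     (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];

	/* lazy reduction, same carry order as the NEON code */
	c = d3 >> 26; d3 &= MASK26; d4 += c;         /* h3 -> h4 */
	c = d0 >> 26; d0 &= MASK26; d1 += c;         /* h0 -> h1 */
	c = d4 >> 26; d4 &= MASK26; d0 += c * 5;     /* h4 -> h0 (x5 wrap) */
	c = d1 >> 26; d1 &= MASK26; d2 += c;         /* h1 -> h2 */
	c = d2 >> 26; d2 &= MASK26; d3 += c;         /* h2 -> h3 */
	c = d0 >> 26; d0 &= MASK26; d1 += c;         /* h0 -> h1 */
	c = d3 >> 26; d3 &= MASK26; d4 += c;         /* h3 -> h4 */

	h[0] = (uint32_t)d0; h[1] = (uint32_t)d1; h[2] = (uint32_t)d2;
	h[3] = (uint32_t)d3; h[4] = (uint32_t)d4;
}

Here h[] starts at zero and r[] comes from the clamped first half of the key; after the last block, poly1305_emit performs the final conditional subtraction of 2^130-5 and adds the nonce. The NEON routine carries out the same arithmetic in two vector lanes, consuming 64 bytes (four blocks) per loop iteration against the precomputed key powers r^1..r^4.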
1326 --- /dev/null
1327 +++ b/arch/arm/crypto/poly1305-core.S_shipped
1328 @@ -0,0 +1,1158 @@
1329 +#ifndef __KERNEL__
1330 +# include "arm_arch.h"
1331 +#else
1332 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
1333 +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
1334 +# define poly1305_init poly1305_init_arm
1335 +# define poly1305_blocks poly1305_blocks_arm
1336 +# define poly1305_emit poly1305_emit_arm
1337 +.globl poly1305_blocks_neon
1338 +#endif
1339 +
1340 +#if defined(__thumb2__)
1341 +.syntax unified
1342 +.thumb
1343 +#else
1344 +.code 32
1345 +#endif
1346 +
1347 +.text
1348 +
1349 +.globl poly1305_emit
1350 +.globl poly1305_blocks
1351 +.globl poly1305_init
1352 +.type poly1305_init,%function
1353 +.align 5
1354 +poly1305_init:
1355 +.Lpoly1305_init:
1356 + stmdb sp!,{r4-r11}
1357 +
1358 + eor r3,r3,r3
1359 + cmp r1,#0
1360 + str r3,[r0,#0] @ zero hash value
1361 + str r3,[r0,#4]
1362 + str r3,[r0,#8]
1363 + str r3,[r0,#12]
1364 + str r3,[r0,#16]
1365 + str r3,[r0,#36] @ clear is_base2_26
1366 + add r0,r0,#20
1367 +
1368 +#ifdef __thumb2__
1369 + it eq
1370 +#endif
1371 + moveq r0,#0
1372 + beq .Lno_key
1373 +
1374 +#if __ARM_MAX_ARCH__>=7
1375 + mov r3,#-1
1376 + str r3,[r0,#28] @ impossible key power value
1377 +# ifndef __KERNEL__
1378 + adr r11,.Lpoly1305_init
1379 + ldr r12,.LOPENSSL_armcap
1380 +# endif
1381 +#endif
1382 + ldrb r4,[r1,#0]
1383 + mov r10,#0x0fffffff
1384 + ldrb r5,[r1,#1]
1385 + and r3,r10,#-4 @ 0x0ffffffc
1386 + ldrb r6,[r1,#2]
1387 + ldrb r7,[r1,#3]
1388 + orr r4,r4,r5,lsl#8
1389 + ldrb r5,[r1,#4]
1390 + orr r4,r4,r6,lsl#16
1391 + ldrb r6,[r1,#5]
1392 + orr r4,r4,r7,lsl#24
1393 + ldrb r7,[r1,#6]
1394 + and r4,r4,r10
1395 +
1396 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1397 +# if !defined(_WIN32)
1398 + ldr r12,[r11,r12] @ OPENSSL_armcap_P
1399 +# endif
1400 +# if defined(__APPLE__) || defined(_WIN32)
1401 + ldr r12,[r12]
1402 +# endif
1403 +#endif
1404 + ldrb r8,[r1,#7]
1405 + orr r5,r5,r6,lsl#8
1406 + ldrb r6,[r1,#8]
1407 + orr r5,r5,r7,lsl#16
1408 + ldrb r7,[r1,#9]
1409 + orr r5,r5,r8,lsl#24
1410 + ldrb r8,[r1,#10]
1411 + and r5,r5,r3
1412 +
1413 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1414 + tst r12,#ARMV7_NEON @ check for NEON
1415 +# ifdef __thumb2__
1416 + adr r9,.Lpoly1305_blocks_neon
1417 + adr r11,.Lpoly1305_blocks
1418 + it ne
1419 + movne r11,r9
1420 + adr r12,.Lpoly1305_emit
1421 + orr r11,r11,#1 @ thumb-ify addresses
1422 + orr r12,r12,#1
1423 +# else
1424 + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
1425 + ite eq
1426 + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
1427 + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
1428 +# endif
1429 +#endif
1430 + ldrb r9,[r1,#11]
1431 + orr r6,r6,r7,lsl#8
1432 + ldrb r7,[r1,#12]
1433 + orr r6,r6,r8,lsl#16
1434 + ldrb r8,[r1,#13]
1435 + orr r6,r6,r9,lsl#24
1436 + ldrb r9,[r1,#14]
1437 + and r6,r6,r3
1438 +
1439 + ldrb r10,[r1,#15]
1440 + orr r7,r7,r8,lsl#8
1441 + str r4,[r0,#0]
1442 + orr r7,r7,r9,lsl#16
1443 + str r5,[r0,#4]
1444 + orr r7,r7,r10,lsl#24
1445 + str r6,[r0,#8]
1446 + and r7,r7,r3
1447 + str r7,[r0,#12]
1448 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
1449 + stmia r2,{r11,r12} @ fill functions table
1450 + mov r0,#1
1451 +#else
1452 + mov r0,#0
1453 +#endif
1454 +.Lno_key:
1455 + ldmia sp!,{r4-r11}
1456 +#if __ARM_ARCH__>=5
1457 + bx lr @ bx lr
1458 +#else
1459 + tst lr,#1
1460 + moveq pc,lr @ be binary compatible with V4, yet
1461 + .word 0xe12fff1e @ interoperable with Thumb ISA:-)
1462 +#endif
1463 +.size poly1305_init,.-poly1305_init
1464 +.type poly1305_blocks,%function
1465 +.align 5
1466 +poly1305_blocks:
1467 +.Lpoly1305_blocks:
1468 + stmdb sp!,{r3-r11,lr}
1469 +
1470 + ands r2,r2,#-16
1471 + beq .Lno_data
1472 +
1473 + add r2,r2,r1 @ end pointer
1474 + sub sp,sp,#32
1475 +
1476 +#if __ARM_ARCH__<7
1477 + ldmia r0,{r4-r12} @ load context
1478 + add r0,r0,#20
1479 + str r2,[sp,#16] @ offload stuff
1480 + str r0,[sp,#12]
1481 +#else
1482 + ldr lr,[r0,#36] @ is_base2_26
1483 + ldmia r0!,{r4-r8} @ load hash value
1484 + str r2,[sp,#16] @ offload stuff
1485 + str r0,[sp,#12]
1486 +
1487 + adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
1488 + mov r10,r5,lsr#6
1489 + adcs r10,r10,r6,lsl#20
1490 + mov r11,r6,lsr#12
1491 + adcs r11,r11,r7,lsl#14
1492 + mov r12,r7,lsr#18
1493 + adcs r12,r12,r8,lsl#8
1494 + mov r2,#0
1495 + teq lr,#0
1496 + str r2,[r0,#16] @ clear is_base2_26
1497 + adc r2,r2,r8,lsr#24
1498 +
1499 + itttt ne
1500 + movne r4,r9 @ choose between radixes
1501 + movne r5,r10
1502 + movne r6,r11
1503 + movne r7,r12
1504 + ldmia r0,{r9-r12} @ load key
1505 + it ne
1506 + movne r8,r2
1507 +#endif
1508 +
1509 + mov lr,r1
1510 + cmp r3,#0
1511 + str r10,[sp,#20]
1512 + str r11,[sp,#24]
1513 + str r12,[sp,#28]
1514 + b .Loop
1515 +
1516 +.align 4
1517 +.Loop:
1518 +#if __ARM_ARCH__<7
1519 + ldrb r0,[lr],#16 @ load input
1520 +# ifdef __thumb2__
1521 + it hi
1522 +# endif
1523 + addhi r8,r8,#1 @ 1<<128
1524 + ldrb r1,[lr,#-15]
1525 + ldrb r2,[lr,#-14]
1526 + ldrb r3,[lr,#-13]
1527 + orr r1,r0,r1,lsl#8
1528 + ldrb r0,[lr,#-12]
1529 + orr r2,r1,r2,lsl#16
1530 + ldrb r1,[lr,#-11]
1531 + orr r3,r2,r3,lsl#24
1532 + ldrb r2,[lr,#-10]
1533 + adds r4,r4,r3 @ accumulate input
1534 +
1535 + ldrb r3,[lr,#-9]
1536 + orr r1,r0,r1,lsl#8
1537 + ldrb r0,[lr,#-8]
1538 + orr r2,r1,r2,lsl#16
1539 + ldrb r1,[lr,#-7]
1540 + orr r3,r2,r3,lsl#24
1541 + ldrb r2,[lr,#-6]
1542 + adcs r5,r5,r3
1543 +
1544 + ldrb r3,[lr,#-5]
1545 + orr r1,r0,r1,lsl#8
1546 + ldrb r0,[lr,#-4]
1547 + orr r2,r1,r2,lsl#16
1548 + ldrb r1,[lr,#-3]
1549 + orr r3,r2,r3,lsl#24
1550 + ldrb r2,[lr,#-2]
1551 + adcs r6,r6,r3
1552 +
1553 + ldrb r3,[lr,#-1]
1554 + orr r1,r0,r1,lsl#8
1555 + str lr,[sp,#8] @ offload input pointer
1556 + orr r2,r1,r2,lsl#16
1557 + add r10,r10,r10,lsr#2
1558 + orr r3,r2,r3,lsl#24
1559 +#else
1560 + ldr r0,[lr],#16 @ load input
1561 + it hi
1562 + addhi r8,r8,#1 @ padbit
1563 + ldr r1,[lr,#-12]
1564 + ldr r2,[lr,#-8]
1565 + ldr r3,[lr,#-4]
1566 +# ifdef __ARMEB__
1567 + rev r0,r0
1568 + rev r1,r1
1569 + rev r2,r2
1570 + rev r3,r3
1571 +# endif
1572 + adds r4,r4,r0 @ accumulate input
1573 + str lr,[sp,#8] @ offload input pointer
1574 + adcs r5,r5,r1
1575 + add r10,r10,r10,lsr#2
1576 + adcs r6,r6,r2
1577 +#endif
1578 + add r11,r11,r11,lsr#2
1579 + adcs r7,r7,r3
1580 + add r12,r12,r12,lsr#2
1581 +
1582 + umull r2,r3,r5,r9
1583 + adc r8,r8,#0
1584 + umull r0,r1,r4,r9
1585 + umlal r2,r3,r8,r10
1586 + umlal r0,r1,r7,r10
1587 + ldr r10,[sp,#20] @ reload r10
1588 + umlal r2,r3,r6,r12
1589 + umlal r0,r1,r5,r12
1590 + umlal r2,r3,r7,r11
1591 + umlal r0,r1,r6,r11
1592 + umlal r2,r3,r4,r10
1593 + str r0,[sp,#0] @ future r4
1594 + mul r0,r11,r8
1595 + ldr r11,[sp,#24] @ reload r11
1596 + adds r2,r2,r1 @ d1+=d0>>32
1597 + eor r1,r1,r1
1598 + adc lr,r3,#0 @ future r6
1599 + str r2,[sp,#4] @ future r5
1600 +
1601 + mul r2,r12,r8
1602 + eor r3,r3,r3
1603 + umlal r0,r1,r7,r12
1604 + ldr r12,[sp,#28] @ reload r12
1605 + umlal r2,r3,r7,r9
1606 + umlal r0,r1,r6,r9
1607 + umlal r2,r3,r6,r10
1608 + umlal r0,r1,r5,r10
1609 + umlal r2,r3,r5,r11
1610 + umlal r0,r1,r4,r11
1611 + umlal r2,r3,r4,r12
1612 + ldr r4,[sp,#0]
1613 + mul r8,r9,r8
1614 + ldr r5,[sp,#4]
1615 +
1616 + adds r6,lr,r0 @ d2+=d1>>32
1617 + ldr lr,[sp,#8] @ reload input pointer
1618 + adc r1,r1,#0
1619 + adds r7,r2,r1 @ d3+=d2>>32
1620 + ldr r0,[sp,#16] @ reload end pointer
1621 + adc r3,r3,#0
1622 + add r8,r8,r3 @ h4+=d3>>32
1623 +
1624 + and r1,r8,#-4
1625 + and r8,r8,#3
1626 + add r1,r1,r1,lsr#2 @ *=5
1627 + adds r4,r4,r1
1628 + adcs r5,r5,#0
1629 + adcs r6,r6,#0
1630 + adcs r7,r7,#0
1631 + adc r8,r8,#0
1632 +
1633 + cmp r0,lr @ done yet?
1634 + bhi .Loop
1635 +
1636 + ldr r0,[sp,#12]
1637 + add sp,sp,#32
1638 + stmdb r0,{r4-r8} @ store the result
1639 +
1640 +.Lno_data:
1641 +#if __ARM_ARCH__>=5
1642 + ldmia sp!,{r3-r11,pc}
1643 +#else
1644 + ldmia sp!,{r3-r11,lr}
1645 + tst lr,#1
1646 + moveq pc,lr @ be binary compatible with V4, yet
1647 + .word 0xe12fff1e @ interoperable with Thumb ISA:-)
1648 +#endif
1649 +.size poly1305_blocks,.-poly1305_blocks
1650 +.type poly1305_emit,%function
1651 +.align 5
1652 +poly1305_emit:
1653 +.Lpoly1305_emit:
1654 + stmdb sp!,{r4-r11}
1655 +
1656 + ldmia r0,{r3-r7}
1657 +
1658 +#if __ARM_ARCH__>=7
1659 + ldr ip,[r0,#36] @ is_base2_26
1660 +
1661 + adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
1662 + mov r9,r4,lsr#6
1663 + adcs r9,r9,r5,lsl#20
1664 + mov r10,r5,lsr#12
1665 + adcs r10,r10,r6,lsl#14
1666 + mov r11,r6,lsr#18
1667 + adcs r11,r11,r7,lsl#8
1668 + mov r0,#0
1669 + adc r0,r0,r7,lsr#24
1670 +
1671 + tst ip,ip
1672 + itttt ne
1673 + movne r3,r8
1674 + movne r4,r9
1675 + movne r5,r10
1676 + movne r6,r11
1677 + it ne
1678 + movne r7,r0
1679 +#endif
1680 +
1681 + adds r8,r3,#5 @ compare to modulus
1682 + adcs r9,r4,#0
1683 + adcs r10,r5,#0
1684 + adcs r11,r6,#0
1685 + adc r0,r7,#0
1686 + tst r0,#4 @ did it carry/borrow?
1687 +
1688 +#ifdef __thumb2__
1689 + it ne
1690 +#endif
1691 + movne r3,r8
1692 + ldr r8,[r2,#0]
1693 +#ifdef __thumb2__
1694 + it ne
1695 +#endif
1696 + movne r4,r9
1697 + ldr r9,[r2,#4]
1698 +#ifdef __thumb2__
1699 + it ne
1700 +#endif
1701 + movne r5,r10
1702 + ldr r10,[r2,#8]
1703 +#ifdef __thumb2__
1704 + it ne
1705 +#endif
1706 + movne r6,r11
1707 + ldr r11,[r2,#12]
1708 +
1709 + adds r3,r3,r8
1710 + adcs r4,r4,r9
1711 + adcs r5,r5,r10
1712 + adc r6,r6,r11
1713 +
1714 +#if __ARM_ARCH__>=7
1715 +# ifdef __ARMEB__
1716 + rev r3,r3
1717 + rev r4,r4
1718 + rev r5,r5
1719 + rev r6,r6
1720 +# endif
1721 + str r3,[r1,#0]
1722 + str r4,[r1,#4]
1723 + str r5,[r1,#8]
1724 + str r6,[r1,#12]
1725 +#else
1726 + strb r3,[r1,#0]
1727 + mov r3,r3,lsr#8
1728 + strb r4,[r1,#4]
1729 + mov r4,r4,lsr#8
1730 + strb r5,[r1,#8]
1731 + mov r5,r5,lsr#8
1732 + strb r6,[r1,#12]
1733 + mov r6,r6,lsr#8
1734 +
1735 + strb r3,[r1,#1]
1736 + mov r3,r3,lsr#8
1737 + strb r4,[r1,#5]
1738 + mov r4,r4,lsr#8
1739 + strb r5,[r1,#9]
1740 + mov r5,r5,lsr#8
1741 + strb r6,[r1,#13]
1742 + mov r6,r6,lsr#8
1743 +
1744 + strb r3,[r1,#2]
1745 + mov r3,r3,lsr#8
1746 + strb r4,[r1,#6]
1747 + mov r4,r4,lsr#8
1748 + strb r5,[r1,#10]
1749 + mov r5,r5,lsr#8
1750 + strb r6,[r1,#14]
1751 + mov r6,r6,lsr#8
1752 +
1753 + strb r3,[r1,#3]
1754 + strb r4,[r1,#7]
1755 + strb r5,[r1,#11]
1756 + strb r6,[r1,#15]
1757 +#endif
1758 + ldmia sp!,{r4-r11}
1759 +#if __ARM_ARCH__>=5
1760 + bx lr @ bx lr
1761 +#else
1762 + tst lr,#1
1763 + moveq pc,lr @ be binary compatible with V4, yet
1764 + .word 0xe12fff1e @ interoperable with Thumb ISA:-)
1765 +#endif
1766 +.size poly1305_emit,.-poly1305_emit
1767 +#if __ARM_MAX_ARCH__>=7
1768 +.fpu neon
1769 +
1770 +.type poly1305_init_neon,%function
1771 +.align 5
1772 +poly1305_init_neon:
1773 +.Lpoly1305_init_neon:
1774 + ldr r3,[r0,#48] @ first table element
1775 + cmp r3,#-1 @ is value impossible?
1776 + bne .Lno_init_neon
1777 +
1778 + ldr r4,[r0,#20] @ load key base 2^32
1779 + ldr r5,[r0,#24]
1780 + ldr r6,[r0,#28]
1781 + ldr r7,[r0,#32]
1782 +
1783 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
1784 + mov r3,r4,lsr#26
1785 + mov r4,r5,lsr#20
1786 + orr r3,r3,r5,lsl#6
1787 + mov r5,r6,lsr#14
1788 + orr r4,r4,r6,lsl#12
1789 + mov r6,r7,lsr#8
1790 + orr r5,r5,r7,lsl#18
1791 + and r3,r3,#0x03ffffff
1792 + and r4,r4,#0x03ffffff
1793 + and r5,r5,#0x03ffffff
1794 +
1795 + vdup.32 d0,r2 @ r^1 in both lanes
1796 + add r2,r3,r3,lsl#2 @ *5
1797 + vdup.32 d1,r3
1798 + add r3,r4,r4,lsl#2
1799 + vdup.32 d2,r2
1800 + vdup.32 d3,r4
1801 + add r4,r5,r5,lsl#2
1802 + vdup.32 d4,r3
1803 + vdup.32 d5,r5
1804 + add r5,r6,r6,lsl#2
1805 + vdup.32 d6,r4
1806 + vdup.32 d7,r6
1807 + vdup.32 d8,r5
1808 +
1809 + mov r5,#2 @ counter
1810 +
1811 +.Lsquare_neon:
1812 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1813 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1814 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1815 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1816 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1817 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1818 +
1819 + vmull.u32 q5,d0,d0[1]
1820 + vmull.u32 q6,d1,d0[1]
1821 + vmull.u32 q7,d3,d0[1]
1822 + vmull.u32 q8,d5,d0[1]
1823 + vmull.u32 q9,d7,d0[1]
1824 +
1825 + vmlal.u32 q5,d7,d2[1]
1826 + vmlal.u32 q6,d0,d1[1]
1827 + vmlal.u32 q7,d1,d1[1]
1828 + vmlal.u32 q8,d3,d1[1]
1829 + vmlal.u32 q9,d5,d1[1]
1830 +
1831 + vmlal.u32 q5,d5,d4[1]
1832 + vmlal.u32 q6,d7,d4[1]
1833 + vmlal.u32 q8,d1,d3[1]
1834 + vmlal.u32 q7,d0,d3[1]
1835 + vmlal.u32 q9,d3,d3[1]
1836 +
1837 + vmlal.u32 q5,d3,d6[1]
1838 + vmlal.u32 q8,d0,d5[1]
1839 + vmlal.u32 q6,d5,d6[1]
1840 + vmlal.u32 q7,d7,d6[1]
1841 + vmlal.u32 q9,d1,d5[1]
1842 +
1843 + vmlal.u32 q8,d7,d8[1]
1844 + vmlal.u32 q5,d1,d8[1]
1845 + vmlal.u32 q6,d3,d8[1]
1846 + vmlal.u32 q7,d5,d8[1]
1847 + vmlal.u32 q9,d0,d7[1]
1848 +
1849 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1850 + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1851 + @ and P. Schwabe
1852 + @
1853 + @ H0>>+H1>>+H2>>+H3>>+H4
1854 + @ H3>>+H4>>*5+H0>>+H1
1855 + @
1856 + @ Trivia.
1857 + @
1858 + @ Result of multiplication of n-bit number by m-bit number is
1859 + @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
1860 + @ m-bit number multiplied by 2^n is still n+m bits wide.
1861 + @
1862 + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
1863 + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
1864 + @ one is n+1 bits wide.
1865 + @
1866 + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
1867 + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
1868 + @ can be 27. However! In cases when their width exceeds 26 bits
1869 + @ they are limited by 2^26+2^6. This in turn means that *sum*
1870 + @ of the products with these values can still be viewed as sum
1871 + @ of 52-bit numbers as long as the amount of addends is not a
1872 + @ power of 2. For example,
1873 + @
1874 + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
1875 + @
1876 + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
1877 + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
1878 + @ 8 * (2^52) or 2^55. However, the value is then multiplied by
1879 + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
1880 + @ which is less than 32 * (2^52) or 2^57. And when processing
1881 + @ data we are looking at triple as many addends...
1882 + @
1883 + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
1884 + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
1885 + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
1886 + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
1887 + @ instruction accepts 2x32-bit input and writes 2x64-bit result.
1888 + @ This means that result of reduction have to be compressed upon
1889 + @ loop wrap-around. This can be done in the process of reduction
1890 + @ to minimize amount of instructions [as well as amount of
1891 + @ 128-bit instructions, which benefits low-end processors], but
1892 + @ one has to watch for H2 (which is narrower than H0) and 5*H4
1893 + @ not being wider than 58 bits, so that result of right shift
1894 + @ by 26 bits fits in 32 bits. This is also useful on x86,
1895 + @ because it allows to use paddd in place for paddq, which
1896 + @ benefits Atom, where paddq is ridiculously slow.
1897 +
1898 + vshr.u64 q15,q8,#26
1899 + vmovn.i64 d16,q8
1900 + vshr.u64 q4,q5,#26
1901 + vmovn.i64 d10,q5
1902 + vadd.i64 q9,q9,q15 @ h3 -> h4
1903 + vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
1904 + vadd.i64 q6,q6,q4 @ h0 -> h1
1905 + vbic.i32 d10,#0xfc000000
1906 +
1907 + vshrn.u64 d30,q9,#26
1908 + vmovn.i64 d18,q9
1909 + vshr.u64 q4,q6,#26
1910 + vmovn.i64 d12,q6
1911 + vadd.i64 q7,q7,q4 @ h1 -> h2
1912 + vbic.i32 d18,#0xfc000000
1913 + vbic.i32 d12,#0xfc000000
1914 +
1915 + vadd.i32 d10,d10,d30
1916 + vshl.u32 d30,d30,#2
1917 + vshrn.u64 d8,q7,#26
1918 + vmovn.i64 d14,q7
1919 + vadd.i32 d10,d10,d30 @ h4 -> h0
1920 + vadd.i32 d16,d16,d8 @ h2 -> h3
1921 + vbic.i32 d14,#0xfc000000
1922 +
1923 + vshr.u32 d30,d10,#26
1924 + vbic.i32 d10,#0xfc000000
1925 + vshr.u32 d8,d16,#26
1926 + vbic.i32 d16,#0xfc000000
1927 + vadd.i32 d12,d12,d30 @ h0 -> h1
1928 + vadd.i32 d18,d18,d8 @ h3 -> h4
1929 +
1930 + subs r5,r5,#1
1931 + beq .Lsquare_break_neon
1932 +
1933 + add r6,r0,#(48+0*9*4)
1934 + add r7,r0,#(48+1*9*4)
1935 +
1936 + vtrn.32 d0,d10 @ r^2:r^1
1937 + vtrn.32 d3,d14
1938 + vtrn.32 d5,d16
1939 + vtrn.32 d1,d12
1940 + vtrn.32 d7,d18
1941 +
1942 + vshl.u32 d4,d3,#2 @ *5
1943 + vshl.u32 d6,d5,#2
1944 + vshl.u32 d2,d1,#2
1945 + vshl.u32 d8,d7,#2
1946 + vadd.i32 d4,d4,d3
1947 + vadd.i32 d2,d2,d1
1948 + vadd.i32 d6,d6,d5
1949 + vadd.i32 d8,d8,d7
1950 +
1951 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
1952 + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
1953 + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1954 + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1955 + vst1.32 {d8[0]},[r6,:32]
1956 + vst1.32 {d8[1]},[r7,:32]
1957 +
1958 + b .Lsquare_neon
1959 +
1960 +.align 4
1961 +.Lsquare_break_neon:
1962 + add r6,r0,#(48+2*4*9)
1963 + add r7,r0,#(48+3*4*9)
1964 +
1965 + vmov d0,d10 @ r^4:r^3
1966 + vshl.u32 d2,d12,#2 @ *5
1967 + vmov d1,d12
1968 + vshl.u32 d4,d14,#2
1969 + vmov d3,d14
1970 + vshl.u32 d6,d16,#2
1971 + vmov d5,d16
1972 + vshl.u32 d8,d18,#2
1973 + vmov d7,d18
1974 + vadd.i32 d2,d2,d12
1975 + vadd.i32 d4,d4,d14
1976 + vadd.i32 d6,d6,d16
1977 + vadd.i32 d8,d8,d18
1978 +
1979 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
1980 + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
1981 + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1982 + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1983 + vst1.32 {d8[0]},[r6]
1984 + vst1.32 {d8[1]},[r7]
1985 +
1986 +.Lno_init_neon:
1987 + bx lr @ bx lr
1988 +.size poly1305_init_neon,.-poly1305_init_neon
1989 +
1990 +.type poly1305_blocks_neon,%function
1991 +.align 5
1992 +poly1305_blocks_neon:
1993 +.Lpoly1305_blocks_neon:
1994 + ldr ip,[r0,#36] @ is_base2_26
1995 +
1996 + cmp r2,#64
1997 + blo .Lpoly1305_blocks
1998 +
1999 + stmdb sp!,{r4-r7}
2000 + vstmdb sp!,{d8-d15} @ ABI specification says so
2001 +
2002 + tst ip,ip @ is_base2_26?
2003 + bne .Lbase2_26_neon
2004 +
2005 + stmdb sp!,{r1-r3,lr}
2006 + bl .Lpoly1305_init_neon
2007 +
2008 + ldr r4,[r0,#0] @ load hash value base 2^32
2009 + ldr r5,[r0,#4]
2010 + ldr r6,[r0,#8]
2011 + ldr r7,[r0,#12]
2012 + ldr ip,[r0,#16]
2013 +
2014 + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
2015 + mov r3,r4,lsr#26
2016 + veor d10,d10,d10
2017 + mov r4,r5,lsr#20
2018 + orr r3,r3,r5,lsl#6
2019 + veor d12,d12,d12
2020 + mov r5,r6,lsr#14
2021 + orr r4,r4,r6,lsl#12
2022 + veor d14,d14,d14
2023 + mov r6,r7,lsr#8
2024 + orr r5,r5,r7,lsl#18
2025 + veor d16,d16,d16
2026 + and r3,r3,#0x03ffffff
2027 + orr r6,r6,ip,lsl#24
2028 + veor d18,d18,d18
2029 + and r4,r4,#0x03ffffff
2030 + mov r1,#1
2031 + and r5,r5,#0x03ffffff
2032 + str r1,[r0,#36] @ set is_base2_26
2033 +
2034 + vmov.32 d10[0],r2
2035 + vmov.32 d12[0],r3
2036 + vmov.32 d14[0],r4
2037 + vmov.32 d16[0],r5
2038 + vmov.32 d18[0],r6
2039 + adr r5,.Lzeros
2040 +
2041 + ldmia sp!,{r1-r3,lr}
2042 + b .Lhash_loaded
2043 +
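The base 2^32 -> base 2^26 conversion done above in r2-r6 can be sketched in C as follows, assuming in[] is the accumulator as stored in the context (four 32-bit words plus the small overflow word loaded into ip); the names are illustrative only:

#include <stdint.h>

/* Split the accumulator held as five 32-bit words into five 26-bit
 * limbs; the shift/or pairs match the lsr/orr sequence above.  The top
 * limb is left unmasked, just like r6 in the assembly. */
static void poly1305_base2_32_to_2_26(uint32_t out[5], const uint32_t in[5])
{
	out[0] =   in[0]                         & 0x03ffffff;
	out[1] = ((in[0] >> 26) | (in[1] <<  6)) & 0x03ffffff;
	out[2] = ((in[1] >> 20) | (in[2] << 12)) & 0x03ffffff;
	out[3] = ((in[2] >> 14) | (in[3] << 18)) & 0x03ffffff;
	out[4] =  (in[3] >>  8) | (in[4] << 24);
}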
2044 +.align 4
2045 +.Lbase2_26_neon:
2046 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2047 + @ load hash value
2048 +
2049 + veor d10,d10,d10
2050 + veor d12,d12,d12
2051 + veor d14,d14,d14
2052 + veor d16,d16,d16
2053 + veor d18,d18,d18
2054 + vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
2055 + adr r5,.Lzeros
2056 + vld1.32 {d18[0]},[r0]
2057 + sub r0,r0,#16 @ rewind
2058 +
2059 +.Lhash_loaded:
2060 + add r4,r1,#32
2061 + mov r3,r3,lsl#24
2062 + tst r2,#31
2063 + beq .Leven
2064 +
2065 + vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
2066 + vmov.32 d28[0],r3
2067 + sub r2,r2,#16
2068 + add r4,r1,#32
2069 +
2070 +# ifdef __ARMEB__
2071 + vrev32.8 q10,q10
2072 + vrev32.8 q13,q13
2073 + vrev32.8 q11,q11
2074 + vrev32.8 q12,q12
2075 +# endif
2076 + vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
2077 + vshl.u32 d26,d26,#18
2078 +
2079 + vsri.u32 d26,d24,#14
2080 + vshl.u32 d24,d24,#12
2081 + vadd.i32 d29,d28,d18 @ add hash value and move to #hi
2082 +
2083 + vbic.i32 d26,#0xfc000000
2084 + vsri.u32 d24,d22,#20
2085 + vshl.u32 d22,d22,#6
2086 +
2087 + vbic.i32 d24,#0xfc000000
2088 + vsri.u32 d22,d20,#26
2089 + vadd.i32 d27,d26,d16
2090 +
2091 + vbic.i32 d20,#0xfc000000
2092 + vbic.i32 d22,#0xfc000000
2093 + vadd.i32 d25,d24,d14
2094 +
2095 + vadd.i32 d21,d20,d10
2096 + vadd.i32 d23,d22,d12
2097 +
2098 + mov r7,r5
2099 + add r6,r0,#48
2100 +
2101 + cmp r2,r2
2102 + b .Long_tail
2103 +
2104 +.align 4
2105 +.Leven:
2106 + subs r2,r2,#64
2107 + it lo
2108 + movlo r4,r5
2109 +
2110 + vmov.i32 q14,#1<<24 @ padbit, yes, always
2111 + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
2112 + add r1,r1,#64
2113 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
2114 + add r4,r4,#64
2115 + itt hi
2116 + addhi r7,r0,#(48+1*9*4)
2117 + addhi r6,r0,#(48+3*9*4)
2118 +
2119 +# ifdef __ARMEB__
2120 + vrev32.8 q10,q10
2121 + vrev32.8 q13,q13
2122 + vrev32.8 q11,q11
2123 + vrev32.8 q12,q12
2124 +# endif
2125 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
2126 + vshl.u32 q13,q13,#18
2127 +
2128 + vsri.u32 q13,q12,#14
2129 + vshl.u32 q12,q12,#12
2130 +
2131 + vbic.i32 q13,#0xfc000000
2132 + vsri.u32 q12,q11,#20
2133 + vshl.u32 q11,q11,#6
2134 +
2135 + vbic.i32 q12,#0xfc000000
2136 + vsri.u32 q11,q10,#26
2137 +
2138 + vbic.i32 q10,#0xfc000000
2139 + vbic.i32 q11,#0xfc000000
2140 +
2141 + bls .Lskip_loop
2142 +
2143 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
2144 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
2145 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
2146 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
2147 + b .Loop_neon
2148 +
2149 +.align 5
2150 +.Loop_neon:
2151 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2152 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
2153 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
2154 + @ ___________________/
2155 + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
2156 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
2157 + @ ___________________/ ____________________/
2158 + @
2159 + @ Note that we start with inp[2:3]*r^2. This is because it
2160 + @ doesn't depend on the reduction in the previous iteration.
2161 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2162 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2163 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2164 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2165 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2166 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2167 +
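Spelled out as a scalar C sketch, the d0..d4 formulas above look like this, where s[i] stands for the precomputed 5*r[i] that the key setup keeps next to r[i]; all names here are illustrative:

#include <stdint.h>

/* One multiply in base 2^26: d[i] collects every h[j]*r[k] with
 * j+k == i, plus the wrapped terms with j+k == i+5 folded back in
 * via s[k] = 5*r[k]. */
static void poly1305_mul(uint64_t d[5], const uint32_t h[5],
			 const uint32_t r[5], const uint32_t s[5])
{
	d[0] = (uint64_t)h[0]*r[0] + (uint64_t)h[4]*s[1] + (uint64_t)h[3]*s[2] +
	       (uint64_t)h[2]*s[3] + (uint64_t)h[1]*s[4];
	d[1] = (uint64_t)h[1]*r[0] + (uint64_t)h[0]*r[1] + (uint64_t)h[4]*s[2] +
	       (uint64_t)h[3]*s[3] + (uint64_t)h[2]*s[4];
	d[2] = (uint64_t)h[2]*r[0] + (uint64_t)h[1]*r[1] + (uint64_t)h[0]*r[2] +
	       (uint64_t)h[4]*s[3] + (uint64_t)h[3]*s[4];
	d[3] = (uint64_t)h[3]*r[0] + (uint64_t)h[2]*r[1] + (uint64_t)h[1]*r[2] +
	       (uint64_t)h[0]*r[3] + (uint64_t)h[4]*s[4];
	d[4] = (uint64_t)h[4]*r[0] + (uint64_t)h[3]*r[1] + (uint64_t)h[2]*r[2] +
	       (uint64_t)h[1]*r[3] + (uint64_t)h[0]*r[4];
}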
2168 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2169 + @ inp[2:3]*r^2
2170 +
2171 + vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
2172 + vmull.u32 q7,d25,d0[1]
2173 + vadd.i32 d20,d20,d10
2174 + vmull.u32 q5,d21,d0[1]
2175 + vadd.i32 d26,d26,d16
2176 + vmull.u32 q8,d27,d0[1]
2177 + vmlal.u32 q7,d23,d1[1]
2178 + vadd.i32 d22,d22,d12
2179 + vmull.u32 q6,d23,d0[1]
2180 +
2181 + vadd.i32 d28,d28,d18
2182 + vmull.u32 q9,d29,d0[1]
2183 + subs r2,r2,#64
2184 + vmlal.u32 q5,d29,d2[1]
2185 + it lo
2186 + movlo r4,r5
2187 + vmlal.u32 q8,d25,d1[1]
2188 + vld1.32 d8[1],[r7,:32]
2189 + vmlal.u32 q6,d21,d1[1]
2190 + vmlal.u32 q9,d27,d1[1]
2191 +
2192 + vmlal.u32 q5,d27,d4[1]
2193 + vmlal.u32 q8,d23,d3[1]
2194 + vmlal.u32 q9,d25,d3[1]
2195 + vmlal.u32 q6,d29,d4[1]
2196 + vmlal.u32 q7,d21,d3[1]
2197 +
2198 + vmlal.u32 q8,d21,d5[1]
2199 + vmlal.u32 q5,d25,d6[1]
2200 + vmlal.u32 q9,d23,d5[1]
2201 + vmlal.u32 q6,d27,d6[1]
2202 + vmlal.u32 q7,d29,d6[1]
2203 +
2204 + vmlal.u32 q8,d29,d8[1]
2205 + vmlal.u32 q5,d23,d8[1]
2206 + vmlal.u32 q9,d21,d7[1]
2207 + vmlal.u32 q6,d25,d8[1]
2208 + vmlal.u32 q7,d27,d8[1]
2209 +
2210 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
2211 + add r4,r4,#64
2212 +
2213 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2214 + @ (hash+inp[0:1])*r^4 and accumulate
2215 +
2216 + vmlal.u32 q8,d26,d0[0]
2217 + vmlal.u32 q5,d20,d0[0]
2218 + vmlal.u32 q9,d28,d0[0]
2219 + vmlal.u32 q6,d22,d0[0]
2220 + vmlal.u32 q7,d24,d0[0]
2221 + vld1.32 d8[0],[r6,:32]
2222 +
2223 + vmlal.u32 q8,d24,d1[0]
2224 + vmlal.u32 q5,d28,d2[0]
2225 + vmlal.u32 q9,d26,d1[0]
2226 + vmlal.u32 q6,d20,d1[0]
2227 + vmlal.u32 q7,d22,d1[0]
2228 +
2229 + vmlal.u32 q8,d22,d3[0]
2230 + vmlal.u32 q5,d26,d4[0]
2231 + vmlal.u32 q9,d24,d3[0]
2232 + vmlal.u32 q6,d28,d4[0]
2233 + vmlal.u32 q7,d20,d3[0]
2234 +
2235 + vmlal.u32 q8,d20,d5[0]
2236 + vmlal.u32 q5,d24,d6[0]
2237 + vmlal.u32 q9,d22,d5[0]
2238 + vmlal.u32 q6,d26,d6[0]
2239 + vmlal.u32 q8,d28,d8[0]
2240 +
2241 + vmlal.u32 q7,d28,d6[0]
2242 + vmlal.u32 q5,d22,d8[0]
2243 + vmlal.u32 q9,d20,d7[0]
2244 + vmov.i32 q14,#1<<24 @ padbit, yes, always
2245 + vmlal.u32 q6,d24,d8[0]
2246 + vmlal.u32 q7,d26,d8[0]
2247 +
2248 + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
2249 + add r1,r1,#64
2250 +# ifdef __ARMEB__
2251 + vrev32.8 q10,q10
2252 + vrev32.8 q11,q11
2253 + vrev32.8 q12,q12
2254 + vrev32.8 q13,q13
2255 +# endif
2256 +
2257 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2258 + @ lazy reduction interleaved with base 2^32 -> base 2^26 of
2259 + @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
2260 +
2261 + vshr.u64 q15,q8,#26
2262 + vmovn.i64 d16,q8
2263 + vshr.u64 q4,q5,#26
2264 + vmovn.i64 d10,q5
2265 + vadd.i64 q9,q9,q15 @ h3 -> h4
2266 + vbic.i32 d16,#0xfc000000
2267 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
2268 + vadd.i64 q6,q6,q4 @ h0 -> h1
2269 + vshl.u32 q13,q13,#18
2270 + vbic.i32 d10,#0xfc000000
2271 +
2272 + vshrn.u64 d30,q9,#26
2273 + vmovn.i64 d18,q9
2274 + vshr.u64 q4,q6,#26
2275 + vmovn.i64 d12,q6
2276 + vadd.i64 q7,q7,q4 @ h1 -> h2
2277 + vsri.u32 q13,q12,#14
2278 + vbic.i32 d18,#0xfc000000
2279 + vshl.u32 q12,q12,#12
2280 + vbic.i32 d12,#0xfc000000
2281 +
2282 + vadd.i32 d10,d10,d30
2283 + vshl.u32 d30,d30,#2
2284 + vbic.i32 q13,#0xfc000000
2285 + vshrn.u64 d8,q7,#26
2286 + vmovn.i64 d14,q7
2287 + vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
2288 + vsri.u32 q12,q11,#20
2289 + vadd.i32 d16,d16,d8 @ h2 -> h3
2290 + vshl.u32 q11,q11,#6
2291 + vbic.i32 d14,#0xfc000000
2292 + vbic.i32 q12,#0xfc000000
2293 +
2294 + vshrn.u64 d30,q5,#26 @ re-narrow
2295 + vmovn.i64 d10,q5
2296 + vsri.u32 q11,q10,#26
2297 + vbic.i32 q10,#0xfc000000
2298 + vshr.u32 d8,d16,#26
2299 + vbic.i32 d16,#0xfc000000
2300 + vbic.i32 d10,#0xfc000000
2301 + vadd.i32 d12,d12,d30 @ h0 -> h1
2302 + vadd.i32 d18,d18,d8 @ h3 -> h4
2303 + vbic.i32 q11,#0xfc000000
2304 +
2305 + bhi .Loop_neon
2306 +
2307 +.Lskip_loop:
2308 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2309 + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
2310 +
2311 + add r7,r0,#(48+0*9*4)
2312 + add r6,r0,#(48+1*9*4)
2313 + adds r2,r2,#32
2314 + it ne
2315 + movne r2,#0
2316 + bne .Long_tail
2317 +
2318 + vadd.i32 d25,d24,d14 @ add hash value and move to #hi
2319 + vadd.i32 d21,d20,d10
2320 + vadd.i32 d27,d26,d16
2321 + vadd.i32 d23,d22,d12
2322 + vadd.i32 d29,d28,d18
2323 +
2324 +.Long_tail:
2325 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
2326 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
2327 +
2328 + vadd.i32 d24,d24,d14 @ can be redundant
2329 + vmull.u32 q7,d25,d0
2330 + vadd.i32 d20,d20,d10
2331 + vmull.u32 q5,d21,d0
2332 + vadd.i32 d26,d26,d16
2333 + vmull.u32 q8,d27,d0
2334 + vadd.i32 d22,d22,d12
2335 + vmull.u32 q6,d23,d0
2336 + vadd.i32 d28,d28,d18
2337 + vmull.u32 q9,d29,d0
2338 +
2339 + vmlal.u32 q5,d29,d2
2340 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
2341 + vmlal.u32 q8,d25,d1
2342 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
2343 + vmlal.u32 q6,d21,d1
2344 + vmlal.u32 q9,d27,d1
2345 + vmlal.u32 q7,d23,d1
2346 +
2347 + vmlal.u32 q8,d23,d3
2348 + vld1.32 d8[1],[r7,:32]
2349 + vmlal.u32 q5,d27,d4
2350 + vld1.32 d8[0],[r6,:32]
2351 + vmlal.u32 q9,d25,d3
2352 + vmlal.u32 q6,d29,d4
2353 + vmlal.u32 q7,d21,d3
2354 +
2355 + vmlal.u32 q8,d21,d5
2356 + it ne
2357 + addne r7,r0,#(48+2*9*4)
2358 + vmlal.u32 q5,d25,d6
2359 + it ne
2360 + addne r6,r0,#(48+3*9*4)
2361 + vmlal.u32 q9,d23,d5
2362 + vmlal.u32 q6,d27,d6
2363 + vmlal.u32 q7,d29,d6
2364 +
2365 + vmlal.u32 q8,d29,d8
2366 + vorn q0,q0,q0 @ all-ones, can be redundant
2367 + vmlal.u32 q5,d23,d8
2368 + vshr.u64 q0,q0,#38
2369 + vmlal.u32 q9,d21,d7
2370 + vmlal.u32 q6,d25,d8
2371 + vmlal.u32 q7,d27,d8
2372 +
2373 + beq .Lshort_tail
2374 +
2375 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2376 + @ (hash+inp[0:1])*r^4:r^3 and accumulate
2377 +
2378 + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
2379 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
2380 +
2381 + vmlal.u32 q7,d24,d0
2382 + vmlal.u32 q5,d20,d0
2383 + vmlal.u32 q8,d26,d0
2384 + vmlal.u32 q6,d22,d0
2385 + vmlal.u32 q9,d28,d0
2386 +
2387 + vmlal.u32 q5,d28,d2
2388 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
2389 + vmlal.u32 q8,d24,d1
2390 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
2391 + vmlal.u32 q6,d20,d1
2392 + vmlal.u32 q9,d26,d1
2393 + vmlal.u32 q7,d22,d1
2394 +
2395 + vmlal.u32 q8,d22,d3
2396 + vld1.32 d8[1],[r7,:32]
2397 + vmlal.u32 q5,d26,d4
2398 + vld1.32 d8[0],[r6,:32]
2399 + vmlal.u32 q9,d24,d3
2400 + vmlal.u32 q6,d28,d4
2401 + vmlal.u32 q7,d20,d3
2402 +
2403 + vmlal.u32 q8,d20,d5
2404 + vmlal.u32 q5,d24,d6
2405 + vmlal.u32 q9,d22,d5
2406 + vmlal.u32 q6,d26,d6
2407 + vmlal.u32 q7,d28,d6
2408 +
2409 + vmlal.u32 q8,d28,d8
2410 + vorn q0,q0,q0 @ all-ones
2411 + vmlal.u32 q5,d22,d8
2412 + vshr.u64 q0,q0,#38
2413 + vmlal.u32 q9,d20,d7
2414 + vmlal.u32 q6,d24,d8
2415 + vmlal.u32 q7,d26,d8
2416 +
2417 +.Lshort_tail:
2418 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2419 + @ horizontal addition
2420 +
2421 + vadd.i64 d16,d16,d17
2422 + vadd.i64 d10,d10,d11
2423 + vadd.i64 d18,d18,d19
2424 + vadd.i64 d12,d12,d13
2425 + vadd.i64 d14,d14,d15
2426 +
2427 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2428 + @ lazy reduction, but without narrowing
2429 +
2430 + vshr.u64 q15,q8,#26
2431 + vand.i64 q8,q8,q0
2432 + vshr.u64 q4,q5,#26
2433 + vand.i64 q5,q5,q0
2434 + vadd.i64 q9,q9,q15 @ h3 -> h4
2435 + vadd.i64 q6,q6,q4 @ h0 -> h1
2436 +
2437 + vshr.u64 q15,q9,#26
2438 + vand.i64 q9,q9,q0
2439 + vshr.u64 q4,q6,#26
2440 + vand.i64 q6,q6,q0
2441 + vadd.i64 q7,q7,q4 @ h1 -> h2
2442 +
2443 + vadd.i64 q5,q5,q15
2444 + vshl.u64 q15,q15,#2
2445 + vshr.u64 q4,q7,#26
2446 + vand.i64 q7,q7,q0
2447 + vadd.i64 q5,q5,q15 @ h4 -> h0
2448 + vadd.i64 q8,q8,q4 @ h2 -> h3
2449 +
2450 + vshr.u64 q15,q5,#26
2451 + vand.i64 q5,q5,q0
2452 + vshr.u64 q4,q8,#26
2453 + vand.i64 q8,q8,q0
2454 + vadd.i64 q6,q6,q15 @ h0 -> h1
2455 + vadd.i64 q9,q9,q4 @ h3 -> h4
2456 +
2457 + cmp r2,#0
2458 + bne .Leven
2459 +
2460 + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2461 + @ store hash value
2462 +
2463 + vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
2464 + vst1.32 {d18[0]},[r0]
2465 +
2466 + vldmia sp!,{d8-d15} @ epilogue
2467 + ldmia sp!,{r4-r7}
2468 + bx lr @ bx lr
2469 +.size poly1305_blocks_neon,.-poly1305_blocks_neon
2470 +
2471 +.align 5
2472 +.Lzeros:
2473 +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2474 +#ifndef __KERNEL__
2475 +.LOPENSSL_armcap:
2476 +# ifdef _WIN32
2477 +.word OPENSSL_armcap_P
2478 +# else
2479 +.word OPENSSL_armcap_P-.Lpoly1305_init
2480 +# endif
2481 +.comm OPENSSL_armcap_P,4,4
2482 +.hidden OPENSSL_armcap_P
2483 +#endif
2484 +#endif
2485 +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
2486 +.align 2
2487 --- /dev/null
2488 +++ b/arch/arm/crypto/poly1305-glue.c
2489 @@ -0,0 +1,276 @@
2490 +// SPDX-License-Identifier: GPL-2.0
2491 +/*
2492 + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
2493 + *
2494 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
2495 + */
2496 +
2497 +#include <asm/hwcap.h>
2498 +#include <asm/neon.h>
2499 +#include <asm/simd.h>
2500 +#include <asm/unaligned.h>
2501 +#include <crypto/algapi.h>
2502 +#include <crypto/internal/hash.h>
2503 +#include <crypto/internal/poly1305.h>
2504 +#include <crypto/internal/simd.h>
2505 +#include <linux/cpufeature.h>
2506 +#include <linux/crypto.h>
2507 +#include <linux/jump_label.h>
2508 +#include <linux/module.h>
2509 +
2510 +void poly1305_init_arm(void *state, const u8 *key);
2511 +void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
2512 +void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
2513 +
2514 +void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
2515 +{
2516 +}
2517 +
2518 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
2519 +
2520 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
2521 +{
2522 + poly1305_init_arm(&dctx->h, key);
2523 + dctx->s[0] = get_unaligned_le32(key + 16);
2524 + dctx->s[1] = get_unaligned_le32(key + 20);
2525 + dctx->s[2] = get_unaligned_le32(key + 24);
2526 + dctx->s[3] = get_unaligned_le32(key + 28);
2527 + dctx->buflen = 0;
2528 +}
2529 +EXPORT_SYMBOL(poly1305_init_arch);
2530 +
2531 +static int arm_poly1305_init(struct shash_desc *desc)
2532 +{
2533 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2534 +
2535 + dctx->buflen = 0;
2536 + dctx->rset = 0;
2537 + dctx->sset = false;
2538 +
2539 + return 0;
2540 +}
2541 +
2542 +static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
2543 + u32 len, u32 hibit, bool do_neon)
2544 +{
2545 + if (unlikely(!dctx->sset)) {
2546 + if (!dctx->rset) {
2547 + poly1305_init_arm(&dctx->h, src);
2548 + src += POLY1305_BLOCK_SIZE;
2549 + len -= POLY1305_BLOCK_SIZE;
2550 + dctx->rset = 1;
2551 + }
2552 + if (len >= POLY1305_BLOCK_SIZE) {
2553 + dctx->s[0] = get_unaligned_le32(src + 0);
2554 + dctx->s[1] = get_unaligned_le32(src + 4);
2555 + dctx->s[2] = get_unaligned_le32(src + 8);
2556 + dctx->s[3] = get_unaligned_le32(src + 12);
2557 + src += POLY1305_BLOCK_SIZE;
2558 + len -= POLY1305_BLOCK_SIZE;
2559 + dctx->sset = true;
2560 + }
2561 + if (len < POLY1305_BLOCK_SIZE)
2562 + return;
2563 + }
2564 +
2565 + len &= ~(POLY1305_BLOCK_SIZE - 1);
2566 +
2567 + if (static_branch_likely(&have_neon) && likely(do_neon))
2568 + poly1305_blocks_neon(&dctx->h, src, len, hibit);
2569 + else
2570 + poly1305_blocks_arm(&dctx->h, src, len, hibit);
2571 +}
2572 +
2573 +static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
2574 + const u8 *src, u32 len, bool do_neon)
2575 +{
2576 + if (unlikely(dctx->buflen)) {
2577 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
2578 +
2579 + memcpy(dctx->buf + dctx->buflen, src, bytes);
2580 + src += bytes;
2581 + len -= bytes;
2582 + dctx->buflen += bytes;
2583 +
2584 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2585 + arm_poly1305_blocks(dctx, dctx->buf,
2586 + POLY1305_BLOCK_SIZE, 1, false);
2587 + dctx->buflen = 0;
2588 + }
2589 + }
2590 +
2591 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
2592 + arm_poly1305_blocks(dctx, src, len, 1, do_neon);
2593 + src += round_down(len, POLY1305_BLOCK_SIZE);
2594 + len %= POLY1305_BLOCK_SIZE;
2595 + }
2596 +
2597 + if (unlikely(len)) {
2598 + dctx->buflen = len;
2599 + memcpy(dctx->buf, src, len);
2600 + }
2601 +}
2602 +
2603 +static int arm_poly1305_update(struct shash_desc *desc,
2604 + const u8 *src, unsigned int srclen)
2605 +{
2606 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2607 +
2608 + arm_poly1305_do_update(dctx, src, srclen, false);
2609 + return 0;
2610 +}
2611 +
2612 +static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
2613 + const u8 *src,
2614 + unsigned int srclen)
2615 +{
2616 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2617 + bool do_neon = crypto_simd_usable() && srclen > 128;
2618 +
2619 + if (static_branch_likely(&have_neon) && do_neon)
2620 + kernel_neon_begin();
2621 + arm_poly1305_do_update(dctx, src, srclen, do_neon);
2622 + if (static_branch_likely(&have_neon) && do_neon)
2623 + kernel_neon_end();
2624 + return 0;
2625 +}
2626 +
2627 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
2628 + unsigned int nbytes)
2629 +{
2630 + bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
2631 + crypto_simd_usable();
2632 +
2633 + if (unlikely(dctx->buflen)) {
2634 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
2635 +
2636 + memcpy(dctx->buf + dctx->buflen, src, bytes);
2637 + src += bytes;
2638 + nbytes -= bytes;
2639 + dctx->buflen += bytes;
2640 +
2641 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2642 + poly1305_blocks_arm(&dctx->h, dctx->buf,
2643 + POLY1305_BLOCK_SIZE, 1);
2644 + dctx->buflen = 0;
2645 + }
2646 + }
2647 +
2648 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
2649 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
2650 +
2651 + if (static_branch_likely(&have_neon) && do_neon) {
2652 + kernel_neon_begin();
2653 + poly1305_blocks_neon(&dctx->h, src, len, 1);
2654 + kernel_neon_end();
2655 + } else {
2656 + poly1305_blocks_arm(&dctx->h, src, len, 1);
2657 + }
2658 + src += len;
2659 + nbytes %= POLY1305_BLOCK_SIZE;
2660 + }
2661 +
2662 + if (unlikely(nbytes)) {
2663 + dctx->buflen = nbytes;
2664 + memcpy(dctx->buf, src, nbytes);
2665 + }
2666 +}
2667 +EXPORT_SYMBOL(poly1305_update_arch);
2668 +
2669 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2670 +{
2671 + __le32 digest[4];
2672 + u64 f = 0;
2673 +
2674 + if (unlikely(dctx->buflen)) {
2675 + dctx->buf[dctx->buflen++] = 1;
2676 + memset(dctx->buf + dctx->buflen, 0,
2677 + POLY1305_BLOCK_SIZE - dctx->buflen);
2678 + poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2679 + }
2680 +
2681 + poly1305_emit_arm(&dctx->h, digest, dctx->s);
2682 +
2683 + /* mac = (h + s) % (2^128) */
2684 + f = (f >> 32) + le32_to_cpu(digest[0]);
2685 + put_unaligned_le32(f, dst);
2686 + f = (f >> 32) + le32_to_cpu(digest[1]);
2687 + put_unaligned_le32(f, dst + 4);
2688 + f = (f >> 32) + le32_to_cpu(digest[2]);
2689 + put_unaligned_le32(f, dst + 8);
2690 + f = (f >> 32) + le32_to_cpu(digest[3]);
2691 + put_unaligned_le32(f, dst + 12);
2692 +
2693 + *dctx = (struct poly1305_desc_ctx){};
2694 +}
2695 +EXPORT_SYMBOL(poly1305_final_arch);
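For context, a sketch of how a caller reaches these exports, assuming the generic poly1305_init/update/final wrappers in <crypto/poly1305.h> added earlier in this series, which route to the *_arch functions when CRYPTO_ARCH_HAVE_LIB_POLY1305 is selected; the function name below is illustrative:

#include <linux/types.h>
#include <crypto/poly1305.h>

/* One-shot MAC over a buffer through the Poly1305 library interface. */
static void example_poly1305_mac(u8 mac[POLY1305_DIGEST_SIZE],
				 const u8 *data, unsigned int len,
				 const u8 *key) /* POLY1305_KEY_SIZE bytes */
{
	struct poly1305_desc_ctx ctx;

	poly1305_init(&ctx, key);	  /* reaches poly1305_init_arch() here */
	poly1305_update(&ctx, data, len); /* scalar or NEON blocks, see above  */
	poly1305_final(&ctx, mac);	  /* reaches poly1305_final_arch()     */
}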
2696 +
2697 +static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
2698 +{
2699 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2700 +
2701 + if (unlikely(!dctx->sset))
2702 + return -ENOKEY;
2703 +
2704 + poly1305_final_arch(dctx, dst);
2705 + return 0;
2706 +}
2707 +
2708 +static struct shash_alg arm_poly1305_algs[] = {{
2709 + .init = arm_poly1305_init,
2710 + .update = arm_poly1305_update,
2711 + .final = arm_poly1305_final,
2712 + .digestsize = POLY1305_DIGEST_SIZE,
2713 + .descsize = sizeof(struct poly1305_desc_ctx),
2714 +
2715 + .base.cra_name = "poly1305",
2716 + .base.cra_driver_name = "poly1305-arm",
2717 + .base.cra_priority = 150,
2718 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
2719 + .base.cra_module = THIS_MODULE,
2720 +#ifdef CONFIG_KERNEL_MODE_NEON
2721 +}, {
2722 + .init = arm_poly1305_init,
2723 + .update = arm_poly1305_update_neon,
2724 + .final = arm_poly1305_final,
2725 + .digestsize = POLY1305_DIGEST_SIZE,
2726 + .descsize = sizeof(struct poly1305_desc_ctx),
2727 +
2728 + .base.cra_name = "poly1305",
2729 + .base.cra_driver_name = "poly1305-neon",
2730 + .base.cra_priority = 200,
2731 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
2732 + .base.cra_module = THIS_MODULE,
2733 +#endif
2734 +}};
2735 +
2736 +static int __init arm_poly1305_mod_init(void)
2737 +{
2738 + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
2739 + (elf_hwcap & HWCAP_NEON))
2740 + static_branch_enable(&have_neon);
2741 + else
2742 + /* register only the first entry */
2743 + return crypto_register_shash(&arm_poly1305_algs[0]);
2744 +
2745 + return crypto_register_shashes(arm_poly1305_algs,
2746 + ARRAY_SIZE(arm_poly1305_algs));
2747 +}
2748 +
2749 +static void __exit arm_poly1305_mod_exit(void)
2750 +{
2751 + if (!static_branch_likely(&have_neon)) {
2752 + crypto_unregister_shash(&arm_poly1305_algs[0]);
2753 + return;
2754 + }
2755 + crypto_unregister_shashes(arm_poly1305_algs,
2756 + ARRAY_SIZE(arm_poly1305_algs));
2757 +}
2758 +
2759 +module_init(arm_poly1305_mod_init);
2760 +module_exit(arm_poly1305_mod_exit);
2761 +
2762 +MODULE_LICENSE("GPL v2");
2763 +MODULE_ALIAS_CRYPTO("poly1305");
2764 +MODULE_ALIAS_CRYPTO("poly1305-arm");
2765 +MODULE_ALIAS_CRYPTO("poly1305-neon");
2766 --- a/lib/crypto/Kconfig
2767 +++ b/lib/crypto/Kconfig
2768 @@ -40,7 +40,7 @@ config CRYPTO_LIB_DES
2769 config CRYPTO_LIB_POLY1305_RSIZE
2770 int
2771 default 4 if X86_64
2772 - default 9 if ARM64
2773 + default 9 if ARM || ARM64
2774 default 1
2775
2776 config CRYPTO_ARCH_HAVE_LIB_POLY1305