kernel: 5.4: import wireguard backport
openwrt/openwrt.git: target/linux/generic/backport-5.4/080-wireguard-0042-crypto-x86-poly1305-import-unmodified-cryptogams-imp.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 5 Jan 2020 22:40:47 -0500
4 Subject: [PATCH] crypto: x86/poly1305 - import unmodified cryptogams
5 implementation
6
7 commit 0896ca2a0cb6127e8a129f1f2a680d49b6b0f65c upstream.
8
9 These x86_64 vectorized implementations come from Andy Polyakov's
10 CRYPTOGAMS implementation, and are included here in raw form without
11 modification, so that subsequent commits that fix these up for the
12 kernel can see how it has changed.
13
14 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
15 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
16 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
17 ---
18 arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 4159 +++++++++++++++++
19 1 file changed, 4159 insertions(+)
20 create mode 100644 arch/x86/crypto/poly1305-x86_64-cryptogams.pl
21
22 --- /dev/null
23 +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
24 @@ -0,0 +1,4159 @@
25 +#! /usr/bin/env perl
26 +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
27 +#
28 +# Licensed under the OpenSSL license (the "License"). You may not use
29 +# this file except in compliance with the License. You can obtain a copy
30 +# in the file LICENSE in the source distribution or at
31 +# https://www.openssl.org/source/license.html
32 +
33 +#
34 +# ====================================================================
35 +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
36 +# project. The module is, however, dual licensed under OpenSSL and
37 +# CRYPTOGAMS licenses depending on where you obtain it. For further
38 +# details see http://www.openssl.org/~appro/cryptogams/.
39 +# ====================================================================
40 +#
41 +# This module implements Poly1305 hash for x86_64.
42 +#
43 +# March 2015
44 +#
45 +# Initial release.
46 +#
47 +# December 2016
48 +#
49 +# Add AVX512F+VL+BW code path.
50 +#
51 +# November 2017
52 +#
53 +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
54 +# executed even on Knights Landing. Trigger for modification was
55 +# observation that AVX512 code paths can negatively affect overall
56 +# Skylake-X system performance. Since we are likely to suppress
57 +# AVX512F capability flag [at least on Skylake-X], conversion serves
58 +# as kind of "investment protection". Note that next *lake processor,
59 +# Cannolake, has AVX512IFMA code path to execute...
60 +#
61 +# Numbers are cycles per processed byte with poly1305_blocks alone,
62 +# measured with rdtsc at fixed clock frequency.
63 +#
64 +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
65 +# P4 4.46/+120% -
66 +# Core 2 2.41/+90% -
67 +# Westmere 1.88/+120% -
68 +# Sandy Bridge 1.39/+140% 1.10
69 +# Haswell 1.14/+175% 1.11 0.65
70 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
71 +# Silvermont 2.83/+95% -
72 +# Knights L 3.60/? 1.65 1.10 0.41(***)
73 +# Goldmont 1.70/+180% -
74 +# VIA Nano 1.82/+150% -
75 +# Sledgehammer 1.38/+160% -
76 +# Bulldozer 2.30/+130% 0.97
77 +# Ryzen 1.15/+200% 1.08 1.18
78 +#
79 +# (*) improvement coefficients relative to clang are more modest and
80 +# are ~50% on most processors, in both cases we are comparing to
81 +# __int128 code;
82 +# (**) SSE2 implementation was attempted, but among non-AVX processors
83 +# it was faster than integer-only code only on older Intel P4 and
84 +# Core processors, 50-30%, less newer processor is, but slower on
85 +# contemporary ones, for example almost 2x slower on Atom, and as
86 +# former are naturally disappearing, SSE2 is deemed unnecessary;
87 +# (***) strangely enough performance seems to vary from core to core,
88 +# listed result is best case;
89 +
90 +$flavour = shift;
91 +$output = shift;
92 +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
93 +
94 +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
95 +
96 +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
97 +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
98 +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
99 +die "can't locate x86_64-xlate.pl";
100 +
101 +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 + =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
104 +}
105 +
106 +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
108 + $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
109 + $avx += 2 if ($1==2.11 && $2>=8);
110 +}
111 +
112 +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
113 + `ml64 2>&1` =~ /Version ([0-9]+)\./) {
114 + $avx = ($1>=10) + ($1>=12);
115 +}
116 +
117 +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
118 + $avx = ($2>=3.0) + ($2>3.0);
119 +}
120 +
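The probes above set $avx to a level from 0 to 4 depending on the assembler or compiler found; every later `if ($avx ...)` guard then decides how much SIMD code is emitted (the AVX path at 1, AVX2 above 1, the AVX-512 / base 2^44 material above 3). A rough Python model of just the GNU assembler branch, keeping the same version thresholds (the helper name is illustrative):

    def avx_level_from_gas(version):
        # Each newer binutils release unlocks one more SIMD tier in the
        # generated code; thresholds mirror the Perl above.
        return sum(version >= t for t in (2.19, 2.22, 2.25, 2.26))

    assert avx_level_from_gas(2.30) == 4
    assert avx_level_from_gas(2.20) == 1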
121 +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
122 +*STDOUT=*OUT;
123 +
124 +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
125 +my ($mac,$nonce)=($inp,$len); # *_emit arguments
126 +my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
127 +my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
128 +
129 +sub poly1305_iteration {
130 +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
131 +# output: $h0-$h2 *= $r0-$r1
132 +$code.=<<___;
133 + mulq $h0 # h0*r1
134 + mov %rax,$d2
135 + mov $r0,%rax
136 + mov %rdx,$d3
137 +
138 + mulq $h0 # h0*r0
139 + mov %rax,$h0 # future $h0
140 + mov $r0,%rax
141 + mov %rdx,$d1
142 +
143 + mulq $h1 # h1*r0
144 + add %rax,$d2
145 + mov $s1,%rax
146 + adc %rdx,$d3
147 +
148 + mulq $h1 # h1*s1
149 + mov $h2,$h1 # borrow $h1
150 + add %rax,$h0
151 + adc %rdx,$d1
152 +
153 + imulq $s1,$h1 # h2*s1
154 + add $h1,$d2
155 + mov $d1,$h1
156 + adc \$0,$d3
157 +
158 + imulq $r0,$h2 # h2*r0
159 + add $d2,$h1
160 + mov \$-4,%rax # mask value
161 + adc $h2,$d3
162 +
163 + and $d3,%rax # last reduction step
164 + mov $d3,$h2
165 + shr \$2,$d3
166 + and \$3,$h2
167 + add $d3,%rax
168 + add %rax,$h0
169 + adc \$0,$h1
170 + adc \$0,$h2
171 +___
172 +}
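For reference, the arithmetic poly1305_iteration emits can be modelled in Python as below: the accumulator lives in three 64-bit limbs, s1 = r1 + (r1 >> 2) folds the partial products at 2^128 and above straight back down (2^130 == 5 mod 2^130-5, and the clamped r1 has its low two bits clear), and the tail is the "last reduction step" with the -4 mask. This is a model of the math, not of the register allocation:

    P   = (1 << 130) - 5
    M64 = (1 << 64) - 1

    def poly1305_iteration(h0, h1, h2, r0, r1):
        # (h0 + 2^64*h1 + 2^128*h2) *= (r0 + 2^64*r1)  modulo 2^130 - 5
        s1 = r1 + (r1 >> 2)                 # == 5*r1/4, exact because r1 % 4 == 0

        d0 = h0 * r0 + h1 * s1              # 2^0   terms (h1*r1 folded down)
        d1 = h0 * r1 + h1 * r0 + h2 * s1    # 2^64  terms (h2*r1 folded down)
        d2 = h2 * r0                        # 2^128 terms

        d1 += d0 >> 64; d0 &= M64           # propagate carries upward
        d2 += d1 >> 64; d1 &= M64

        # "last reduction step": keep two bits in h2, add 5*(d2 >> 2) to h0
        h2 = d2 & 3
        c  = (d2 & ~3) + (d2 >> 2)          # == 5 * (d2 >> 2), the -4 mask trick
        h0 = d0 + c
        h1 = d1 + (h0 >> 64); h0 &= M64
        h2 = h2 + (h1 >> 64); h1 &= M64
        return h0, h1, h2

    # sanity check against plain modular arithmetic
    h0, h1, h2 = 1, 2, 3
    r0, r1 = 0x0ffffffc0fffffff, 0x0ffffffc0ffffffc
    out = poly1305_iteration(h0, h1, h2, r0, r1)
    assert sum(x << (64 * i) for i, x in enumerate(out)) % P == \
           ((h0 + (h1 << 64) + (h2 << 128)) * (r0 + (r1 << 64))) % P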
173 +
174 +########################################################################
175 +# Layout of opaque area is following.
176 +#
177 +# unsigned __int64 h[3]; # current hash value base 2^64
178 +# unsigned __int64 r[2]; # key value base 2^64
179 +
180 +$code.=<<___;
181 +.text
182 +
183 +.extern OPENSSL_ia32cap_P
184 +
185 +.globl poly1305_init
186 +.hidden poly1305_init
187 +.globl poly1305_blocks
188 +.hidden poly1305_blocks
189 +.globl poly1305_emit
190 +.hidden poly1305_emit
191 +
192 +.type poly1305_init,\@function,3
193 +.align 32
194 +poly1305_init:
195 + xor %rax,%rax
196 + mov %rax,0($ctx) # initialize hash value
197 + mov %rax,8($ctx)
198 + mov %rax,16($ctx)
199 +
200 + cmp \$0,$inp
201 + je .Lno_key
202 +
203 + lea poly1305_blocks(%rip),%r10
204 + lea poly1305_emit(%rip),%r11
205 +___
206 +$code.=<<___ if ($avx);
207 + mov OPENSSL_ia32cap_P+4(%rip),%r9
208 + lea poly1305_blocks_avx(%rip),%rax
209 + lea poly1305_emit_avx(%rip),%rcx
210 + bt \$`60-32`,%r9 # AVX?
211 + cmovc %rax,%r10
212 + cmovc %rcx,%r11
213 +___
214 +$code.=<<___ if ($avx>1);
215 + lea poly1305_blocks_avx2(%rip),%rax
216 + bt \$`5+32`,%r9 # AVX2?
217 + cmovc %rax,%r10
218 +___
219 +$code.=<<___ if ($avx>3);
220 + mov \$`(1<<31|1<<21|1<<16)`,%rax
221 + shr \$32,%r9
222 + and %rax,%r9
223 + cmp %rax,%r9
224 + je .Linit_base2_44
225 +___
226 +$code.=<<___;
227 + mov \$0x0ffffffc0fffffff,%rax
228 + mov \$0x0ffffffc0ffffffc,%rcx
229 + and 0($inp),%rax
230 + and 8($inp),%rcx
231 + mov %rax,24($ctx)
232 + mov %rcx,32($ctx)
233 +___
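The two 64-bit constants are the standard Poly1305 clamping of r: the first 16 key bytes become r with the bits required by the spec forced to zero. A short Python equivalent (the helper name is illustrative); clearing the low two bits of r1 is what makes the s1 = r1 + (r1 >> 2) folding in poly1305_iteration exact:

    def clamp_r(key16: bytes):
        r0 = int.from_bytes(key16[0:8],  "little") & 0x0ffffffc0fffffff
        r1 = int.from_bytes(key16[8:16], "little") & 0x0ffffffc0ffffffc
        return r0, r1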
234 +$code.=<<___ if ($flavour !~ /elf32/);
235 + mov %r10,0(%rdx)
236 + mov %r11,8(%rdx)
237 +___
238 +$code.=<<___ if ($flavour =~ /elf32/);
239 + mov %r10d,0(%rdx)
240 + mov %r11d,4(%rdx)
241 +___
242 +$code.=<<___;
243 + mov \$1,%eax
244 +.Lno_key:
245 + ret
246 +.size poly1305_init,.-poly1305_init
247 +
248 +.type poly1305_blocks,\@function,4
249 +.align 32
250 +poly1305_blocks:
251 +.cfi_startproc
252 +.Lblocks:
253 + shr \$4,$len
254 + jz .Lno_data # too short
255 +
256 + push %rbx
257 +.cfi_push %rbx
258 + push %rbp
259 +.cfi_push %rbp
260 + push %r12
261 +.cfi_push %r12
262 + push %r13
263 +.cfi_push %r13
264 + push %r14
265 +.cfi_push %r14
266 + push %r15
267 +.cfi_push %r15
268 +.Lblocks_body:
269 +
270 + mov $len,%r15 # reassign $len
271 +
272 + mov 24($ctx),$r0 # load r
273 + mov 32($ctx),$s1
274 +
275 + mov 0($ctx),$h0 # load hash value
276 + mov 8($ctx),$h1
277 + mov 16($ctx),$h2
278 +
279 + mov $s1,$r1
280 + shr \$2,$s1
281 + mov $r1,%rax
282 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
283 + jmp .Loop
284 +
285 +.align 32
286 +.Loop:
287 + add 0($inp),$h0 # accumulate input
288 + adc 8($inp),$h1
289 + lea 16($inp),$inp
290 + adc $padbit,$h2
291 +___
292 + &poly1305_iteration();
293 +$code.=<<___;
294 + mov $r1,%rax
295 + dec %r15 # len-=16
296 + jnz .Loop
297 +
298 + mov $h0,0($ctx) # store hash value
299 + mov $h1,8($ctx)
300 + mov $h2,16($ctx)
301 +
302 + mov 0(%rsp),%r15
303 +.cfi_restore %r15
304 + mov 8(%rsp),%r14
305 +.cfi_restore %r14
306 + mov 16(%rsp),%r13
307 +.cfi_restore %r13
308 + mov 24(%rsp),%r12
309 +.cfi_restore %r12
310 + mov 32(%rsp),%rbp
311 +.cfi_restore %rbp
312 + mov 40(%rsp),%rbx
313 +.cfi_restore %rbx
314 + lea 48(%rsp),%rsp
315 +.cfi_adjust_cfa_offset -48
316 +.Lno_data:
317 +.Lblocks_epilogue:
318 + ret
319 +.cfi_endproc
320 +.size poly1305_blocks,.-poly1305_blocks
321 +
322 +.type poly1305_emit,\@function,3
323 +.align 32
324 +poly1305_emit:
325 +.Lemit:
326 + mov 0($ctx),%r8 # load hash value
327 + mov 8($ctx),%r9
328 + mov 16($ctx),%r10
329 +
330 + mov %r8,%rax
331 + add \$5,%r8 # compare to modulus
332 + mov %r9,%rcx
333 + adc \$0,%r9
334 + adc \$0,%r10
335 + shr \$2,%r10 # did 130-bit value overflow?
336 + cmovnz %r8,%rax
337 + cmovnz %r9,%rcx
338 +
339 + add 0($nonce),%rax # accumulate nonce
340 + adc 8($nonce),%rcx
341 + mov %rax,0($mac) # write result
342 + mov %rcx,8($mac)
343 +
344 + ret
345 +.size poly1305_emit,.-poly1305_emit
346 +___
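poly1305_emit is the only place a full reduction happens: the accumulator stays partially reduced (below twice the modulus), so one conditional subtraction suffices, implemented as "add 5, test bit 130, cmov". A small Python model of that selection, assuming h < 2*(2^130-5):

    P = (1 << 130) - 5

    def poly1305_emit(h, nonce):
        # h: partially reduced accumulator; nonce: the 128-bit value at 0/8($nonce)
        g = h + 5
        if g >> 130:                 # h + 5 carried past bit 129, i.e. h >= P
            h = g                    # low 128 bits of g equal h - P mod 2^128
        return (h + nonce) & ((1 << 128) - 1)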
347 +if ($avx) {
348 +
349 +########################################################################
350 +# Layout of opaque area is following.
351 +#
352 +# unsigned __int32 h[5]; # current hash value base 2^26
353 +# unsigned __int32 is_base2_26;
354 +# unsigned __int64 r[2]; # key value base 2^64
355 +# unsigned __int64 pad;
356 +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
357 +#
358 +# where r^n are base 2^26 digits of degrees of multiplier key. There are
359 +# 5 digits, but last four are interleaved with multiples of 5, totalling
360 +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
361 +
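A Python sketch of this layout may help: the hash and the key powers are kept as five 26-bit digits, and each stored power r^n carries its upper four digits both plainly and premultiplied by 5, giving the nine elements listed above. Helper names are illustrative; the real table additionally interleaves r^2, r^1, r^4, r^3 across the 32-bit lanes of each 16-byte slot:

    P      = (1 << 130) - 5
    MASK26 = (1 << 26) - 1

    def to_limbs26(x):
        # five base-2^26 digits, least significant first
        return [(x >> (26 * i)) & MASK26 for i in range(5)]

    def power_row(r, n):
        # digits of r^n mod P; digits 1..4 are also stored times 5 so the
        # wrap-around products in the vector loop need no extra multiply
        limbs = to_limbs26(pow(r, n, P))
        row = [limbs[0]]
        for l in limbs[1:]:
            row += [l, 5 * l]
        return row                                # r0, r1, 5*r1, ..., r4, 5*r4

    table = {n: power_row(r=3, n=n) for n in (1, 2, 3, 4)}   # r=3 only for illustration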
362 +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
363 + map("%xmm$_",(0..15));
364 +
365 +$code.=<<___;
366 +.type __poly1305_block,\@abi-omnipotent
367 +.align 32
368 +__poly1305_block:
369 +___
370 + &poly1305_iteration();
371 +$code.=<<___;
372 + ret
373 +.size __poly1305_block,.-__poly1305_block
374 +
375 +.type __poly1305_init_avx,\@abi-omnipotent
376 +.align 32
377 +__poly1305_init_avx:
378 + mov $r0,$h0
379 + mov $r1,$h1
380 + xor $h2,$h2
381 +
382 + lea 48+64($ctx),$ctx # size optimization
383 +
384 + mov $r1,%rax
385 + call __poly1305_block # r^2
386 +
387 + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
388 + mov \$0x3ffffff,%edx
389 + mov $h0,$d1
390 + and $h0#d,%eax
391 + mov $r0,$d2
392 + and $r0#d,%edx
393 + mov %eax,`16*0+0-64`($ctx)
394 + shr \$26,$d1
395 + mov %edx,`16*0+4-64`($ctx)
396 + shr \$26,$d2
397 +
398 + mov \$0x3ffffff,%eax
399 + mov \$0x3ffffff,%edx
400 + and $d1#d,%eax
401 + and $d2#d,%edx
402 + mov %eax,`16*1+0-64`($ctx)
403 + lea (%rax,%rax,4),%eax # *5
404 + mov %edx,`16*1+4-64`($ctx)
405 + lea (%rdx,%rdx,4),%edx # *5
406 + mov %eax,`16*2+0-64`($ctx)
407 + shr \$26,$d1
408 + mov %edx,`16*2+4-64`($ctx)
409 + shr \$26,$d2
410 +
411 + mov $h1,%rax
412 + mov $r1,%rdx
413 + shl \$12,%rax
414 + shl \$12,%rdx
415 + or $d1,%rax
416 + or $d2,%rdx
417 + and \$0x3ffffff,%eax
418 + and \$0x3ffffff,%edx
419 + mov %eax,`16*3+0-64`($ctx)
420 + lea (%rax,%rax,4),%eax # *5
421 + mov %edx,`16*3+4-64`($ctx)
422 + lea (%rdx,%rdx,4),%edx # *5
423 + mov %eax,`16*4+0-64`($ctx)
424 + mov $h1,$d1
425 + mov %edx,`16*4+4-64`($ctx)
426 + mov $r1,$d2
427 +
428 + mov \$0x3ffffff,%eax
429 + mov \$0x3ffffff,%edx
430 + shr \$14,$d1
431 + shr \$14,$d2
432 + and $d1#d,%eax
433 + and $d2#d,%edx
434 + mov %eax,`16*5+0-64`($ctx)
435 + lea (%rax,%rax,4),%eax # *5
436 + mov %edx,`16*5+4-64`($ctx)
437 + lea (%rdx,%rdx,4),%edx # *5
438 + mov %eax,`16*6+0-64`($ctx)
439 + shr \$26,$d1
440 + mov %edx,`16*6+4-64`($ctx)
441 + shr \$26,$d2
442 +
443 + mov $h2,%rax
444 + shl \$24,%rax
445 + or %rax,$d1
446 + mov $d1#d,`16*7+0-64`($ctx)
447 + lea ($d1,$d1,4),$d1 # *5
448 + mov $d2#d,`16*7+4-64`($ctx)
449 + lea ($d2,$d2,4),$d2 # *5
450 + mov $d1#d,`16*8+0-64`($ctx)
451 + mov $d2#d,`16*8+4-64`($ctx)
452 +
453 + mov $r1,%rax
454 + call __poly1305_block # r^3
455 +
456 + mov \$0x3ffffff,%eax # save r^3 base 2^26
457 + mov $h0,$d1
458 + and $h0#d,%eax
459 + shr \$26,$d1
460 + mov %eax,`16*0+12-64`($ctx)
461 +
462 + mov \$0x3ffffff,%edx
463 + and $d1#d,%edx
464 + mov %edx,`16*1+12-64`($ctx)
465 + lea (%rdx,%rdx,4),%edx # *5
466 + shr \$26,$d1
467 + mov %edx,`16*2+12-64`($ctx)
468 +
469 + mov $h1,%rax
470 + shl \$12,%rax
471 + or $d1,%rax
472 + and \$0x3ffffff,%eax
473 + mov %eax,`16*3+12-64`($ctx)
474 + lea (%rax,%rax,4),%eax # *5
475 + mov $h1,$d1
476 + mov %eax,`16*4+12-64`($ctx)
477 +
478 + mov \$0x3ffffff,%edx
479 + shr \$14,$d1
480 + and $d1#d,%edx
481 + mov %edx,`16*5+12-64`($ctx)
482 + lea (%rdx,%rdx,4),%edx # *5
483 + shr \$26,$d1
484 + mov %edx,`16*6+12-64`($ctx)
485 +
486 + mov $h2,%rax
487 + shl \$24,%rax
488 + or %rax,$d1
489 + mov $d1#d,`16*7+12-64`($ctx)
490 + lea ($d1,$d1,4),$d1 # *5
491 + mov $d1#d,`16*8+12-64`($ctx)
492 +
493 + mov $r1,%rax
494 + call __poly1305_block # r^4
495 +
496 + mov \$0x3ffffff,%eax # save r^4 base 2^26
497 + mov $h0,$d1
498 + and $h0#d,%eax
499 + shr \$26,$d1
500 + mov %eax,`16*0+8-64`($ctx)
501 +
502 + mov \$0x3ffffff,%edx
503 + and $d1#d,%edx
504 + mov %edx,`16*1+8-64`($ctx)
505 + lea (%rdx,%rdx,4),%edx # *5
506 + shr \$26,$d1
507 + mov %edx,`16*2+8-64`($ctx)
508 +
509 + mov $h1,%rax
510 + shl \$12,%rax
511 + or $d1,%rax
512 + and \$0x3ffffff,%eax
513 + mov %eax,`16*3+8-64`($ctx)
514 + lea (%rax,%rax,4),%eax # *5
515 + mov $h1,$d1
516 + mov %eax,`16*4+8-64`($ctx)
517 +
518 + mov \$0x3ffffff,%edx
519 + shr \$14,$d1
520 + and $d1#d,%edx
521 + mov %edx,`16*5+8-64`($ctx)
522 + lea (%rdx,%rdx,4),%edx # *5
523 + shr \$26,$d1
524 + mov %edx,`16*6+8-64`($ctx)
525 +
526 + mov $h2,%rax
527 + shl \$24,%rax
528 + or %rax,$d1
529 + mov $d1#d,`16*7+8-64`($ctx)
530 + lea ($d1,$d1,4),$d1 # *5
531 + mov $d1#d,`16*8+8-64`($ctx)
532 +
533 + lea -48-64($ctx),$ctx # size [de-]optimization
534 + ret
535 +.size __poly1305_init_avx,.-__poly1305_init_avx
536 +
537 +.type poly1305_blocks_avx,\@function,4
538 +.align 32
539 +poly1305_blocks_avx:
540 +.cfi_startproc
541 + mov 20($ctx),%r8d # is_base2_26
542 + cmp \$128,$len
543 + jae .Lblocks_avx
544 + test %r8d,%r8d
545 + jz .Lblocks
546 +
547 +.Lblocks_avx:
548 + and \$-16,$len
549 + jz .Lno_data_avx
550 +
551 + vzeroupper
552 +
553 + test %r8d,%r8d
554 + jz .Lbase2_64_avx
555 +
556 + test \$31,$len
557 + jz .Leven_avx
558 +
559 + push %rbx
560 +.cfi_push %rbx
561 + push %rbp
562 +.cfi_push %rbp
563 + push %r12
564 +.cfi_push %r12
565 + push %r13
566 +.cfi_push %r13
567 + push %r14
568 +.cfi_push %r14
569 + push %r15
570 +.cfi_push %r15
571 +.Lblocks_avx_body:
572 +
573 + mov $len,%r15 # reassign $len
574 +
575 + mov 0($ctx),$d1 # load hash value
576 + mov 8($ctx),$d2
577 + mov 16($ctx),$h2#d
578 +
579 + mov 24($ctx),$r0 # load r
580 + mov 32($ctx),$s1
581 +
582 + ################################# base 2^26 -> base 2^64
583 + mov $d1#d,$h0#d
584 + and \$`-1*(1<<31)`,$d1
585 + mov $d2,$r1 # borrow $r1
586 + mov $d2#d,$h1#d
587 + and \$`-1*(1<<31)`,$d2
588 +
589 + shr \$6,$d1
590 + shl \$52,$r1
591 + add $d1,$h0
592 + shr \$12,$h1
593 + shr \$18,$d2
594 + add $r1,$h0
595 + adc $d2,$h1
596 +
597 + mov $h2,$d1
598 + shl \$40,$d1
599 + shr \$24,$h2
600 + add $d1,$h1
601 + adc \$0,$h2 # can be partially reduced...
602 +
603 + mov \$-4,$d2 # ... so reduce
604 + mov $h2,$d1
605 + and $h2,$d2
606 + shr \$2,$d1
607 + and \$3,$h2
608 + add $d2,$d1 # =*5
609 + add $d1,$h0
610 + adc \$0,$h1
611 + adc \$0,$h2
612 +
613 + mov $s1,$r1
614 + mov $s1,%rax
615 + shr \$2,$s1
616 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
617 +
618 + add 0($inp),$h0 # accumulate input
619 + adc 8($inp),$h1
620 + lea 16($inp),$inp
621 + adc $padbit,$h2
622 +
623 + call __poly1305_block
624 +
625 + test $padbit,$padbit # if $padbit is zero,
626 + jz .Lstore_base2_64_avx # store hash in base 2^64 format
627 +
628 + ################################# base 2^64 -> base 2^26
629 + mov $h0,%rax
630 + mov $h0,%rdx
631 + shr \$52,$h0
632 + mov $h1,$r0
633 + mov $h1,$r1
634 + shr \$26,%rdx
635 + and \$0x3ffffff,%rax # h[0]
636 + shl \$12,$r0
637 + and \$0x3ffffff,%rdx # h[1]
638 + shr \$14,$h1
639 + or $r0,$h0
640 + shl \$24,$h2
641 + and \$0x3ffffff,$h0 # h[2]
642 + shr \$40,$r1
643 + and \$0x3ffffff,$h1 # h[3]
644 + or $r1,$h2 # h[4]
645 +
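Both conversions used throughout the AVX paths (base 2^26 <-> base 2^64) can be modelled compactly; the sketch below works from the five digits directly rather than from the packed 32-bit words in the context, and repeats the same multiply-the-overflow-by-5 partial reduction the assembly does after repacking:

    MASK26, M64 = (1 << 26) - 1, (1 << 64) - 1

    def base2_64_to_2_26(h0, h1, h2):
        # split three 64-bit limbs into 26-bit digits (the shl/shr/or sequence above)
        return ( h0                        & MASK26,
                 (h0 >> 26)                & MASK26,
                ((h0 >> 52) | (h1 << 12))  & MASK26,
                 (h1 >> 14)                & MASK26,
                 (h1 >> 40) | (h2 << 24))          # top digit keeps the spare bits

    def base2_26_to_2_64(d0, d1, d2, d3, d4):
        h  = d0 | (d1 << 26) | (d2 << 52) | (d3 << 78) | (d4 << 104)
        h0, h1, h2 = h & M64, (h >> 64) & M64, h >> 128
        c  = (h2 & ~3) + (h2 >> 2)        # 5 * (h2 >> 2): fold the overflow back
        h2 &= 3
        h0 += c
        h1 += h0 >> 64; h0 &= M64
        h2 += h1 >> 64; h1 &= M64
        return h0, h1, h2

    v = (1 << 129) + 123456789
    d = base2_64_to_2_26(v & M64, (v >> 64) & M64, v >> 128)
    back = base2_26_to_2_64(*d)
    assert sum(x << (64 * i) for i, x in enumerate(back)) % ((1 << 130) - 5) == v % ((1 << 130) - 5)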
646 + sub \$16,%r15
647 + jz .Lstore_base2_26_avx
648 +
649 + vmovd %rax#d,$H0
650 + vmovd %rdx#d,$H1
651 + vmovd $h0#d,$H2
652 + vmovd $h1#d,$H3
653 + vmovd $h2#d,$H4
654 + jmp .Lproceed_avx
655 +
656 +.align 32
657 +.Lstore_base2_64_avx:
658 + mov $h0,0($ctx)
659 + mov $h1,8($ctx)
660 + mov $h2,16($ctx) # note that is_base2_26 is zeroed
661 + jmp .Ldone_avx
662 +
663 +.align 16
664 +.Lstore_base2_26_avx:
665 + mov %rax#d,0($ctx) # store hash value base 2^26
666 + mov %rdx#d,4($ctx)
667 + mov $h0#d,8($ctx)
668 + mov $h1#d,12($ctx)
669 + mov $h2#d,16($ctx)
670 +.align 16
671 +.Ldone_avx:
672 + mov 0(%rsp),%r15
673 +.cfi_restore %r15
674 + mov 8(%rsp),%r14
675 +.cfi_restore %r14
676 + mov 16(%rsp),%r13
677 +.cfi_restore %r13
678 + mov 24(%rsp),%r12
679 +.cfi_restore %r12
680 + mov 32(%rsp),%rbp
681 +.cfi_restore %rbp
682 + mov 40(%rsp),%rbx
683 +.cfi_restore %rbx
684 + lea 48(%rsp),%rsp
685 +.cfi_adjust_cfa_offset -48
686 +.Lno_data_avx:
687 +.Lblocks_avx_epilogue:
688 + ret
689 +.cfi_endproc
690 +
691 +.align 32
692 +.Lbase2_64_avx:
693 +.cfi_startproc
694 + push %rbx
695 +.cfi_push %rbx
696 + push %rbp
697 +.cfi_push %rbp
698 + push %r12
699 +.cfi_push %r12
700 + push %r13
701 +.cfi_push %r13
702 + push %r14
703 +.cfi_push %r14
704 + push %r15
705 +.cfi_push %r15
706 +.Lbase2_64_avx_body:
707 +
708 + mov $len,%r15 # reassign $len
709 +
710 + mov 24($ctx),$r0 # load r
711 + mov 32($ctx),$s1
712 +
713 + mov 0($ctx),$h0 # load hash value
714 + mov 8($ctx),$h1
715 + mov 16($ctx),$h2#d
716 +
717 + mov $s1,$r1
718 + mov $s1,%rax
719 + shr \$2,$s1
720 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
721 +
722 + test \$31,$len
723 + jz .Linit_avx
724 +
725 + add 0($inp),$h0 # accumulate input
726 + adc 8($inp),$h1
727 + lea 16($inp),$inp
728 + adc $padbit,$h2
729 + sub \$16,%r15
730 +
731 + call __poly1305_block
732 +
733 +.Linit_avx:
734 + ################################# base 2^64 -> base 2^26
735 + mov $h0,%rax
736 + mov $h0,%rdx
737 + shr \$52,$h0
738 + mov $h1,$d1
739 + mov $h1,$d2
740 + shr \$26,%rdx
741 + and \$0x3ffffff,%rax # h[0]
742 + shl \$12,$d1
743 + and \$0x3ffffff,%rdx # h[1]
744 + shr \$14,$h1
745 + or $d1,$h0
746 + shl \$24,$h2
747 + and \$0x3ffffff,$h0 # h[2]
748 + shr \$40,$d2
749 + and \$0x3ffffff,$h1 # h[3]
750 + or $d2,$h2 # h[4]
751 +
752 + vmovd %rax#d,$H0
753 + vmovd %rdx#d,$H1
754 + vmovd $h0#d,$H2
755 + vmovd $h1#d,$H3
756 + vmovd $h2#d,$H4
757 + movl \$1,20($ctx) # set is_base2_26
758 +
759 + call __poly1305_init_avx
760 +
761 +.Lproceed_avx:
762 + mov %r15,$len
763 +
764 + mov 0(%rsp),%r15
765 +.cfi_restore %r15
766 + mov 8(%rsp),%r14
767 +.cfi_restore %r14
768 + mov 16(%rsp),%r13
769 +.cfi_restore %r13
770 + mov 24(%rsp),%r12
771 +.cfi_restore %r12
772 + mov 32(%rsp),%rbp
773 +.cfi_restore %rbp
774 + mov 40(%rsp),%rbx
775 +.cfi_restore %rbx
776 + lea 48(%rsp),%rax
777 + lea 48(%rsp),%rsp
778 +.cfi_adjust_cfa_offset -48
779 +.Lbase2_64_avx_epilogue:
780 + jmp .Ldo_avx
781 +.cfi_endproc
782 +
783 +.align 32
784 +.Leven_avx:
785 +.cfi_startproc
786 + vmovd 4*0($ctx),$H0 # load hash value
787 + vmovd 4*1($ctx),$H1
788 + vmovd 4*2($ctx),$H2
789 + vmovd 4*3($ctx),$H3
790 + vmovd 4*4($ctx),$H4
791 +
792 +.Ldo_avx:
793 +___
794 +$code.=<<___ if (!$win64);
795 + lea -0x58(%rsp),%r11
796 +.cfi_def_cfa %r11,0x60
797 + sub \$0x178,%rsp
798 +___
799 +$code.=<<___ if ($win64);
800 + lea -0xf8(%rsp),%r11
801 + sub \$0x218,%rsp
802 + vmovdqa %xmm6,0x50(%r11)
803 + vmovdqa %xmm7,0x60(%r11)
804 + vmovdqa %xmm8,0x70(%r11)
805 + vmovdqa %xmm9,0x80(%r11)
806 + vmovdqa %xmm10,0x90(%r11)
807 + vmovdqa %xmm11,0xa0(%r11)
808 + vmovdqa %xmm12,0xb0(%r11)
809 + vmovdqa %xmm13,0xc0(%r11)
810 + vmovdqa %xmm14,0xd0(%r11)
811 + vmovdqa %xmm15,0xe0(%r11)
812 +.Ldo_avx_body:
813 +___
814 +$code.=<<___;
815 + sub \$64,$len
816 + lea -32($inp),%rax
817 + cmovc %rax,$inp
818 +
819 + vmovdqu `16*3`($ctx),$D4 # preload r0^2
820 + lea `16*3+64`($ctx),$ctx # size optimization
821 + lea .Lconst(%rip),%rcx
822 +
823 + ################################################################
824 + # load input
825 + vmovdqu 16*2($inp),$T0
826 + vmovdqu 16*3($inp),$T1
827 + vmovdqa 64(%rcx),$MASK # .Lmask26
828 +
829 + vpsrldq \$6,$T0,$T2 # splat input
830 + vpsrldq \$6,$T1,$T3
831 + vpunpckhqdq $T1,$T0,$T4 # 4
832 + vpunpcklqdq $T1,$T0,$T0 # 0:1
833 + vpunpcklqdq $T3,$T2,$T3 # 2:3
834 +
835 + vpsrlq \$40,$T4,$T4 # 4
836 + vpsrlq \$26,$T0,$T1
837 + vpand $MASK,$T0,$T0 # 0
838 + vpsrlq \$4,$T3,$T2
839 + vpand $MASK,$T1,$T1 # 1
840 + vpsrlq \$30,$T3,$T3
841 + vpand $MASK,$T2,$T2 # 2
842 + vpand $MASK,$T3,$T3 # 3
843 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
844 +
845 + jbe .Lskip_loop_avx
846 +
847 + # expand and copy pre-calculated table to stack
848 + vmovdqu `16*1-64`($ctx),$D1
849 + vmovdqu `16*2-64`($ctx),$D2
850 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
851 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
852 + vmovdqa $D3,-0x90(%r11)
853 + vmovdqa $D0,0x00(%rsp)
854 + vpshufd \$0xEE,$D1,$D4
855 + vmovdqu `16*3-64`($ctx),$D0
856 + vpshufd \$0x44,$D1,$D1
857 + vmovdqa $D4,-0x80(%r11)
858 + vmovdqa $D1,0x10(%rsp)
859 + vpshufd \$0xEE,$D2,$D3
860 + vmovdqu `16*4-64`($ctx),$D1
861 + vpshufd \$0x44,$D2,$D2
862 + vmovdqa $D3,-0x70(%r11)
863 + vmovdqa $D2,0x20(%rsp)
864 + vpshufd \$0xEE,$D0,$D4
865 + vmovdqu `16*5-64`($ctx),$D2
866 + vpshufd \$0x44,$D0,$D0
867 + vmovdqa $D4,-0x60(%r11)
868 + vmovdqa $D0,0x30(%rsp)
869 + vpshufd \$0xEE,$D1,$D3
870 + vmovdqu `16*6-64`($ctx),$D0
871 + vpshufd \$0x44,$D1,$D1
872 + vmovdqa $D3,-0x50(%r11)
873 + vmovdqa $D1,0x40(%rsp)
874 + vpshufd \$0xEE,$D2,$D4
875 + vmovdqu `16*7-64`($ctx),$D1
876 + vpshufd \$0x44,$D2,$D2
877 + vmovdqa $D4,-0x40(%r11)
878 + vmovdqa $D2,0x50(%rsp)
879 + vpshufd \$0xEE,$D0,$D3
880 + vmovdqu `16*8-64`($ctx),$D2
881 + vpshufd \$0x44,$D0,$D0
882 + vmovdqa $D3,-0x30(%r11)
883 + vmovdqa $D0,0x60(%rsp)
884 + vpshufd \$0xEE,$D1,$D4
885 + vpshufd \$0x44,$D1,$D1
886 + vmovdqa $D4,-0x20(%r11)
887 + vmovdqa $D1,0x70(%rsp)
888 + vpshufd \$0xEE,$D2,$D3
889 + vmovdqa 0x00(%rsp),$D4 # preload r0^2
890 + vpshufd \$0x44,$D2,$D2
891 + vmovdqa $D3,-0x10(%r11)
892 + vmovdqa $D2,0x80(%rsp)
893 +
894 + jmp .Loop_avx
895 +
896 +.align 32
897 +.Loop_avx:
898 + ################################################################
899 + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
900 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
901 + # \___________________/
902 + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
903 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
904 + # \___________________/ \____________________/
905 + #
906 + # Note that we start with inp[2:3]*r^2. This is because it
907 + # doesn't depend on reduction in previous iteration.
908 + ################################################################
909 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
910 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
911 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
912 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
913 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
914 + #
915 + # though note that $Tx and $Hx are "reversed" in this section,
916 + # and $D4 is preloaded with r0^2...
917 +
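The d0..d4 formulas in the comment are the base-2^26 schoolbook product with the columns at 2^130 and above wrapped around times 5, which is exactly what the precomputed 5*r ("s") entries are for. A small Python model, checked against plain modular arithmetic (helper names are illustrative):

    P, MASK26 = (1 << 130) - 5, (1 << 26) - 1

    def mul_limbs26(h, r):
        # unreduced d0..d4 from the comment above; columns 5..8 wrap to 0..3
        # multiplied by 5 because 2^130 == 5 (mod 2^130 - 5)
        d = [0] * 5
        for i in range(5):
            for j in range(5):
                if i + j < 5:
                    d[i + j] += h[i] * r[j]
                else:
                    d[i + j - 5] += h[i] * (5 * r[j])
        return d

    to26 = lambda x: [(x >> (26 * i)) & MASK26 for i in range(5)]
    val  = lambda d: sum(x << (26 * i) for i, x in enumerate(d))
    h, r = 0x123456789abcdef0f00dfeed, 0x3141592653589793238462643
    assert val(mul_limbs26(to26(h), to26(r))) % P == (h * r) % P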
918 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0
919 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0
920 + vmovdqa $H2,0x20(%r11) # offload hash
921 + vpmuludq $T2,$D4,$D2 # d3 = h2*r0
922 + vmovdqa 0x10(%rsp),$H2 # r1^2
923 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0
924 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0
925 +
926 + vmovdqa $H0,0x00(%r11) #
927 + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
928 + vmovdqa $H1,0x10(%r11) #
929 + vpmuludq $T3,$H2,$H1 # h3*r1
930 + vpaddq $H0,$D0,$D0 # d0 += h4*s1
931 + vpaddq $H1,$D4,$D4 # d4 += h3*r1
932 + vmovdqa $H3,0x30(%r11) #
933 + vpmuludq $T2,$H2,$H0 # h2*r1
934 + vpmuludq $T1,$H2,$H1 # h1*r1
935 + vpaddq $H0,$D3,$D3 # d3 += h2*r1
936 + vmovdqa 0x30(%rsp),$H3 # r2^2
937 + vpaddq $H1,$D2,$D2 # d2 += h1*r1
938 + vmovdqa $H4,0x40(%r11) #
939 + vpmuludq $T0,$H2,$H2 # h0*r1
940 + vpmuludq $T2,$H3,$H0 # h2*r2
941 + vpaddq $H2,$D1,$D1 # d1 += h0*r1
942 +
943 + vmovdqa 0x40(%rsp),$H4 # s2^2
944 + vpaddq $H0,$D4,$D4 # d4 += h2*r2
945 + vpmuludq $T1,$H3,$H1 # h1*r2
946 + vpmuludq $T0,$H3,$H3 # h0*r2
947 + vpaddq $H1,$D3,$D3 # d3 += h1*r2
948 + vmovdqa 0x50(%rsp),$H2 # r3^2
949 + vpaddq $H3,$D2,$D2 # d2 += h0*r2
950 + vpmuludq $T4,$H4,$H0 # h4*s2
951 + vpmuludq $T3,$H4,$H4 # h3*s2
952 + vpaddq $H0,$D1,$D1 # d1 += h4*s2
953 + vmovdqa 0x60(%rsp),$H3 # s3^2
954 + vpaddq $H4,$D0,$D0 # d0 += h3*s2
955 +
956 + vmovdqa 0x80(%rsp),$H4 # s4^2
957 + vpmuludq $T1,$H2,$H1 # h1*r3
958 + vpmuludq $T0,$H2,$H2 # h0*r3
959 + vpaddq $H1,$D4,$D4 # d4 += h1*r3
960 + vpaddq $H2,$D3,$D3 # d3 += h0*r3
961 + vpmuludq $T4,$H3,$H0 # h4*s3
962 + vpmuludq $T3,$H3,$H1 # h3*s3
963 + vpaddq $H0,$D2,$D2 # d2 += h4*s3
964 + vmovdqu 16*0($inp),$H0 # load input
965 + vpaddq $H1,$D1,$D1 # d1 += h3*s3
966 + vpmuludq $T2,$H3,$H3 # h2*s3
967 + vpmuludq $T2,$H4,$T2 # h2*s4
968 + vpaddq $H3,$D0,$D0 # d0 += h2*s3
969 +
970 + vmovdqu 16*1($inp),$H1 #
971 + vpaddq $T2,$D1,$D1 # d1 += h2*s4
972 + vpmuludq $T3,$H4,$T3 # h3*s4
973 + vpmuludq $T4,$H4,$T4 # h4*s4
974 + vpsrldq \$6,$H0,$H2 # splat input
975 + vpaddq $T3,$D2,$D2 # d2 += h3*s4
976 + vpaddq $T4,$D3,$D3 # d3 += h4*s4
977 + vpsrldq \$6,$H1,$H3 #
978 + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
979 + vpmuludq $T1,$H4,$T0 # h1*s4
980 + vpunpckhqdq $H1,$H0,$H4 # 4
981 + vpaddq $T4,$D4,$D4 # d4 += h0*r4
982 + vmovdqa -0x90(%r11),$T4 # r0^4
983 + vpaddq $T0,$D0,$D0 # d0 += h1*s4
984 +
985 + vpunpcklqdq $H1,$H0,$H0 # 0:1
986 + vpunpcklqdq $H3,$H2,$H3 # 2:3
987 +
988 + #vpsrlq \$40,$H4,$H4 # 4
989 + vpsrldq \$`40/8`,$H4,$H4 # 4
990 + vpsrlq \$26,$H0,$H1
991 + vpand $MASK,$H0,$H0 # 0
992 + vpsrlq \$4,$H3,$H2
993 + vpand $MASK,$H1,$H1 # 1
994 + vpand 0(%rcx),$H4,$H4 # .Lmask24
995 + vpsrlq \$30,$H3,$H3
996 + vpand $MASK,$H2,$H2 # 2
997 + vpand $MASK,$H3,$H3 # 3
998 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always
999 +
1000 + vpaddq 0x00(%r11),$H0,$H0 # add hash value
1001 + vpaddq 0x10(%r11),$H1,$H1
1002 + vpaddq 0x20(%r11),$H2,$H2
1003 + vpaddq 0x30(%r11),$H3,$H3
1004 + vpaddq 0x40(%r11),$H4,$H4
1005 +
1006 + lea 16*2($inp),%rax
1007 + lea 16*4($inp),$inp
1008 + sub \$64,$len
1009 + cmovc %rax,$inp
1010 +
1011 + ################################################################
1012 + # Now we accumulate (inp[0:1]+hash)*r^4
1013 + ################################################################
1014 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1015 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1016 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1017 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1018 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1019 +
1020 + vpmuludq $H0,$T4,$T0 # h0*r0
1021 + vpmuludq $H1,$T4,$T1 # h1*r0
1022 + vpaddq $T0,$D0,$D0
1023 + vpaddq $T1,$D1,$D1
1024 + vmovdqa -0x80(%r11),$T2 # r1^4
1025 + vpmuludq $H2,$T4,$T0 # h2*r0
1026 + vpmuludq $H3,$T4,$T1 # h3*r0
1027 + vpaddq $T0,$D2,$D2
1028 + vpaddq $T1,$D3,$D3
1029 + vpmuludq $H4,$T4,$T4 # h4*r0
1030 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1031 + vpaddq $T4,$D4,$D4
1032 +
1033 + vpaddq $T0,$D0,$D0 # d0 += h4*s1
1034 + vpmuludq $H2,$T2,$T1 # h2*r1
1035 + vpmuludq $H3,$T2,$T0 # h3*r1
1036 + vpaddq $T1,$D3,$D3 # d3 += h2*r1
1037 + vmovdqa -0x60(%r11),$T3 # r2^4
1038 + vpaddq $T0,$D4,$D4 # d4 += h3*r1
1039 + vpmuludq $H1,$T2,$T1 # h1*r1
1040 + vpmuludq $H0,$T2,$T2 # h0*r1
1041 + vpaddq $T1,$D2,$D2 # d2 += h1*r1
1042 + vpaddq $T2,$D1,$D1 # d1 += h0*r1
1043 +
1044 + vmovdqa -0x50(%r11),$T4 # s2^4
1045 + vpmuludq $H2,$T3,$T0 # h2*r2
1046 + vpmuludq $H1,$T3,$T1 # h1*r2
1047 + vpaddq $T0,$D4,$D4 # d4 += h2*r2
1048 + vpaddq $T1,$D3,$D3 # d3 += h1*r2
1049 + vmovdqa -0x40(%r11),$T2 # r3^4
1050 + vpmuludq $H0,$T3,$T3 # h0*r2
1051 + vpmuludq $H4,$T4,$T0 # h4*s2
1052 + vpaddq $T3,$D2,$D2 # d2 += h0*r2
1053 + vpaddq $T0,$D1,$D1 # d1 += h4*s2
1054 + vmovdqa -0x30(%r11),$T3 # s3^4
1055 + vpmuludq $H3,$T4,$T4 # h3*s2
1056 + vpmuludq $H1,$T2,$T1 # h1*r3
1057 + vpaddq $T4,$D0,$D0 # d0 += h3*s2
1058 +
1059 + vmovdqa -0x10(%r11),$T4 # s4^4
1060 + vpaddq $T1,$D4,$D4 # d4 += h1*r3
1061 + vpmuludq $H0,$T2,$T2 # h0*r3
1062 + vpmuludq $H4,$T3,$T0 # h4*s3
1063 + vpaddq $T2,$D3,$D3 # d3 += h0*r3
1064 + vpaddq $T0,$D2,$D2 # d2 += h4*s3
1065 + vmovdqu 16*2($inp),$T0 # load input
1066 + vpmuludq $H3,$T3,$T2 # h3*s3
1067 + vpmuludq $H2,$T3,$T3 # h2*s3
1068 + vpaddq $T2,$D1,$D1 # d1 += h3*s3
1069 + vmovdqu 16*3($inp),$T1 #
1070 + vpaddq $T3,$D0,$D0 # d0 += h2*s3
1071 +
1072 + vpmuludq $H2,$T4,$H2 # h2*s4
1073 + vpmuludq $H3,$T4,$H3 # h3*s4
1074 + vpsrldq \$6,$T0,$T2 # splat input
1075 + vpaddq $H2,$D1,$D1 # d1 += h2*s4
1076 + vpmuludq $H4,$T4,$H4 # h4*s4
1077 + vpsrldq \$6,$T1,$T3 #
1078 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1079 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1080 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1081 + vpmuludq $H1,$T4,$H0
1082 + vpunpckhqdq $T1,$T0,$T4 # 4
1083 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1084 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1085 +
1086 + vpunpcklqdq $T1,$T0,$T0 # 0:1
1087 + vpunpcklqdq $T3,$T2,$T3 # 2:3
1088 +
1089 + #vpsrlq \$40,$T4,$T4 # 4
1090 + vpsrldq \$`40/8`,$T4,$T4 # 4
1091 + vpsrlq \$26,$T0,$T1
1092 + vmovdqa 0x00(%rsp),$D4 # preload r0^2
1093 + vpand $MASK,$T0,$T0 # 0
1094 + vpsrlq \$4,$T3,$T2
1095 + vpand $MASK,$T1,$T1 # 1
1096 + vpand 0(%rcx),$T4,$T4 # .Lmask24
1097 + vpsrlq \$30,$T3,$T3
1098 + vpand $MASK,$T2,$T2 # 2
1099 + vpand $MASK,$T3,$T3 # 3
1100 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1101 +
1102 + ################################################################
1103 + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1104 + # and P. Schwabe
1105 +
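The lazy reduction below only shortens the limbs, it does not produce a canonical value: each limb's overflow carries into the next one, and the carry out of the top limb re-enters limb 0 multiplied by 5 (the vpaddq / vpsllq 2 / vpaddq triple). A Python model of the same interleaved carry order:

    MASK26 = (1 << 26) - 1

    def lazy_reduce(h0, h1, h2, h3, h4):
        c3 = h3 >> 26; h3 &= MASK26; h4 += c3          # h3 -> h4
        c0 = h0 >> 26; h0 &= MASK26; h1 += c0          # h0 -> h1
        c4 = h4 >> 26; h4 &= MASK26                    # carry out of the top limb
        c1 = h1 >> 26; h1 &= MASK26; h2 += c1          # h1 -> h2
        h0 += c4 + (c4 << 2)                           # h4 -> h0, times 5
        c2 = h2 >> 26; h2 &= MASK26; h3 += c2          # h2 -> h3
        c0 = h0 >> 26; h0 &= MASK26; h1 += c0          # h0 -> h1
        c3 = h3 >> 26; h3 &= MASK26; h4 += c3          # h3 -> h4
        return h0, h1, h2, h3, h4

    val = lambda d: sum(x << (26 * i) for i, x in enumerate(d))
    P = (1 << 130) - 5
    d = (1 << 60, 3 << 59, 5 << 58, 7 << 57, 9 << 56)
    assert val(lazy_reduce(*d)) % P == val(d) % P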
1106 + vpsrlq \$26,$H3,$D3
1107 + vpand $MASK,$H3,$H3
1108 + vpaddq $D3,$H4,$H4 # h3 -> h4
1109 +
1110 + vpsrlq \$26,$H0,$D0
1111 + vpand $MASK,$H0,$H0
1112 + vpaddq $D0,$D1,$H1 # h0 -> h1
1113 +
1114 + vpsrlq \$26,$H4,$D0
1115 + vpand $MASK,$H4,$H4
1116 +
1117 + vpsrlq \$26,$H1,$D1
1118 + vpand $MASK,$H1,$H1
1119 + vpaddq $D1,$H2,$H2 # h1 -> h2
1120 +
1121 + vpaddq $D0,$H0,$H0
1122 + vpsllq \$2,$D0,$D0
1123 + vpaddq $D0,$H0,$H0 # h4 -> h0
1124 +
1125 + vpsrlq \$26,$H2,$D2
1126 + vpand $MASK,$H2,$H2
1127 + vpaddq $D2,$H3,$H3 # h2 -> h3
1128 +
1129 + vpsrlq \$26,$H0,$D0
1130 + vpand $MASK,$H0,$H0
1131 + vpaddq $D0,$H1,$H1 # h0 -> h1
1132 +
1133 + vpsrlq \$26,$H3,$D3
1134 + vpand $MASK,$H3,$H3
1135 + vpaddq $D3,$H4,$H4 # h3 -> h4
1136 +
1137 + ja .Loop_avx
1138 +
1139 +.Lskip_loop_avx:
1140 + ################################################################
1141 + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1142 +
1143 + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1144 + add \$32,$len
1145 + jnz .Long_tail_avx
1146 +
1147 + vpaddq $H2,$T2,$T2
1148 + vpaddq $H0,$T0,$T0
1149 + vpaddq $H1,$T1,$T1
1150 + vpaddq $H3,$T3,$T3
1151 + vpaddq $H4,$T4,$T4
1152 +
1153 +.Long_tail_avx:
1154 + vmovdqa $H2,0x20(%r11)
1155 + vmovdqa $H0,0x00(%r11)
1156 + vmovdqa $H1,0x10(%r11)
1157 + vmovdqa $H3,0x30(%r11)
1158 + vmovdqa $H4,0x40(%r11)
1159 +
1160 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1161 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1162 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1163 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1164 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1165 +
1166 + vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1167 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1168 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1169 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1170 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1171 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1172 +
1173 + vpmuludq $T3,$H2,$H0 # h3*r1
1174 + vpaddq $H0,$D4,$D4 # d4 += h3*r1
1175 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1176 + vpmuludq $T2,$H2,$H1 # h2*r1
1177 + vpaddq $H1,$D3,$D3 # d3 += h2*r1
1178 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1179 + vpmuludq $T1,$H2,$H0 # h1*r1
1180 + vpaddq $H0,$D2,$D2 # d2 += h1*r1
1181 + vpmuludq $T0,$H2,$H2 # h0*r1
1182 + vpaddq $H2,$D1,$D1 # d1 += h0*r1
1183 + vpmuludq $T4,$H3,$H3 # h4*s1
1184 + vpaddq $H3,$D0,$D0 # d0 += h4*s1
1185 +
1186 + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1187 + vpmuludq $T2,$H4,$H1 # h2*r2
1188 + vpaddq $H1,$D4,$D4 # d4 += h2*r2
1189 + vpmuludq $T1,$H4,$H0 # h1*r2
1190 + vpaddq $H0,$D3,$D3 # d3 += h1*r2
1191 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1192 + vpmuludq $T0,$H4,$H4 # h0*r2
1193 + vpaddq $H4,$D2,$D2 # d2 += h0*r2
1194 + vpmuludq $T4,$H2,$H1 # h4*s2
1195 + vpaddq $H1,$D1,$D1 # d1 += h4*s2
1196 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1197 + vpmuludq $T3,$H2,$H2 # h3*s2
1198 + vpaddq $H2,$D0,$D0 # d0 += h3*s2
1199 +
1200 + vpmuludq $T1,$H3,$H0 # h1*r3
1201 + vpaddq $H0,$D4,$D4 # d4 += h1*r3
1202 + vpmuludq $T0,$H3,$H3 # h0*r3
1203 + vpaddq $H3,$D3,$D3 # d3 += h0*r3
1204 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1205 + vpmuludq $T4,$H4,$H1 # h4*s3
1206 + vpaddq $H1,$D2,$D2 # d2 += h4*s3
1207 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1208 + vpmuludq $T3,$H4,$H0 # h3*s3
1209 + vpaddq $H0,$D1,$D1 # d1 += h3*s3
1210 + vpmuludq $T2,$H4,$H4 # h2*s3
1211 + vpaddq $H4,$D0,$D0 # d0 += h2*s3
1212 +
1213 + vpmuludq $T0,$H2,$H2 # h0*r4
1214 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1215 + vpmuludq $T4,$H3,$H1 # h4*s4
1216 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1217 + vpmuludq $T3,$H3,$H0 # h3*s4
1218 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1219 + vpmuludq $T2,$H3,$H1 # h2*s4
1220 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1221 + vpmuludq $T1,$H3,$H3 # h1*s4
1222 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1223 +
1224 + jz .Lshort_tail_avx
1225 +
1226 + vmovdqu 16*0($inp),$H0 # load input
1227 + vmovdqu 16*1($inp),$H1
1228 +
1229 + vpsrldq \$6,$H0,$H2 # splat input
1230 + vpsrldq \$6,$H1,$H3
1231 + vpunpckhqdq $H1,$H0,$H4 # 4
1232 + vpunpcklqdq $H1,$H0,$H0 # 0:1
1233 + vpunpcklqdq $H3,$H2,$H3 # 2:3
1234 +
1235 + vpsrlq \$40,$H4,$H4 # 4
1236 + vpsrlq \$26,$H0,$H1
1237 + vpand $MASK,$H0,$H0 # 0
1238 + vpsrlq \$4,$H3,$H2
1239 + vpand $MASK,$H1,$H1 # 1
1240 + vpsrlq \$30,$H3,$H3
1241 + vpand $MASK,$H2,$H2 # 2
1242 + vpand $MASK,$H3,$H3 # 3
1243 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1244 +
1245 + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1246 + vpaddq 0x00(%r11),$H0,$H0
1247 + vpaddq 0x10(%r11),$H1,$H1
1248 + vpaddq 0x20(%r11),$H2,$H2
1249 + vpaddq 0x30(%r11),$H3,$H3
1250 + vpaddq 0x40(%r11),$H4,$H4
1251 +
1252 + ################################################################
1253 + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1254 +
1255 + vpmuludq $H0,$T4,$T0 # h0*r0
1256 + vpaddq $T0,$D0,$D0 # d0 += h0*r0
1257 + vpmuludq $H1,$T4,$T1 # h1*r0
1258 + vpaddq $T1,$D1,$D1 # d1 += h1*r0
1259 + vpmuludq $H2,$T4,$T0 # h2*r0
1260 + vpaddq $T0,$D2,$D2 # d2 += h2*r0
1261 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1262 + vpmuludq $H3,$T4,$T1 # h3*r0
1263 + vpaddq $T1,$D3,$D3 # d3 += h3*r0
1264 + vpmuludq $H4,$T4,$T4 # h4*r0
1265 + vpaddq $T4,$D4,$D4 # d4 += h4*r0
1266 +
1267 + vpmuludq $H3,$T2,$T0 # h3*r1
1268 + vpaddq $T0,$D4,$D4 # d4 += h3*r1
1269 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1270 + vpmuludq $H2,$T2,$T1 # h2*r1
1271 + vpaddq $T1,$D3,$D3 # d3 += h2*r1
1272 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1273 + vpmuludq $H1,$T2,$T0 # h1*r1
1274 + vpaddq $T0,$D2,$D2 # d2 += h1*r1
1275 + vpmuludq $H0,$T2,$T2 # h0*r1
1276 + vpaddq $T2,$D1,$D1 # d1 += h0*r1
1277 + vpmuludq $H4,$T3,$T3 # h4*s1
1278 + vpaddq $T3,$D0,$D0 # d0 += h4*s1
1279 +
1280 + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1281 + vpmuludq $H2,$T4,$T1 # h2*r2
1282 + vpaddq $T1,$D4,$D4 # d4 += h2*r2
1283 + vpmuludq $H1,$T4,$T0 # h1*r2
1284 + vpaddq $T0,$D3,$D3 # d3 += h1*r2
1285 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1286 + vpmuludq $H0,$T4,$T4 # h0*r2
1287 + vpaddq $T4,$D2,$D2 # d2 += h0*r2
1288 + vpmuludq $H4,$T2,$T1 # h4*s2
1289 + vpaddq $T1,$D1,$D1 # d1 += h4*s2
1290 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1291 + vpmuludq $H3,$T2,$T2 # h3*s2
1292 + vpaddq $T2,$D0,$D0 # d0 += h3*s2
1293 +
1294 + vpmuludq $H1,$T3,$T0 # h1*r3
1295 + vpaddq $T0,$D4,$D4 # d4 += h1*r3
1296 + vpmuludq $H0,$T3,$T3 # h0*r3
1297 + vpaddq $T3,$D3,$D3 # d3 += h0*r3
1298 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1299 + vpmuludq $H4,$T4,$T1 # h4*s3
1300 + vpaddq $T1,$D2,$D2 # d2 += h4*s3
1301 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1302 + vpmuludq $H3,$T4,$T0 # h3*s3
1303 + vpaddq $T0,$D1,$D1 # d1 += h3*s3
1304 + vpmuludq $H2,$T4,$T4 # h2*s3
1305 + vpaddq $T4,$D0,$D0 # d0 += h2*s3
1306 +
1307 + vpmuludq $H0,$T2,$T2 # h0*r4
1308 + vpaddq $T2,$D4,$D4 # d4 += h0*r4
1309 + vpmuludq $H4,$T3,$T1 # h4*s4
1310 + vpaddq $T1,$D3,$D3 # d3 += h4*s4
1311 + vpmuludq $H3,$T3,$T0 # h3*s4
1312 + vpaddq $T0,$D2,$D2 # d2 += h3*s4
1313 + vpmuludq $H2,$T3,$T1 # h2*s4
1314 + vpaddq $T1,$D1,$D1 # d1 += h2*s4
1315 + vpmuludq $H1,$T3,$T3 # h1*s4
1316 + vpaddq $T3,$D0,$D0 # d0 += h1*s4
1317 +
1318 +.Lshort_tail_avx:
1319 + ################################################################
1320 + # horizontal addition
1321 +
1322 + vpsrldq \$8,$D4,$T4
1323 + vpsrldq \$8,$D3,$T3
1324 + vpsrldq \$8,$D1,$T1
1325 + vpsrldq \$8,$D0,$T0
1326 + vpsrldq \$8,$D2,$T2
1327 + vpaddq $T3,$D3,$D3
1328 + vpaddq $T4,$D4,$D4
1329 + vpaddq $T0,$D0,$D0
1330 + vpaddq $T1,$D1,$D1
1331 + vpaddq $T2,$D2,$D2
1332 +
1333 + ################################################################
1334 + # lazy reduction
1335 +
1336 + vpsrlq \$26,$D3,$H3
1337 + vpand $MASK,$D3,$D3
1338 + vpaddq $H3,$D4,$D4 # h3 -> h4
1339 +
1340 + vpsrlq \$26,$D0,$H0
1341 + vpand $MASK,$D0,$D0
1342 + vpaddq $H0,$D1,$D1 # h0 -> h1
1343 +
1344 + vpsrlq \$26,$D4,$H4
1345 + vpand $MASK,$D4,$D4
1346 +
1347 + vpsrlq \$26,$D1,$H1
1348 + vpand $MASK,$D1,$D1
1349 + vpaddq $H1,$D2,$D2 # h1 -> h2
1350 +
1351 + vpaddq $H4,$D0,$D0
1352 + vpsllq \$2,$H4,$H4
1353 + vpaddq $H4,$D0,$D0 # h4 -> h0
1354 +
1355 + vpsrlq \$26,$D2,$H2
1356 + vpand $MASK,$D2,$D2
1357 + vpaddq $H2,$D3,$D3 # h2 -> h3
1358 +
1359 + vpsrlq \$26,$D0,$H0
1360 + vpand $MASK,$D0,$D0
1361 + vpaddq $H0,$D1,$D1 # h0 -> h1
1362 +
1363 + vpsrlq \$26,$D3,$H3
1364 + vpand $MASK,$D3,$D3
1365 + vpaddq $H3,$D4,$D4 # h3 -> h4
1366 +
1367 + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1368 + vmovd $D1,`4*1-48-64`($ctx)
1369 + vmovd $D2,`4*2-48-64`($ctx)
1370 + vmovd $D3,`4*3-48-64`($ctx)
1371 + vmovd $D4,`4*4-48-64`($ctx)
1372 +___
1373 +$code.=<<___ if ($win64);
1374 + vmovdqa 0x50(%r11),%xmm6
1375 + vmovdqa 0x60(%r11),%xmm7
1376 + vmovdqa 0x70(%r11),%xmm8
1377 + vmovdqa 0x80(%r11),%xmm9
1378 + vmovdqa 0x90(%r11),%xmm10
1379 + vmovdqa 0xa0(%r11),%xmm11
1380 + vmovdqa 0xb0(%r11),%xmm12
1381 + vmovdqa 0xc0(%r11),%xmm13
1382 + vmovdqa 0xd0(%r11),%xmm14
1383 + vmovdqa 0xe0(%r11),%xmm15
1384 + lea 0xf8(%r11),%rsp
1385 +.Ldo_avx_epilogue:
1386 +___
1387 +$code.=<<___ if (!$win64);
1388 + lea 0x58(%r11),%rsp
1389 +.cfi_def_cfa %rsp,8
1390 +___
1391 +$code.=<<___;
1392 + vzeroupper
1393 + ret
1394 +.cfi_endproc
1395 +.size poly1305_blocks_avx,.-poly1305_blocks_avx
1396 +
1397 +.type poly1305_emit_avx,\@function,3
1398 +.align 32
1399 +poly1305_emit_avx:
1400 + cmpl \$0,20($ctx) # is_base2_26?
1401 + je .Lemit
1402 +
1403 + mov 0($ctx),%eax # load hash value base 2^26
1404 + mov 4($ctx),%ecx
1405 + mov 8($ctx),%r8d
1406 + mov 12($ctx),%r11d
1407 + mov 16($ctx),%r10d
1408 +
1409 + shl \$26,%rcx # base 2^26 -> base 2^64
1410 + mov %r8,%r9
1411 + shl \$52,%r8
1412 + add %rcx,%rax
1413 + shr \$12,%r9
1414 + add %rax,%r8 # h0
1415 + adc \$0,%r9
1416 +
1417 + shl \$14,%r11
1418 + mov %r10,%rax
1419 + shr \$24,%r10
1420 + add %r11,%r9
1421 + shl \$40,%rax
1422 + add %rax,%r9 # h1
1423 + adc \$0,%r10 # h2
1424 +
1425 + mov %r10,%rax # could be partially reduced, so reduce
1426 + mov %r10,%rcx
1427 + and \$3,%r10
1428 + shr \$2,%rax
1429 + and \$-4,%rcx
1430 + add %rcx,%rax
1431 + add %rax,%r8
1432 + adc \$0,%r9
1433 + adc \$0,%r10
1434 +
1435 + mov %r8,%rax
1436 + add \$5,%r8 # compare to modulus
1437 + mov %r9,%rcx
1438 + adc \$0,%r9
1439 + adc \$0,%r10
1440 + shr \$2,%r10 # did 130-bit value overflow?
1441 + cmovnz %r8,%rax
1442 + cmovnz %r9,%rcx
1443 +
1444 + add 0($nonce),%rax # accumulate nonce
1445 + adc 8($nonce),%rcx
1446 + mov %rax,0($mac) # write result
1447 + mov %rcx,8($mac)
1448 +
1449 + ret
1450 +.size poly1305_emit_avx,.-poly1305_emit_avx
1451 +___
1452 +
1453 +if ($avx>1) {
1454 +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1455 + map("%ymm$_",(0..15));
1456 +my $S4=$MASK;
1457 +
1458 +$code.=<<___;
1459 +.type poly1305_blocks_avx2,\@function,4
1460 +.align 32
1461 +poly1305_blocks_avx2:
1462 +.cfi_startproc
1463 + mov 20($ctx),%r8d # is_base2_26
1464 + cmp \$128,$len
1465 + jae .Lblocks_avx2
1466 + test %r8d,%r8d
1467 + jz .Lblocks
1468 +
1469 +.Lblocks_avx2:
1470 + and \$-16,$len
1471 + jz .Lno_data_avx2
1472 +
1473 + vzeroupper
1474 +
1475 + test %r8d,%r8d
1476 + jz .Lbase2_64_avx2
1477 +
1478 + test \$63,$len
1479 + jz .Leven_avx2
1480 +
1481 + push %rbx
1482 +.cfi_push %rbx
1483 + push %rbp
1484 +.cfi_push %rbp
1485 + push %r12
1486 +.cfi_push %r12
1487 + push %r13
1488 +.cfi_push %r13
1489 + push %r14
1490 +.cfi_push %r14
1491 + push %r15
1492 +.cfi_push %r15
1493 +.Lblocks_avx2_body:
1494 +
1495 + mov $len,%r15 # reassign $len
1496 +
1497 + mov 0($ctx),$d1 # load hash value
1498 + mov 8($ctx),$d2
1499 + mov 16($ctx),$h2#d
1500 +
1501 + mov 24($ctx),$r0 # load r
1502 + mov 32($ctx),$s1
1503 +
1504 + ################################# base 2^26 -> base 2^64
1505 + mov $d1#d,$h0#d
1506 + and \$`-1*(1<<31)`,$d1
1507 + mov $d2,$r1 # borrow $r1
1508 + mov $d2#d,$h1#d
1509 + and \$`-1*(1<<31)`,$d2
1510 +
1511 + shr \$6,$d1
1512 + shl \$52,$r1
1513 + add $d1,$h0
1514 + shr \$12,$h1
1515 + shr \$18,$d2
1516 + add $r1,$h0
1517 + adc $d2,$h1
1518 +
1519 + mov $h2,$d1
1520 + shl \$40,$d1
1521 + shr \$24,$h2
1522 + add $d1,$h1
1523 + adc \$0,$h2 # can be partially reduced...
1524 +
1525 + mov \$-4,$d2 # ... so reduce
1526 + mov $h2,$d1
1527 + and $h2,$d2
1528 + shr \$2,$d1
1529 + and \$3,$h2
1530 + add $d2,$d1 # =*5
1531 + add $d1,$h0
1532 + adc \$0,$h1
1533 + adc \$0,$h2
1534 +
1535 + mov $s1,$r1
1536 + mov $s1,%rax
1537 + shr \$2,$s1
1538 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
1539 +
1540 +.Lbase2_26_pre_avx2:
1541 + add 0($inp),$h0 # accumulate input
1542 + adc 8($inp),$h1
1543 + lea 16($inp),$inp
1544 + adc $padbit,$h2
1545 + sub \$16,%r15
1546 +
1547 + call __poly1305_block
1548 + mov $r1,%rax
1549 +
1550 + test \$63,%r15
1551 + jnz .Lbase2_26_pre_avx2
1552 +
1553 + test $padbit,$padbit # if $padbit is zero,
1554 + jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1555 +
1556 + ################################# base 2^64 -> base 2^26
1557 + mov $h0,%rax
1558 + mov $h0,%rdx
1559 + shr \$52,$h0
1560 + mov $h1,$r0
1561 + mov $h1,$r1
1562 + shr \$26,%rdx
1563 + and \$0x3ffffff,%rax # h[0]
1564 + shl \$12,$r0
1565 + and \$0x3ffffff,%rdx # h[1]
1566 + shr \$14,$h1
1567 + or $r0,$h0
1568 + shl \$24,$h2
1569 + and \$0x3ffffff,$h0 # h[2]
1570 + shr \$40,$r1
1571 + and \$0x3ffffff,$h1 # h[3]
1572 + or $r1,$h2 # h[4]
1573 +
1574 + test %r15,%r15
1575 + jz .Lstore_base2_26_avx2
1576 +
1577 + vmovd %rax#d,%x#$H0
1578 + vmovd %rdx#d,%x#$H1
1579 + vmovd $h0#d,%x#$H2
1580 + vmovd $h1#d,%x#$H3
1581 + vmovd $h2#d,%x#$H4
1582 + jmp .Lproceed_avx2
1583 +
1584 +.align 32
1585 +.Lstore_base2_64_avx2:
1586 + mov $h0,0($ctx)
1587 + mov $h1,8($ctx)
1588 + mov $h2,16($ctx) # note that is_base2_26 is zeroed
1589 + jmp .Ldone_avx2
1590 +
1591 +.align 16
1592 +.Lstore_base2_26_avx2:
1593 + mov %rax#d,0($ctx) # store hash value base 2^26
1594 + mov %rdx#d,4($ctx)
1595 + mov $h0#d,8($ctx)
1596 + mov $h1#d,12($ctx)
1597 + mov $h2#d,16($ctx)
1598 +.align 16
1599 +.Ldone_avx2:
1600 + mov 0(%rsp),%r15
1601 +.cfi_restore %r15
1602 + mov 8(%rsp),%r14
1603 +.cfi_restore %r14
1604 + mov 16(%rsp),%r13
1605 +.cfi_restore %r13
1606 + mov 24(%rsp),%r12
1607 +.cfi_restore %r12
1608 + mov 32(%rsp),%rbp
1609 +.cfi_restore %rbp
1610 + mov 40(%rsp),%rbx
1611 +.cfi_restore %rbx
1612 + lea 48(%rsp),%rsp
1613 +.cfi_adjust_cfa_offset -48
1614 +.Lno_data_avx2:
1615 +.Lblocks_avx2_epilogue:
1616 + ret
1617 +.cfi_endproc
1618 +
1619 +.align 32
1620 +.Lbase2_64_avx2:
1621 +.cfi_startproc
1622 + push %rbx
1623 +.cfi_push %rbx
1624 + push %rbp
1625 +.cfi_push %rbp
1626 + push %r12
1627 +.cfi_push %r12
1628 + push %r13
1629 +.cfi_push %r13
1630 + push %r14
1631 +.cfi_push %r14
1632 + push %r15
1633 +.cfi_push %r15
1634 +.Lbase2_64_avx2_body:
1635 +
1636 + mov $len,%r15 # reassign $len
1637 +
1638 + mov 24($ctx),$r0 # load r
1639 + mov 32($ctx),$s1
1640 +
1641 + mov 0($ctx),$h0 # load hash value
1642 + mov 8($ctx),$h1
1643 + mov 16($ctx),$h2#d
1644 +
1645 + mov $s1,$r1
1646 + mov $s1,%rax
1647 + shr \$2,$s1
1648 + add $r1,$s1 # s1 = r1 + (r1 >> 2)
1649 +
1650 + test \$63,$len
1651 + jz .Linit_avx2
1652 +
1653 +.Lbase2_64_pre_avx2:
1654 + add 0($inp),$h0 # accumulate input
1655 + adc 8($inp),$h1
1656 + lea 16($inp),$inp
1657 + adc $padbit,$h2
1658 + sub \$16,%r15
1659 +
1660 + call __poly1305_block
1661 + mov $r1,%rax
1662 +
1663 + test \$63,%r15
1664 + jnz .Lbase2_64_pre_avx2
1665 +
1666 +.Linit_avx2:
1667 + ################################# base 2^64 -> base 2^26
1668 + mov $h0,%rax
1669 + mov $h0,%rdx
1670 + shr \$52,$h0
1671 + mov $h1,$d1
1672 + mov $h1,$d2
1673 + shr \$26,%rdx
1674 + and \$0x3ffffff,%rax # h[0]
1675 + shl \$12,$d1
1676 + and \$0x3ffffff,%rdx # h[1]
1677 + shr \$14,$h1
1678 + or $d1,$h0
1679 + shl \$24,$h2
1680 + and \$0x3ffffff,$h0 # h[2]
1681 + shr \$40,$d2
1682 + and \$0x3ffffff,$h1 # h[3]
1683 + or $d2,$h2 # h[4]
1684 +
1685 + vmovd %rax#d,%x#$H0
1686 + vmovd %rdx#d,%x#$H1
1687 + vmovd $h0#d,%x#$H2
1688 + vmovd $h1#d,%x#$H3
1689 + vmovd $h2#d,%x#$H4
1690 + movl \$1,20($ctx) # set is_base2_26
1691 +
1692 + call __poly1305_init_avx
1693 +
1694 +.Lproceed_avx2:
1695 + mov %r15,$len # restore $len
1696 + mov OPENSSL_ia32cap_P+8(%rip),%r10d
1697 + mov \$`(1<<31|1<<30|1<<16)`,%r11d
1698 +
1699 + mov 0(%rsp),%r15
1700 +.cfi_restore %r15
1701 + mov 8(%rsp),%r14
1702 +.cfi_restore %r14
1703 + mov 16(%rsp),%r13
1704 +.cfi_restore %r13
1705 + mov 24(%rsp),%r12
1706 +.cfi_restore %r12
1707 + mov 32(%rsp),%rbp
1708 +.cfi_restore %rbp
1709 + mov 40(%rsp),%rbx
1710 +.cfi_restore %rbx
1711 + lea 48(%rsp),%rax
1712 + lea 48(%rsp),%rsp
1713 +.cfi_adjust_cfa_offset -48
1714 +.Lbase2_64_avx2_epilogue:
1715 + jmp .Ldo_avx2
1716 +.cfi_endproc
1717 +
1718 +.align 32
1719 +.Leven_avx2:
1720 +.cfi_startproc
1721 + mov OPENSSL_ia32cap_P+8(%rip),%r10d
1722 + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1723 + vmovd 4*1($ctx),%x#$H1
1724 + vmovd 4*2($ctx),%x#$H2
1725 + vmovd 4*3($ctx),%x#$H3
1726 + vmovd 4*4($ctx),%x#$H4
1727 +
1728 +.Ldo_avx2:
1729 +___
1730 +$code.=<<___ if ($avx>2);
1731 + cmp \$512,$len
1732 + jb .Lskip_avx512
1733 + and %r11d,%r10d
1734 + test \$`1<<16`,%r10d # check for AVX512F
1735 + jnz .Lblocks_avx512
1736 +.Lskip_avx512:
1737 +___
1738 +$code.=<<___ if (!$win64);
1739 + lea -8(%rsp),%r11
1740 +.cfi_def_cfa %r11,16
1741 + sub \$0x128,%rsp
1742 +___
1743 +$code.=<<___ if ($win64);
1744 + lea -0xf8(%rsp),%r11
1745 + sub \$0x1c8,%rsp
1746 + vmovdqa %xmm6,0x50(%r11)
1747 + vmovdqa %xmm7,0x60(%r11)
1748 + vmovdqa %xmm8,0x70(%r11)
1749 + vmovdqa %xmm9,0x80(%r11)
1750 + vmovdqa %xmm10,0x90(%r11)
1751 + vmovdqa %xmm11,0xa0(%r11)
1752 + vmovdqa %xmm12,0xb0(%r11)
1753 + vmovdqa %xmm13,0xc0(%r11)
1754 + vmovdqa %xmm14,0xd0(%r11)
1755 + vmovdqa %xmm15,0xe0(%r11)
1756 +.Ldo_avx2_body:
1757 +___
1758 +$code.=<<___;
1759 + lea .Lconst(%rip),%rcx
1760 + lea 48+64($ctx),$ctx # size optimization
1761 + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1762 +
1763 + # expand and copy pre-calculated table to stack
1764 + vmovdqu `16*0-64`($ctx),%x#$T2
1765 + and \$-512,%rsp
1766 + vmovdqu `16*1-64`($ctx),%x#$T3
1767 + vmovdqu `16*2-64`($ctx),%x#$T4
1768 + vmovdqu `16*3-64`($ctx),%x#$D0
1769 + vmovdqu `16*4-64`($ctx),%x#$D1
1770 + vmovdqu `16*5-64`($ctx),%x#$D2
1771 + lea 0x90(%rsp),%rax # size optimization
1772 + vmovdqu `16*6-64`($ctx),%x#$D3
1773 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1774 + vmovdqu `16*7-64`($ctx),%x#$D4
1775 + vpermd $T3,$T0,$T3
1776 + vmovdqu `16*8-64`($ctx),%x#$MASK
1777 + vpermd $T4,$T0,$T4
1778 + vmovdqa $T2,0x00(%rsp)
1779 + vpermd $D0,$T0,$D0
1780 + vmovdqa $T3,0x20-0x90(%rax)
1781 + vpermd $D1,$T0,$D1
1782 + vmovdqa $T4,0x40-0x90(%rax)
1783 + vpermd $D2,$T0,$D2
1784 + vmovdqa $D0,0x60-0x90(%rax)
1785 + vpermd $D3,$T0,$D3
1786 + vmovdqa $D1,0x80-0x90(%rax)
1787 + vpermd $D4,$T0,$D4
1788 + vmovdqa $D2,0xa0-0x90(%rax)
1789 + vpermd $MASK,$T0,$MASK
1790 + vmovdqa $D3,0xc0-0x90(%rax)
1791 + vmovdqa $D4,0xe0-0x90(%rax)
1792 + vmovdqa $MASK,0x100-0x90(%rax)
1793 + vmovdqa 64(%rcx),$MASK # .Lmask26
1794 +
1795 + ################################################################
1796 + # load input
1797 + vmovdqu 16*0($inp),%x#$T0
1798 + vmovdqu 16*1($inp),%x#$T1
1799 + vinserti128 \$1,16*2($inp),$T0,$T0
1800 + vinserti128 \$1,16*3($inp),$T1,$T1
1801 + lea 16*4($inp),$inp
1802 +
1803 + vpsrldq \$6,$T0,$T2 # splat input
1804 + vpsrldq \$6,$T1,$T3
1805 + vpunpckhqdq $T1,$T0,$T4 # 4
1806 + vpunpcklqdq $T3,$T2,$T2 # 2:3
1807 + vpunpcklqdq $T1,$T0,$T0 # 0:1
1808 +
1809 + vpsrlq \$30,$T2,$T3
1810 + vpsrlq \$4,$T2,$T2
1811 + vpsrlq \$26,$T0,$T1
1812 + vpsrlq \$40,$T4,$T4 # 4
1813 + vpand $MASK,$T2,$T2 # 2
1814 + vpand $MASK,$T0,$T0 # 0
1815 + vpand $MASK,$T1,$T1 # 1
1816 + vpand $MASK,$T3,$T3 # 3
1817 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1818 +
1819 + vpaddq $H2,$T2,$H2 # accumulate input
1820 + sub \$64,$len
1821 + jz .Ltail_avx2
1822 + jmp .Loop_avx2
1823 +
1824 +.align 32
1825 +.Loop_avx2:
1826 + ################################################################
1827 + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1828 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1829 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1830 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1831 + # \________/\__________/
1832 + ################################################################
1833 + #vpaddq $H2,$T2,$H2 # accumulate input
1834 + vpaddq $H0,$T0,$H0
1835 + vmovdqa `32*0`(%rsp),$T0 # r0^4
1836 + vpaddq $H1,$T1,$H1
1837 + vmovdqa `32*1`(%rsp),$T1 # r1^4
1838 + vpaddq $H3,$T3,$H3
1839 + vmovdqa `32*3`(%rsp),$T2 # r2^4
1840 + vpaddq $H4,$T4,$H4
1841 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1842 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1843 +
1844 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1845 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1846 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1847 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1848 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1849 + #
1850 + # however, as h2 is "chronologically" first one available pull
1851 + # corresponding operations up, so it's
1852 + #
1853 + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1854 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1855 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1856 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1857 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1858 +
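Setting the h2-first reordering aside, the schedule this comment and the .Ltail_avx2 one further down describe is: four lanes each absorb every fourth block and are multiplied by r^4 per round, and the final round multiplies the lanes by r^4, r^3, r^2 and r^1 respectively before they are summed. A Python model of that schedule against the serial definition (names illustrative; blocks are assumed already padded and a multiple of four):

    P = (1 << 130) - 5

    def poly1305_serial(h, blocks, r):
        for m in blocks:
            h = (h + m) * r % P
        return h

    def poly1305_4way(h, blocks, r):
        assert len(blocks) % 4 == 0
        blocks = [blocks[0] + h] + list(blocks[1:])     # hash joins the first block
        r4 = pow(r, 4, P)
        lanes = [0, 0, 0, 0]
        for q in range(0, len(blocks), 4):
            for k in range(4):
                lanes[k] = (lanes[k] * r4 + blocks[q + k]) % P
        # tail: lane k gets one last multiply by r^(4-k), then everything is summed
        return sum(lanes[k] * pow(r, 4 - k, P) for k in range(4)) % P

    blocks = [(1 << 128) | (i * 0x9e3779b97f4a7c15) for i in range(8)]
    assert poly1305_4way(7, blocks, 0x806d5400e52447c) == \
           poly1305_serial(7, blocks, 0x806d5400e52447c)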
1859 + vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1860 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1861 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1862 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1863 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1864 +
1865 + vpmuludq $H0,$T1,$T4 # h0*r1
1866 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1867 + vpaddq $T4,$D1,$D1 # d1 += h0*r1
1868 + vpaddq $H2,$D2,$D2 # d2 += h1*r1
1869 + vpmuludq $H3,$T1,$T4 # h3*r1
1870 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1871 + vpaddq $T4,$D4,$D4 # d4 += h3*r1
1872 + vpaddq $H2,$D0,$D0 # d0 += h4*s1
1873 + vmovdqa `32*4-0x90`(%rax),$T1 # s2
1874 +
1875 + vpmuludq $H0,$T0,$T4 # h0*r0
1876 + vpmuludq $H1,$T0,$H2 # h1*r0
1877 + vpaddq $T4,$D0,$D0 # d0 += h0*r0
1878 + vpaddq $H2,$D1,$D1 # d1 += h1*r0
1879 + vpmuludq $H3,$T0,$T4 # h3*r0
1880 + vpmuludq $H4,$T0,$H2 # h4*r0
1881 + vmovdqu 16*0($inp),%x#$T0 # load input
1882 + vpaddq $T4,$D3,$D3 # d3 += h3*r0
1883 + vpaddq $H2,$D4,$D4 # d4 += h4*r0
1884 + vinserti128 \$1,16*2($inp),$T0,$T0
1885 +
1886 + vpmuludq $H3,$T1,$T4 # h3*s2
1887 + vpmuludq $H4,$T1,$H2 # h4*s2
1888 + vmovdqu 16*1($inp),%x#$T1
1889 + vpaddq $T4,$D0,$D0 # d0 += h3*s2
1890 + vpaddq $H2,$D1,$D1 # d1 += h4*s2
1891 + vmovdqa `32*5-0x90`(%rax),$H2 # r3
1892 + vpmuludq $H1,$T2,$T4 # h1*r2
1893 + vpmuludq $H0,$T2,$T2 # h0*r2
1894 + vpaddq $T4,$D3,$D3 # d3 += h1*r2
1895 + vpaddq $T2,$D2,$D2 # d2 += h0*r2
1896 + vinserti128 \$1,16*3($inp),$T1,$T1
1897 + lea 16*4($inp),$inp
1898 +
1899 + vpmuludq $H1,$H2,$T4 # h1*r3
1900 + vpmuludq $H0,$H2,$H2 # h0*r3
1901 + vpsrldq \$6,$T0,$T2 # splat input
1902 + vpaddq $T4,$D4,$D4 # d4 += h1*r3
1903 + vpaddq $H2,$D3,$D3 # d3 += h0*r3
1904 + vpmuludq $H3,$T3,$T4 # h3*s3
1905 + vpmuludq $H4,$T3,$H2 # h4*s3
1906 + vpsrldq \$6,$T1,$T3
1907 + vpaddq $T4,$D1,$D1 # d1 += h3*s3
1908 + vpaddq $H2,$D2,$D2 # d2 += h4*s3
1909 + vpunpckhqdq $T1,$T0,$T4 # 4
1910 +
1911 + vpmuludq $H3,$S4,$H3 # h3*s4
1912 + vpmuludq $H4,$S4,$H4 # h4*s4
1913 + vpunpcklqdq $T1,$T0,$T0 # 0:1
1914 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
1915 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
1916 + vpunpcklqdq $T3,$T2,$T3 # 2:3
1917 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1918 + vpmuludq $H1,$S4,$H0 # h1*s4
1919 + vmovdqa 64(%rcx),$MASK # .Lmask26
1920 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1921 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1922 +
1923 + ################################################################
1924 + # lazy reduction (interleaved with tail of input splat)
1925 +
1926 + vpsrlq \$26,$H3,$D3
1927 + vpand $MASK,$H3,$H3
1928 + vpaddq $D3,$H4,$H4 # h3 -> h4
1929 +
1930 + vpsrlq \$26,$H0,$D0
1931 + vpand $MASK,$H0,$H0
1932 + vpaddq $D0,$D1,$H1 # h0 -> h1
1933 +
1934 + vpsrlq \$26,$H4,$D4
1935 + vpand $MASK,$H4,$H4
1936 +
1937 + vpsrlq \$4,$T3,$T2
1938 +
1939 + vpsrlq \$26,$H1,$D1
1940 + vpand $MASK,$H1,$H1
1941 + vpaddq $D1,$H2,$H2 # h1 -> h2
1942 +
1943 + vpaddq $D4,$H0,$H0
1944 + vpsllq \$2,$D4,$D4
1945 + vpaddq $D4,$H0,$H0 # h4 -> h0
1946 +
1947 + vpand $MASK,$T2,$T2 # 2
1948 + vpsrlq \$26,$T0,$T1
1949 +
1950 + vpsrlq \$26,$H2,$D2
1951 + vpand $MASK,$H2,$H2
1952 + vpaddq $D2,$H3,$H3 # h2 -> h3
1953 +
1954 + vpaddq $T2,$H2,$H2 # modulo-scheduled
1955 + vpsrlq \$30,$T3,$T3
1956 +
1957 + vpsrlq \$26,$H0,$D0
1958 + vpand $MASK,$H0,$H0
1959 + vpaddq $D0,$H1,$H1 # h0 -> h1
1960 +
1961 + vpsrlq \$40,$T4,$T4 # 4
1962 +
1963 + vpsrlq \$26,$H3,$D3
1964 + vpand $MASK,$H3,$H3
1965 + vpaddq $D3,$H4,$H4 # h3 -> h4
1966 +
1967 + vpand $MASK,$T0,$T0 # 0
1968 + vpand $MASK,$T1,$T1 # 1
1969 + vpand $MASK,$T3,$T3 # 3
1970 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1971 +
1972 + sub \$64,$len
1973 + jnz .Loop_avx2
1974 +
1975 + .byte 0x66,0x90
1976 +.Ltail_avx2:
1977 + ################################################################
1978 + # while above multiplications were by r^4 in all lanes, in last
1979 + # iteration we multiply least significant lane by r^4 and most
1980 + # significant one by r, so copy of above except that references
1981 + # to the precomputed table are displaced by 4...
1982 +
1983 + #vpaddq $H2,$T2,$H2 # accumulate input
1984 + vpaddq $H0,$T0,$H0
1985 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1986 + vpaddq $H1,$T1,$H1
1987 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1988 + vpaddq $H3,$T3,$H3
1989 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1990 + vpaddq $H4,$T4,$H4
1991 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1992 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1993 +
1994 + vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1995 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1996 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1997 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1998 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1999 +
2000 + vpmuludq $H0,$T1,$T4 # h0*r1
2001 + vpmuludq $H1,$T1,$H2 # h1*r1
2002 + vpaddq $T4,$D1,$D1 # d1 += h0*r1
2003 + vpaddq $H2,$D2,$D2 # d2 += h1*r1
2004 + vpmuludq $H3,$T1,$T4 # h3*r1
2005 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
2006 + vpaddq $T4,$D4,$D4 # d4 += h3*r1
2007 + vpaddq $H2,$D0,$D0 # d0 += h4*s1
2008 +
2009 + vpmuludq $H0,$T0,$T4 # h0*r0
2010 + vpmuludq $H1,$T0,$H2 # h1*r0
2011 + vpaddq $T4,$D0,$D0 # d0 += h0*r0
2012 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2013 + vpaddq $H2,$D1,$D1 # d1 += h1*r0
2014 + vpmuludq $H3,$T0,$T4 # h3*r0
2015 + vpmuludq $H4,$T0,$H2 # h4*r0
2016 + vpaddq $T4,$D3,$D3 # d3 += h3*r0
2017 + vpaddq $H2,$D4,$D4 # d4 += h4*r0
2018 +
2019 + vpmuludq $H3,$T1,$T4 # h3*s2
2020 + vpmuludq $H4,$T1,$H2 # h4*s2
2021 + vpaddq $T4,$D0,$D0 # d0 += h3*s2
2022 + vpaddq $H2,$D1,$D1 # d1 += h4*s2
2023 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2024 + vpmuludq $H1,$T2,$T4 # h1*r2
2025 + vpmuludq $H0,$T2,$T2 # h0*r2
2026 + vpaddq $T4,$D3,$D3 # d3 += h1*r2
2027 + vpaddq $T2,$D2,$D2 # d2 += h0*r2
2028 +
2029 + vpmuludq $H1,$H2,$T4 # h1*r3
2030 + vpmuludq $H0,$H2,$H2 # h0*r3
2031 + vpaddq $T4,$D4,$D4 # d4 += h1*r3
2032 + vpaddq $H2,$D3,$D3 # d3 += h0*r3
2033 + vpmuludq $H3,$T3,$T4 # h3*s3
2034 + vpmuludq $H4,$T3,$H2 # h4*s3
2035 + vpaddq $T4,$D1,$D1 # d1 += h3*s3
2036 + vpaddq $H2,$D2,$D2 # d2 += h4*s3
2037 +
2038 + vpmuludq $H3,$S4,$H3 # h3*s4
2039 + vpmuludq $H4,$S4,$H4 # h4*s4
2040 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
2041 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
2042 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2043 + vpmuludq $H1,$S4,$H0 # h1*s4
2044 + vmovdqa 64(%rcx),$MASK # .Lmask26
2045 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2046 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2047 +
2048 + ################################################################
2049 + # horizontal addition
2050 +
2051 + vpsrldq \$8,$D1,$T1
2052 + vpsrldq \$8,$H2,$T2
2053 + vpsrldq \$8,$H3,$T3
2054 + vpsrldq \$8,$H4,$T4
2055 + vpsrldq \$8,$H0,$T0
2056 + vpaddq $T1,$D1,$D1
2057 + vpaddq $T2,$H2,$H2
2058 + vpaddq $T3,$H3,$H3
2059 + vpaddq $T4,$H4,$H4
2060 + vpaddq $T0,$H0,$H0
2061 +
2062 + vpermq \$0x2,$H3,$T3
2063 + vpermq \$0x2,$H4,$T4
2064 + vpermq \$0x2,$H0,$T0
2065 + vpermq \$0x2,$D1,$T1
2066 + vpermq \$0x2,$H2,$T2
2067 + vpaddq $T3,$H3,$H3
2068 + vpaddq $T4,$H4,$H4
2069 + vpaddq $T0,$H0,$H0
2070 + vpaddq $T1,$D1,$D1
2071 + vpaddq $T2,$H2,$H2
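+	# (the vpsrldq \$8 step added the odd 64-bit lane of each 128-bit half
+	#  into the even one, and vpermq \$0x2 then folds the upper half in, so
+	#  lane 0 of every register now holds the sum of all four lanes)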
2072 +
2073 + ################################################################
2074 + # lazy reduction
2075 +
2076 + vpsrlq \$26,$H3,$D3
2077 + vpand $MASK,$H3,$H3
2078 + vpaddq $D3,$H4,$H4 # h3 -> h4
2079 +
2080 + vpsrlq \$26,$H0,$D0
2081 + vpand $MASK,$H0,$H0
2082 + vpaddq $D0,$D1,$H1 # h0 -> h1
2083 +
2084 + vpsrlq \$26,$H4,$D4
2085 + vpand $MASK,$H4,$H4
2086 +
2087 + vpsrlq \$26,$H1,$D1
2088 + vpand $MASK,$H1,$H1
2089 + vpaddq $D1,$H2,$H2 # h1 -> h2
2090 +
2091 + vpaddq $D4,$H0,$H0
2092 + vpsllq \$2,$D4,$D4
2093 + vpaddq $D4,$H0,$H0 # h4 -> h0
2094 +
2095 + vpsrlq \$26,$H2,$D2
2096 + vpand $MASK,$H2,$H2
2097 + vpaddq $D2,$H3,$H3 # h2 -> h3
2098 +
2099 + vpsrlq \$26,$H0,$D0
2100 + vpand $MASK,$H0,$H0
2101 + vpaddq $D0,$H1,$H1 # h0 -> h1
2102 +
2103 + vpsrlq \$26,$H3,$D3
2104 + vpand $MASK,$H3,$H3
2105 + vpaddq $D3,$H4,$H4 # h3 -> h4
2106 +
2107 +	vmovd	%x#$H0,`4*0-48-64`($ctx)	# save partially reduced
2108 + vmovd %x#$H1,`4*1-48-64`($ctx)
2109 + vmovd %x#$H2,`4*2-48-64`($ctx)
2110 + vmovd %x#$H3,`4*3-48-64`($ctx)
2111 + vmovd %x#$H4,`4*4-48-64`($ctx)
2112 +___
2113 +$code.=<<___ if ($win64);
2114 + vmovdqa 0x50(%r11),%xmm6
2115 + vmovdqa 0x60(%r11),%xmm7
2116 + vmovdqa 0x70(%r11),%xmm8
2117 + vmovdqa 0x80(%r11),%xmm9
2118 + vmovdqa 0x90(%r11),%xmm10
2119 + vmovdqa 0xa0(%r11),%xmm11
2120 + vmovdqa 0xb0(%r11),%xmm12
2121 + vmovdqa 0xc0(%r11),%xmm13
2122 + vmovdqa 0xd0(%r11),%xmm14
2123 + vmovdqa 0xe0(%r11),%xmm15
2124 + lea 0xf8(%r11),%rsp
2125 +.Ldo_avx2_epilogue:
2126 +___
2127 +$code.=<<___ if (!$win64);
2128 + lea 8(%r11),%rsp
2129 +.cfi_def_cfa %rsp,8
2130 +___
2131 +$code.=<<___;
2132 + vzeroupper
2133 + ret
2134 +.cfi_endproc
2135 +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2136 +___
2137 +#######################################################################
2138 +if ($avx>2) {
2139 +# On entry the input length is divisible by 64. But since the inner loop
2140 +# processes 128 bytes per iteration, lengths that are not divisible by
2141 +# 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2142 +# reason the stack layout is kept identical to poly1305_blocks_avx2. If
2143 +# not for this tail, we wouldn't even have to allocate a stack frame...
2144 +
2145 +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2146 +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2147 +my $PADBIT="%zmm30";
2148 +
2149 +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2150 +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2151 +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2152 +map(s/%y/%z/,($MASK));
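+# (the substitutions above only rewrite the Perl strings that hold the
+# register names, turning %ymmN into %zmmN, so the code templates below
+# emit 512-bit instructions without being duplicated)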
2153 +
2154 +$code.=<<___;
2155 +.type poly1305_blocks_avx512,\@function,4
2156 +.align 32
2157 +poly1305_blocks_avx512:
2158 +.cfi_startproc
2159 +.Lblocks_avx512:
2160 + mov \$15,%eax
2161 + kmovw %eax,%k2
2162 +___
2163 +$code.=<<___ if (!$win64);
2164 + lea -8(%rsp),%r11
2165 +.cfi_def_cfa %r11,16
2166 + sub \$0x128,%rsp
2167 +___
2168 +$code.=<<___ if ($win64);
2169 + lea -0xf8(%rsp),%r11
2170 + sub \$0x1c8,%rsp
2171 + vmovdqa %xmm6,0x50(%r11)
2172 + vmovdqa %xmm7,0x60(%r11)
2173 + vmovdqa %xmm8,0x70(%r11)
2174 + vmovdqa %xmm9,0x80(%r11)
2175 + vmovdqa %xmm10,0x90(%r11)
2176 + vmovdqa %xmm11,0xa0(%r11)
2177 + vmovdqa %xmm12,0xb0(%r11)
2178 + vmovdqa %xmm13,0xc0(%r11)
2179 + vmovdqa %xmm14,0xd0(%r11)
2180 + vmovdqa %xmm15,0xe0(%r11)
2181 +.Ldo_avx512_body:
2182 +___
2183 +$code.=<<___;
2184 + lea .Lconst(%rip),%rcx
2185 + lea 48+64($ctx),$ctx # size optimization
2186 + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2187 +
2188 + # expand pre-calculated table
2189 + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2190 + and \$-512,%rsp
2191 + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2192 + mov \$0x20,%rax
2193 + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2194 + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2195 + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2196 + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2197 + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2198 + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2199 + vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2200 + vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2201 + vpbroadcastq 64(%rcx),$MASK # .Lmask26
2202 + vpermd $D1,$T2,$R1
2203 + vpermd $T0,$T2,$S1
2204 + vpermd $D2,$T2,$R2
2205 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2206 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2207 + vpermd $T1,$T2,$S2
2208 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2209 + vpsrlq \$32,$R1,$T1
2210 + vpermd $D3,$T2,$R3
2211 + vmovdqa64 $S1,0x40(%rsp){%k2}
2212 + vpermd $T3,$T2,$S3
2213 + vpermd $D4,$T2,$R4
2214 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2215 + vpermd $T4,$T2,$S4
2216 + vmovdqa64 $S2,0x80(%rsp){%k2}
2217 + vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2218 + vmovdqa64 $S3,0xc0(%rsp){%k2}
2219 + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2220 + vmovdqa64 $S4,0x100(%rsp){%k2}
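+	# (%k2 was set to 0x000f above, so these masked stores write only the
+	#  low four 64-bit lanes of each power vector, one 256-bit row per
+	#  power, keeping the on-stack table in the layout .Ltail_avx2 expects)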
2221 +
2222 + ################################################################
2223 + # calculate 5th through 8th powers of the key
2224 + #
2225 + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2226 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2227 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2228 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2229 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
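+	# (the factor 5 in the terms above comes from the modulus: since
+	#  p = 2^130-5, we have 2^130 == 5 (mod p), and with 26-bit limbs any
+	#  product landing at limb position 5 or higher wraps around times 5,
+	#  which is what the precomputed s[] = 5*r[] values fold in)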
2230 +
2231 + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2232 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2233 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2234 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2235 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2236 + vpsrlq \$32,$R2,$T2
2237 +
2238 + vpmuludq $T1,$S4,$M0
2239 + vpmuludq $T1,$R0,$M1
2240 + vpmuludq $T1,$R1,$M2
2241 + vpmuludq $T1,$R2,$M3
2242 + vpmuludq $T1,$R3,$M4
2243 + vpsrlq \$32,$R3,$T3
2244 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2245 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2246 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2247 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2248 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2249 +
2250 + vpmuludq $T2,$S3,$M0
2251 + vpmuludq $T2,$S4,$M1
2252 + vpmuludq $T2,$R1,$M3
2253 + vpmuludq $T2,$R2,$M4
2254 + vpmuludq $T2,$R0,$M2
2255 + vpsrlq \$32,$R4,$T4
2256 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2257 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2258 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2259 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2260 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2261 +
2262 + vpmuludq $T3,$S2,$M0
2263 + vpmuludq $T3,$R0,$M3
2264 + vpmuludq $T3,$R1,$M4
2265 + vpmuludq $T3,$S3,$M1
2266 + vpmuludq $T3,$S4,$M2
2267 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2268 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2269 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2270 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2271 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2272 +
2273 + vpmuludq $T4,$S4,$M3
2274 + vpmuludq $T4,$R0,$M4
2275 + vpmuludq $T4,$S1,$M0
2276 + vpmuludq $T4,$S2,$M1
2277 + vpmuludq $T4,$S3,$M2
2278 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
2279 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0
2280 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
2281 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
2282 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
2283 +
2284 + ################################################################
2285 + # load input
2286 + vmovdqu64 16*0($inp),%z#$T3
2287 + vmovdqu64 16*4($inp),%z#$T4
2288 + lea 16*8($inp),$inp
2289 +
2290 + ################################################################
2291 + # lazy reduction
2292 +
2293 + vpsrlq \$26,$D3,$M3
2294 + vpandq $MASK,$D3,$D3
2295 + vpaddq $M3,$D4,$D4 # d3 -> d4
2296 +
2297 + vpsrlq \$26,$D0,$M0
2298 + vpandq $MASK,$D0,$D0
2299 + vpaddq $M0,$D1,$D1 # d0 -> d1
2300 +
2301 + vpsrlq \$26,$D4,$M4
2302 + vpandq $MASK,$D4,$D4
2303 +
2304 + vpsrlq \$26,$D1,$M1
2305 + vpandq $MASK,$D1,$D1
2306 + vpaddq $M1,$D2,$D2 # d1 -> d2
2307 +
2308 + vpaddq $M4,$D0,$D0
2309 + vpsllq \$2,$M4,$M4
2310 + vpaddq $M4,$D0,$D0 # d4 -> d0
2311 +
2312 + vpsrlq \$26,$D2,$M2
2313 + vpandq $MASK,$D2,$D2
2314 + vpaddq $M2,$D3,$D3 # d2 -> d3
2315 +
2316 + vpsrlq \$26,$D0,$M0
2317 + vpandq $MASK,$D0,$D0
2318 + vpaddq $M0,$D1,$D1 # d0 -> d1
2319 +
2320 + vpsrlq \$26,$D3,$M3
2321 + vpandq $MASK,$D3,$D3
2322 + vpaddq $M3,$D4,$D4 # d3 -> d4
2323 +
2324 + ################################################################
2325 + # at this point we have 14243444 in $R0-$S4 and 05060708 in
2326 + # $D0-$D4, ...
2327 +
2328 + vpunpcklqdq $T4,$T3,$T0 # transpose input
2329 + vpunpckhqdq $T4,$T3,$T4
2330 +
2331 + # ... since input 64-bit lanes are ordered as 73625140, we could
2332 + # "vperm" it to 76543210 (here and in each loop iteration), *or*
2333 + # we could just flow along, hence the goal for $R0-$S4 is
2334 + # 1858286838784888 ...
2335 +
2336 + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2337 + mov \$0x7777,%eax
2338 + kmovw %eax,%k1
2339 +
2340 + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2341 + vpermd $R1,$M0,$R1
2342 + vpermd $R2,$M0,$R2
2343 + vpermd $R3,$M0,$R3
2344 + vpermd $R4,$M0,$R4
2345 +
2346 + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2347 + vpermd $D1,$M0,${R1}{%k1}
2348 + vpermd $D2,$M0,${R2}{%k1}
2349 + vpermd $D3,$M0,${R3}{%k1}
2350 + vpermd $D4,$M0,${R4}{%k1}
2351 +
2352 + vpslld \$2,$R1,$S1 # *5
2353 + vpslld \$2,$R2,$S2
2354 + vpslld \$2,$R3,$S3
2355 + vpslld \$2,$R4,$S4
2356 + vpaddd $R1,$S1,$S1
2357 + vpaddd $R2,$S2,$S2
2358 + vpaddd $R3,$S3,$S3
2359 + vpaddd $R4,$S4,$S4
2360 +
2361 + vpbroadcastq 32(%rcx),$PADBIT # .L129
2362 +
2363 + vpsrlq \$52,$T0,$T2 # splat input
2364 + vpsllq \$12,$T4,$T3
2365 + vporq $T3,$T2,$T2
2366 + vpsrlq \$26,$T0,$T1
2367 + vpsrlq \$14,$T4,$T3
2368 + vpsrlq \$40,$T4,$T4 # 4
2369 + vpandq $MASK,$T2,$T2 # 2
2370 + vpandq $MASK,$T0,$T0 # 0
2371 + #vpandq $MASK,$T1,$T1 # 1
2372 + #vpandq $MASK,$T3,$T3 # 3
2373 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2374 +
2375 + vpaddq $H2,$T2,$H2 # accumulate input
2376 + sub \$192,$len
2377 + jbe .Ltail_avx512
2378 + jmp .Loop_avx512
2379 +
2380 +.align 32
2381 +.Loop_avx512:
2382 + ################################################################
2383 + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2384 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2385 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2386 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2387 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2388 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2389 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2390 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2391 + # \________/\___________/
2392 + ################################################################
2393 + #vpaddq $H2,$T2,$H2 # accumulate input
2394 +
2395 + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2396 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2397 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2398 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2399 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2400 + #
2401 +	# however, as h2 is "chronologically" the first one available, pull the
2402 +	# corresponding operations up, so it's
2403 + #
2404 + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2405 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2406 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2407 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2408 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
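+	# ("chronologically first" because the middle input limb has already
+	#  been folded into h2 before the loop is entered, while the other four
+	#  input limbs are still being split, so the h2 products can start
+	#  right away)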
2409 +
2410 + vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2411 + vpaddq $H0,$T0,$H0
2412 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2413 + vpandq $MASK,$T1,$T1 # 1
2414 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2415 + vpandq $MASK,$T3,$T3 # 3
2416 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2417 + vporq $PADBIT,$T4,$T4 # padbit, yes, always
2418 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2419 + vpaddq $H1,$T1,$H1 # accumulate input
2420 + vpaddq $H3,$T3,$H3
2421 + vpaddq $H4,$T4,$H4
2422 +
2423 + vmovdqu64 16*0($inp),$T3 # load input
2424 + vmovdqu64 16*4($inp),$T4
2425 + lea 16*8($inp),$inp
2426 + vpmuludq $H0,$R3,$M3
2427 + vpmuludq $H0,$R4,$M4
2428 + vpmuludq $H0,$R0,$M0
2429 + vpmuludq $H0,$R1,$M1
2430 + vpaddq $M3,$D3,$D3 # d3 += h0*r3
2431 + vpaddq $M4,$D4,$D4 # d4 += h0*r4
2432 + vpaddq $M0,$D0,$D0 # d0 += h0*r0
2433 + vpaddq $M1,$D1,$D1 # d1 += h0*r1
2434 +
2435 + vpmuludq $H1,$R2,$M3
2436 + vpmuludq $H1,$R3,$M4
2437 + vpmuludq $H1,$S4,$M0
2438 + vpmuludq $H0,$R2,$M2
2439 + vpaddq $M3,$D3,$D3 # d3 += h1*r2
2440 + vpaddq $M4,$D4,$D4 # d4 += h1*r3
2441 + vpaddq $M0,$D0,$D0 # d0 += h1*s4
2442 + vpaddq $M2,$D2,$D2 # d2 += h0*r2
2443 +
2444 + vpunpcklqdq $T4,$T3,$T0 # transpose input
2445 + vpunpckhqdq $T4,$T3,$T4
2446 +
2447 + vpmuludq $H3,$R0,$M3
2448 + vpmuludq $H3,$R1,$M4
2449 + vpmuludq $H1,$R0,$M1
2450 + vpmuludq $H1,$R1,$M2
2451 + vpaddq $M3,$D3,$D3 # d3 += h3*r0
2452 + vpaddq $M4,$D4,$D4 # d4 += h3*r1
2453 + vpaddq $M1,$D1,$D1 # d1 += h1*r0
2454 + vpaddq $M2,$D2,$D2 # d2 += h1*r1
2455 +
2456 + vpmuludq $H4,$S4,$M3
2457 + vpmuludq $H4,$R0,$M4
2458 + vpmuludq $H3,$S2,$M0
2459 + vpmuludq $H3,$S3,$M1
2460 + vpaddq $M3,$D3,$D3 # d3 += h4*s4
2461 + vpmuludq $H3,$S4,$M2
2462 + vpaddq $M4,$D4,$D4 # d4 += h4*r0
2463 + vpaddq $M0,$D0,$D0 # d0 += h3*s2
2464 + vpaddq $M1,$D1,$D1 # d1 += h3*s3
2465 + vpaddq $M2,$D2,$D2 # d2 += h3*s4
2466 +
2467 + vpmuludq $H4,$S1,$M0
2468 + vpmuludq $H4,$S2,$M1
2469 + vpmuludq $H4,$S3,$M2
2470 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2471 +	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
2472 +	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2473 +
2474 + ################################################################
2475 + # lazy reduction (interleaved with input splat)
2476 +
2477 + vpsrlq \$52,$T0,$T2 # splat input
2478 + vpsllq \$12,$T4,$T3
2479 +
2480 + vpsrlq \$26,$D3,$H3
2481 + vpandq $MASK,$D3,$D3
2482 + vpaddq $H3,$D4,$H4 # h3 -> h4
2483 +
2484 + vporq $T3,$T2,$T2
2485 +
2486 + vpsrlq \$26,$H0,$D0
2487 + vpandq $MASK,$H0,$H0
2488 + vpaddq $D0,$H1,$H1 # h0 -> h1
2489 +
2490 + vpandq $MASK,$T2,$T2 # 2
2491 +
2492 + vpsrlq \$26,$H4,$D4
2493 + vpandq $MASK,$H4,$H4
2494 +
2495 + vpsrlq \$26,$H1,$D1
2496 + vpandq $MASK,$H1,$H1
2497 + vpaddq $D1,$H2,$H2 # h1 -> h2
2498 +
2499 + vpaddq $D4,$H0,$H0
2500 + vpsllq \$2,$D4,$D4
2501 + vpaddq $D4,$H0,$H0 # h4 -> h0
2502 +
2503 + vpaddq $T2,$H2,$H2 # modulo-scheduled
2504 + vpsrlq \$26,$T0,$T1
2505 +
2506 + vpsrlq \$26,$H2,$D2
2507 + vpandq $MASK,$H2,$H2
2508 + vpaddq $D2,$D3,$H3 # h2 -> h3
2509 +
2510 + vpsrlq \$14,$T4,$T3
2511 +
2512 + vpsrlq \$26,$H0,$D0
2513 + vpandq $MASK,$H0,$H0
2514 + vpaddq $D0,$H1,$H1 # h0 -> h1
2515 +
2516 + vpsrlq \$40,$T4,$T4 # 4
2517 +
2518 + vpsrlq \$26,$H3,$D3
2519 + vpandq $MASK,$H3,$H3
2520 + vpaddq $D3,$H4,$H4 # h3 -> h4
2521 +
2522 + vpandq $MASK,$T0,$T0 # 0
2523 + #vpandq $MASK,$T1,$T1 # 1
2524 + #vpandq $MASK,$T3,$T3 # 3
2525 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2526 +
2527 + sub \$128,$len
2528 + ja .Loop_avx512
2529 +
2530 +.Ltail_avx512:
2531 + ################################################################
2532 +	# while the above multiplications were by r^8 in all lanes, in the last
2533 +	# iteration we multiply the least significant lane by r^8 and the most
2534 +	# significant one by r, which is why the table gets shifted...
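+	# (the vpsrlq \$32 below moves the per-lane power, kept in the high
+	#  dword of each 64-bit lane of the expanded table, into the low dword
+	#  that vpmuludq actually uses, so every lane is now multiplied by its
+	#  own power r^1..r^8 instead of by r^8)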
2535 +
2536 + vpsrlq \$32,$R0,$R0 # 0105020603070408
2537 + vpsrlq \$32,$R1,$R1
2538 + vpsrlq \$32,$R2,$R2
2539 + vpsrlq \$32,$S3,$S3
2540 + vpsrlq \$32,$S4,$S4
2541 + vpsrlq \$32,$R3,$R3
2542 + vpsrlq \$32,$R4,$R4
2543 + vpsrlq \$32,$S1,$S1
2544 + vpsrlq \$32,$S2,$S2
2545 +
2546 + ################################################################
2547 +	# load either the next or the last 64 bytes of input
2548 + lea ($inp,$len),$inp
2549 +
2550 + #vpaddq $H2,$T2,$H2 # accumulate input
2551 + vpaddq $H0,$T0,$H0
2552 +
2553 + vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2554 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2555 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2556 + vpandq $MASK,$T1,$T1 # 1
2557 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2558 + vpandq $MASK,$T3,$T3 # 3
2559 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2560 + vporq $PADBIT,$T4,$T4 # padbit, yes, always
2561 + vpaddq $H1,$T1,$H1 # accumulate input
2562 + vpaddq $H3,$T3,$H3
2563 + vpaddq $H4,$T4,$H4
2564 +
2565 + vmovdqu 16*0($inp),%x#$T0
2566 + vpmuludq $H0,$R3,$M3
2567 + vpmuludq $H0,$R4,$M4
2568 + vpmuludq $H0,$R0,$M0
2569 + vpmuludq $H0,$R1,$M1
2570 + vpaddq $M3,$D3,$D3 # d3 += h0*r3
2571 + vpaddq $M4,$D4,$D4 # d4 += h0*r4
2572 + vpaddq $M0,$D0,$D0 # d0 += h0*r0
2573 + vpaddq $M1,$D1,$D1 # d1 += h0*r1
2574 +
2575 + vmovdqu 16*1($inp),%x#$T1
2576 + vpmuludq $H1,$R2,$M3
2577 + vpmuludq $H1,$R3,$M4
2578 + vpmuludq $H1,$S4,$M0
2579 + vpmuludq $H0,$R2,$M2
2580 + vpaddq $M3,$D3,$D3 # d3 += h1*r2
2581 + vpaddq $M4,$D4,$D4 # d4 += h1*r3
2582 + vpaddq $M0,$D0,$D0 # d0 += h1*s4
2583 + vpaddq $M2,$D2,$D2 # d2 += h0*r2
2584 +
2585 + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2586 + vpmuludq $H3,$R0,$M3
2587 + vpmuludq $H3,$R1,$M4
2588 + vpmuludq $H1,$R0,$M1
2589 + vpmuludq $H1,$R1,$M2
2590 + vpaddq $M3,$D3,$D3 # d3 += h3*r0
2591 + vpaddq $M4,$D4,$D4 # d4 += h3*r1
2592 + vpaddq $M1,$D1,$D1 # d1 += h1*r0
2593 + vpaddq $M2,$D2,$D2 # d2 += h1*r1
2594 +
2595 + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2596 + vpmuludq $H4,$S4,$M3
2597 + vpmuludq $H4,$R0,$M4
2598 + vpmuludq $H3,$S2,$M0
2599 + vpmuludq $H3,$S3,$M1
2600 + vpmuludq $H3,$S4,$M2
2601 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2602 + vpaddq $M4,$D4,$D4 # d4 += h4*r0
2603 + vpaddq $M0,$D0,$D0 # d0 += h3*s2
2604 + vpaddq $M1,$D1,$D1 # d1 += h3*s3
2605 + vpaddq $M2,$D2,$D2 # d2 += h3*s4
2606 +
2607 + vpmuludq $H4,$S1,$M0
2608 + vpmuludq $H4,$S2,$M1
2609 + vpmuludq $H4,$S3,$M2
2610 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2611 +	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
2612 +	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2613 +
2614 + ################################################################
2615 + # horizontal addition
2616 +
2617 + mov \$1,%eax
2618 + vpermq \$0xb1,$H3,$D3
2619 + vpermq \$0xb1,$D4,$H4
2620 + vpermq \$0xb1,$H0,$D0
2621 + vpermq \$0xb1,$H1,$D1
2622 + vpermq \$0xb1,$H2,$D2
2623 + vpaddq $D3,$H3,$H3
2624 + vpaddq $D4,$H4,$H4
2625 + vpaddq $D0,$H0,$H0
2626 + vpaddq $D1,$H1,$H1
2627 + vpaddq $D2,$H2,$H2
2628 +
2629 + kmovw %eax,%k3
2630 + vpermq \$0x2,$H3,$D3
2631 + vpermq \$0x2,$H4,$D4
2632 + vpermq \$0x2,$H0,$D0
2633 + vpermq \$0x2,$H1,$D1
2634 + vpermq \$0x2,$H2,$D2
2635 + vpaddq $D3,$H3,$H3
2636 + vpaddq $D4,$H4,$H4
2637 + vpaddq $D0,$H0,$H0
2638 + vpaddq $D1,$H1,$H1
2639 + vpaddq $D2,$H2,$H2
2640 +
2641 + vextracti64x4 \$0x1,$H3,%y#$D3
2642 + vextracti64x4 \$0x1,$H4,%y#$D4
2643 + vextracti64x4 \$0x1,$H0,%y#$D0
2644 + vextracti64x4 \$0x1,$H1,%y#$D1
2645 + vextracti64x4 \$0x1,$H2,%y#$D2
2646 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2647 + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2648 + vpaddq $D0,$H0,${H0}{%k3}{z}
2649 + vpaddq $D1,$H1,${H1}{%k3}{z}
2650 + vpaddq $D2,$H2,${H2}{%k3}{z}
2651 +___
2652 +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2653 +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2654 +$code.=<<___;
2655 + ################################################################
2656 + # lazy reduction (interleaved with input splat)
2657 +
2658 + vpsrlq \$26,$H3,$D3
2659 + vpand $MASK,$H3,$H3
2660 + vpsrldq \$6,$T0,$T2 # splat input
2661 + vpsrldq \$6,$T1,$T3
2662 + vpunpckhqdq $T1,$T0,$T4 # 4
2663 + vpaddq $D3,$H4,$H4 # h3 -> h4
2664 +
2665 + vpsrlq \$26,$H0,$D0
2666 + vpand $MASK,$H0,$H0
2667 + vpunpcklqdq $T3,$T2,$T2 # 2:3
2668 + vpunpcklqdq $T1,$T0,$T0 # 0:1
2669 + vpaddq $D0,$H1,$H1 # h0 -> h1
2670 +
2671 + vpsrlq \$26,$H4,$D4
2672 + vpand $MASK,$H4,$H4
2673 +
2674 + vpsrlq \$26,$H1,$D1
2675 + vpand $MASK,$H1,$H1
2676 + vpsrlq \$30,$T2,$T3
2677 + vpsrlq \$4,$T2,$T2
2678 + vpaddq $D1,$H2,$H2 # h1 -> h2
2679 +
2680 + vpaddq $D4,$H0,$H0
2681 + vpsllq \$2,$D4,$D4
2682 + vpsrlq \$26,$T0,$T1
2683 + vpsrlq \$40,$T4,$T4 # 4
2684 + vpaddq $D4,$H0,$H0 # h4 -> h0
2685 +
2686 + vpsrlq \$26,$H2,$D2
2687 + vpand $MASK,$H2,$H2
2688 + vpand $MASK,$T2,$T2 # 2
2689 + vpand $MASK,$T0,$T0 # 0
2690 + vpaddq $D2,$H3,$H3 # h2 -> h3
2691 +
2692 + vpsrlq \$26,$H0,$D0
2693 + vpand $MASK,$H0,$H0
2694 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2695 + vpand $MASK,$T1,$T1 # 1
2696 + vpaddq $D0,$H1,$H1 # h0 -> h1
2697 +
2698 + vpsrlq \$26,$H3,$D3
2699 + vpand $MASK,$H3,$H3
2700 + vpand $MASK,$T3,$T3 # 3
2701 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2702 + vpaddq $D3,$H4,$H4 # h3 -> h4
2703 +
2704 + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2705 + add \$64,$len
2706 + jnz .Ltail_avx2
2707 +
2708 + vpsubq $T2,$H2,$H2 # undo input accumulation
2709 +	vmovd	%x#$H0,`4*0-48-64`($ctx)	# save partially reduced
2710 + vmovd %x#$H1,`4*1-48-64`($ctx)
2711 + vmovd %x#$H2,`4*2-48-64`($ctx)
2712 + vmovd %x#$H3,`4*3-48-64`($ctx)
2713 + vmovd %x#$H4,`4*4-48-64`($ctx)
2714 + vzeroall
2715 +___
2716 +$code.=<<___ if ($win64);
2717 + movdqa 0x50(%r11),%xmm6
2718 + movdqa 0x60(%r11),%xmm7
2719 + movdqa 0x70(%r11),%xmm8
2720 + movdqa 0x80(%r11),%xmm9
2721 + movdqa 0x90(%r11),%xmm10
2722 + movdqa 0xa0(%r11),%xmm11
2723 + movdqa 0xb0(%r11),%xmm12
2724 + movdqa 0xc0(%r11),%xmm13
2725 + movdqa 0xd0(%r11),%xmm14
2726 + movdqa 0xe0(%r11),%xmm15
2727 + lea 0xf8(%r11),%rsp
2728 +.Ldo_avx512_epilogue:
2729 +___
2730 +$code.=<<___ if (!$win64);
2731 + lea 8(%r11),%rsp
2732 +.cfi_def_cfa %rsp,8
2733 +___
2734 +$code.=<<___;
2735 + ret
2736 +.cfi_endproc
2737 +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2738 +___
2739 +if ($avx>3) {
2740 +########################################################################
2741 +# VPMADD52 version using 2^44 radix.
2742 +#
2743 +# One can argue that base 2^52 would be more natural. Well, even though
2744 +# some operations would be more natural, one has to recognize a couple of
2745 +# things. First, base 2^52 provides no advantage over base 2^44 in the
2746 +# number of multiply-and-accumulate operations. Secondly, it makes it
2747 +# impossible to pre-compute the multiples of 5 [referred to as s[]/sN in
2748 +# reference implementations], which means that more such operations
2749 +# would have to be performed in the inner loop, which in turn makes the
2750 +# critical path longer. In other words, even though base 2^44 reduction
2751 +# might look less elegant, the overall critical path is actually shorter...
2752 +
2753 +########################################################################
2754 +# The layout of the opaque area is as follows.
2755 +#
2756 +# unsigned __int64 h[3]; # current hash value base 2^44
2757 +# unsigned __int64 s[2]; # key value*20 base 2^44
2758 +# unsigned __int64 r[3]; # key value base 2^44
2759 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2760 +# # r^n positions reflect
2761 +# # placement in register, not
2762 +# # memory, R[3] is R[1]*20
2763 +
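+# Roughly, with h = h0 + h1*2^44 + h2*2^88 and r = r0 + r1*2^44 + r2*2^88,
+# the product h*r reduced mod p = 2^130-5 splits as
+#
+#	d0 = h0*r0 + 20*h1*r2 + 20*h2*r1	(2^132 = 4*2^130 == 20 mod p)
+#	d1 = h0*r1 +    h1*r0 + 20*h2*r2
+#	d2 = h0*r2 +    h1*r1 +    h2*r0
+#
+# which is why s[] holds the key limbs multiplied by 20. Carries are taken
+# out with the 44/44/42-bit masks, and whatever spills past bit 42 of d2
+# (bit 130 of the full value) is folded back into d0 multiplied by 5, the
+# add, shift-left-by-2, add pattern seen in every partial reduction below.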
2764 +$code.=<<___;
2765 +.type poly1305_init_base2_44,\@function,3
2766 +.align 32
2767 +poly1305_init_base2_44:
2768 + xor %rax,%rax
2769 + mov %rax,0($ctx) # initialize hash value
2770 + mov %rax,8($ctx)
2771 + mov %rax,16($ctx)
2772 +
2773 +.Linit_base2_44:
2774 + lea poly1305_blocks_vpmadd52(%rip),%r10
2775 + lea poly1305_emit_base2_44(%rip),%r11
2776 +
2777 + mov \$0x0ffffffc0fffffff,%rax
2778 + mov \$0x0ffffffc0ffffffc,%rcx
2779 + and 0($inp),%rax
2780 + mov \$0x00000fffffffffff,%r8
2781 + and 8($inp),%rcx
2782 + mov \$0x00000fffffffffff,%r9
2783 + and %rax,%r8
2784 + shrd \$44,%rcx,%rax
2785 + mov %r8,40($ctx) # r0
2786 + and %r9,%rax
2787 + shr \$24,%rcx
2788 + mov %rax,48($ctx) # r1
2789 + lea (%rax,%rax,4),%rax # *5
2790 + mov %rcx,56($ctx) # r2
2791 + shl \$2,%rax # magic <<2
2792 + lea (%rcx,%rcx,4),%rcx # *5
2793 + shl \$2,%rcx # magic <<2
2794 + mov %rax,24($ctx) # s1
2795 + mov %rcx,32($ctx) # s2
2796 + movq \$-1,64($ctx) # write impossible value
2797 +___
2798 +$code.=<<___ if ($flavour !~ /elf32/);
2799 + mov %r10,0(%rdx)
2800 + mov %r11,8(%rdx)
2801 +___
2802 +$code.=<<___ if ($flavour =~ /elf32/);
2803 + mov %r10d,0(%rdx)
2804 + mov %r11d,4(%rdx)
2805 +___
2806 +$code.=<<___;
2807 + mov \$1,%eax
2808 + ret
2809 +.size poly1305_init_base2_44,.-poly1305_init_base2_44
2810 +___
2811 +{
2812 +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2813 +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2814 +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2815 +
2816 +$code.=<<___;
2817 +.type poly1305_blocks_vpmadd52,\@function,4
2818 +.align 32
2819 +poly1305_blocks_vpmadd52:
2820 + shr \$4,$len
2821 + jz .Lno_data_vpmadd52 # too short
2822 +
2823 + shl \$40,$padbit
2824 + mov 64($ctx),%r8 # peek on power of the key
2825 +
2826 + # if powers of the key are not calculated yet, process up to 3
2827 + # blocks with this single-block subroutine, otherwise ensure that
2828 + # length is divisible by 2 blocks and pass the rest down to next
2829 + # subroutine...
2830 +
2831 + mov \$3,%rax
2832 + mov \$1,%r10
2833 + cmp \$4,$len # is input long
2834 + cmovae %r10,%rax
2835 + test %r8,%r8 # is power value impossible?
2836 + cmovns %r10,%rax
2837 +
2838 + and $len,%rax # is input of favourable length?
2839 + jz .Lblocks_vpmadd52_4x
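+	# (%rax ends up as the number of blocks to peel off with the loop below:
+	#  if the key powers are not computed yet and fewer than 4 blocks were
+	#  given, all of them ($len&3 = $len); otherwise just $len&1, so that
+	#  the remaining length is even for the 2x/4x code)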
2840 +
2841 + sub %rax,$len
2842 + mov \$7,%r10d
2843 + mov \$1,%r11d
2844 + kmovw %r10d,%k7
2845 + lea .L2_44_inp_permd(%rip),%r10
2846 + kmovw %r11d,%k1
2847 +
2848 + vmovq $padbit,%x#$PAD
2849 + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2850 + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2851 + vpermq \$0xcf,$PAD,$PAD
2852 + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2853 +
2854 + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2855 + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2856 + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2857 + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2858 +
2859 + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2860 + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2861 +
2862 + jmp .Loop_vpmadd52
2863 +
2864 +.align 32
2865 +.Loop_vpmadd52:
2866 + vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2867 + lea 16($inp),$inp
2868 +
2869 + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2870 + vpsrlvq $inp_shift,$T0,$T0
2871 + vpandq $reduc_mask,$T0,$T0
2872 + vporq $PAD,$T0,$T0
2873 +
2874 + vpaddq $T0,$Dlo,$Dlo # accumulate input
2875 +
2876 + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2877 + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2878 + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2879 +
2880 + vpxord $Dlo,$Dlo,$Dlo
2881 + vpxord $Dhi,$Dhi,$Dhi
2882 +
2883 + vpmadd52luq $r2r1r0,$H0,$Dlo
2884 + vpmadd52huq $r2r1r0,$H0,$Dhi
2885 +
2886 + vpmadd52luq $r1r0s2,$H1,$Dlo
2887 + vpmadd52huq $r1r0s2,$H1,$Dhi
2888 +
2889 + vpmadd52luq $r0s2s1,$H2,$Dlo
2890 + vpmadd52huq $r0s2s1,$H2,$Dhi
2891 +
2892 + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2893 + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2894 + vpandq $reduc_mask,$Dlo,$Dlo
2895 +
2896 + vpaddq $T0,$Dhi,$Dhi
2897 +
2898 + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2899 +
2900 + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2901 +
2902 + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2903 + vpandq $reduc_mask,$Dlo,$Dlo
2904 +
2905 + vpermq \$0b10010011,$T0,$T0
2906 +
2907 + vpaddq $T0,$Dlo,$Dlo
2908 +
2909 + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2910 +
2911 + vpaddq $T0,$Dlo,$Dlo
2912 + vpsllq \$2,$T0,$T0
2913 +
2914 + vpaddq $T0,$Dlo,$Dlo
2915 +
2916 + dec %rax # len-=16
2917 + jnz .Loop_vpmadd52
2918 +
2919 + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2920 +
2921 + test $len,$len
2922 + jnz .Lblocks_vpmadd52_4x
2923 +
2924 +.Lno_data_vpmadd52:
2925 + ret
2926 +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2927 +___
2928 +}
2929 +{
2930 +########################################################################
2931 +# As implied by its name, the 4x subroutine processes 4 blocks in parallel
2932 +# (but it also handles lengths of 4*n+2 blocks). It takes up to the 4th key
2933 +# power and operates on 256-bit %ymm registers.
2934 +
2935 +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2936 +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2937 +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2938 +
2939 +$code.=<<___;
2940 +.type poly1305_blocks_vpmadd52_4x,\@function,4
2941 +.align 32
2942 +poly1305_blocks_vpmadd52_4x:
2943 + shr \$4,$len
2944 + jz .Lno_data_vpmadd52_4x # too short
2945 +
2946 + shl \$40,$padbit
2947 + mov 64($ctx),%r8 # peek on power of the key
2948 +
2949 +.Lblocks_vpmadd52_4x:
2950 + vpbroadcastq $padbit,$PAD
2951 +
2952 + vmovdqa64 .Lx_mask44(%rip),$mask44
2953 + mov \$5,%eax
2954 + vmovdqa64 .Lx_mask42(%rip),$mask42
2955 + kmovw %eax,%k1 # used in 2x path
2956 +
2957 + test %r8,%r8 # is power value impossible?
2958 + js .Linit_vpmadd52 # if it is, then init R[4]
2959 +
2960 + vmovq 0($ctx),%x#$H0 # load current hash value
2961 + vmovq 8($ctx),%x#$H1
2962 + vmovq 16($ctx),%x#$H2
2963 +
2964 + test \$3,$len # is length 4*n+2?
2965 + jnz .Lblocks_vpmadd52_2x_do
2966 +
2967 +.Lblocks_vpmadd52_4x_do:
2968 + vpbroadcastq 64($ctx),$R0 # load 4th power of the key
2969 + vpbroadcastq 96($ctx),$R1
2970 + vpbroadcastq 128($ctx),$R2
2971 + vpbroadcastq 160($ctx),$S1
2972 +
2973 +.Lblocks_vpmadd52_4x_key_loaded:
2974 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
2975 + vpaddq $R2,$S2,$S2
2976 + vpsllq \$2,$S2,$S2
2977 +
2978 + test \$7,$len # is len 8*n?
2979 + jz .Lblocks_vpmadd52_8x
2980 +
2981 + vmovdqu64 16*0($inp),$T2 # load data
2982 + vmovdqu64 16*2($inp),$T3
2983 + lea 16*4($inp),$inp
2984 +
2985 + vpunpcklqdq $T3,$T2,$T1 # transpose data
2986 + vpunpckhqdq $T3,$T2,$T3
2987 +
2988 + # at this point 64-bit lanes are ordered as 3-1-2-0
2989 +
2990 + vpsrlq \$24,$T3,$T2 # splat the data
2991 + vporq $PAD,$T2,$T2
2992 + vpaddq $T2,$H2,$H2 # accumulate input
2993 + vpandq $mask44,$T1,$T0
2994 + vpsrlq \$44,$T1,$T1
2995 + vpsllq \$20,$T3,$T3
2996 + vporq $T3,$T1,$T1
2997 + vpandq $mask44,$T1,$T1
2998 +
2999 + sub \$4,$len
3000 + jz .Ltail_vpmadd52_4x
3001 + jmp .Loop_vpmadd52_4x
3002 + ud2
3003 +
3004 +.align 32
3005 +.Linit_vpmadd52:
3006 + vmovq 24($ctx),%x#$S1 # load key
3007 + vmovq 56($ctx),%x#$H2
3008 + vmovq 32($ctx),%x#$S2
3009 + vmovq 40($ctx),%x#$R0
3010 + vmovq 48($ctx),%x#$R1
3011 +
3012 + vmovdqa $R0,$H0
3013 + vmovdqa $R1,$H1
3014 + vmovdqa $H2,$R2
3015 +
3016 + mov \$2,%eax
3017 +
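+	# (two passes: the first squares r to obtain r^2; the lanes are then
+	#  rearranged so the R registers hold (r^1,r^2) pairs and the H
+	#  registers are r^2 broadcast, and the second pass produces r^3, r^4)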
3018 +.Lmul_init_vpmadd52:
3019 + vpxorq $D0lo,$D0lo,$D0lo
3020 + vpmadd52luq $H2,$S1,$D0lo
3021 + vpxorq $D0hi,$D0hi,$D0hi
3022 + vpmadd52huq $H2,$S1,$D0hi
3023 + vpxorq $D1lo,$D1lo,$D1lo
3024 + vpmadd52luq $H2,$S2,$D1lo
3025 + vpxorq $D1hi,$D1hi,$D1hi
3026 + vpmadd52huq $H2,$S2,$D1hi
3027 + vpxorq $D2lo,$D2lo,$D2lo
3028 + vpmadd52luq $H2,$R0,$D2lo
3029 + vpxorq $D2hi,$D2hi,$D2hi
3030 + vpmadd52huq $H2,$R0,$D2hi
3031 +
3032 + vpmadd52luq $H0,$R0,$D0lo
3033 + vpmadd52huq $H0,$R0,$D0hi
3034 + vpmadd52luq $H0,$R1,$D1lo
3035 + vpmadd52huq $H0,$R1,$D1hi
3036 + vpmadd52luq $H0,$R2,$D2lo
3037 + vpmadd52huq $H0,$R2,$D2hi
3038 +
3039 + vpmadd52luq $H1,$S2,$D0lo
3040 + vpmadd52huq $H1,$S2,$D0hi
3041 + vpmadd52luq $H1,$R0,$D1lo
3042 + vpmadd52huq $H1,$R0,$D1hi
3043 + vpmadd52luq $H1,$R1,$D2lo
3044 + vpmadd52huq $H1,$R1,$D2hi
3045 +
3046 + ################################################################
3047 + # partial reduction
3048 + vpsrlq \$44,$D0lo,$tmp
3049 + vpsllq \$8,$D0hi,$D0hi
3050 + vpandq $mask44,$D0lo,$H0
3051 + vpaddq $tmp,$D0hi,$D0hi
3052 +
3053 + vpaddq $D0hi,$D1lo,$D1lo
3054 +
3055 + vpsrlq \$44,$D1lo,$tmp
3056 + vpsllq \$8,$D1hi,$D1hi
3057 + vpandq $mask44,$D1lo,$H1
3058 + vpaddq $tmp,$D1hi,$D1hi
3059 +
3060 + vpaddq $D1hi,$D2lo,$D2lo
3061 +
3062 + vpsrlq \$42,$D2lo,$tmp
3063 + vpsllq \$10,$D2hi,$D2hi
3064 + vpandq $mask42,$D2lo,$H2
3065 + vpaddq $tmp,$D2hi,$D2hi
3066 +
3067 + vpaddq $D2hi,$H0,$H0
3068 + vpsllq \$2,$D2hi,$D2hi
3069 +
3070 + vpaddq $D2hi,$H0,$H0
3071 +
3072 + vpsrlq \$44,$H0,$tmp # additional step
3073 + vpandq $mask44,$H0,$H0
3074 +
3075 + vpaddq $tmp,$H1,$H1
3076 +
3077 + dec %eax
3078 + jz .Ldone_init_vpmadd52
3079 +
3080 + vpunpcklqdq $R1,$H1,$R1 # 1,2
3081 + vpbroadcastq %x#$H1,%x#$H1 # 2,2
3082 + vpunpcklqdq $R2,$H2,$R2
3083 + vpbroadcastq %x#$H2,%x#$H2
3084 + vpunpcklqdq $R0,$H0,$R0
3085 + vpbroadcastq %x#$H0,%x#$H0
3086 +
3087 + vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3088 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3089 + vpaddq $R1,$S1,$S1
3090 + vpaddq $R2,$S2,$S2
3091 + vpsllq \$2,$S1,$S1
3092 + vpsllq \$2,$S2,$S2
3093 +
3094 + jmp .Lmul_init_vpmadd52
3095 + ud2
3096 +
3097 +.align 32
3098 +.Ldone_init_vpmadd52:
3099 + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3100 + vinserti128 \$1,%x#$R2,$H2,$R2
3101 + vinserti128 \$1,%x#$R0,$H0,$R0
3102 +
3103 + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3104 + vpermq \$0b11011000,$R2,$R2
3105 + vpermq \$0b11011000,$R0,$R0
3106 +
3107 + vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3108 + vpaddq $R1,$S1,$S1
3109 + vpsllq \$2,$S1,$S1
3110 +
3111 + vmovq 0($ctx),%x#$H0 # load current hash value
3112 + vmovq 8($ctx),%x#$H1
3113 + vmovq 16($ctx),%x#$H2
3114 +
3115 + test \$3,$len # is length 4*n+2?
3116 + jnz .Ldone_init_vpmadd52_2x
3117 +
3118 + vmovdqu64 $R0,64($ctx) # save key powers
3119 + vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3120 + vmovdqu64 $R1,96($ctx)
3121 + vpbroadcastq %x#$R1,$R1
3122 + vmovdqu64 $R2,128($ctx)
3123 + vpbroadcastq %x#$R2,$R2
3124 + vmovdqu64 $S1,160($ctx)
3125 + vpbroadcastq %x#$S1,$S1
3126 +
3127 + jmp .Lblocks_vpmadd52_4x_key_loaded
3128 + ud2
3129 +
3130 +.align 32
3131 +.Ldone_init_vpmadd52_2x:
3132 + vmovdqu64 $R0,64($ctx) # save key powers
3133 + vpsrldq \$8,$R0,$R0 # 0-1-0-2
3134 + vmovdqu64 $R1,96($ctx)
3135 + vpsrldq \$8,$R1,$R1
3136 + vmovdqu64 $R2,128($ctx)
3137 + vpsrldq \$8,$R2,$R2
3138 + vmovdqu64 $S1,160($ctx)
3139 + vpsrldq \$8,$S1,$S1
3140 + jmp .Lblocks_vpmadd52_2x_key_loaded
3141 + ud2
3142 +
3143 +.align 32
3144 +.Lblocks_vpmadd52_2x_do:
3145 + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3146 + vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3147 + vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3148 + vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3149 +
3150 +.Lblocks_vpmadd52_2x_key_loaded:
3151 + vmovdqu64 16*0($inp),$T2 # load data
3152 + vpxorq $T3,$T3,$T3
3153 + lea 16*2($inp),$inp
3154 +
3155 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3156 + vpunpckhqdq $T3,$T2,$T3
3157 +
3158 + # at this point 64-bit lanes are ordered as x-1-x-0
3159 +
3160 + vpsrlq \$24,$T3,$T2 # splat the data
3161 + vporq $PAD,$T2,$T2
3162 + vpaddq $T2,$H2,$H2 # accumulate input
3163 + vpandq $mask44,$T1,$T0
3164 + vpsrlq \$44,$T1,$T1
3165 + vpsllq \$20,$T3,$T3
3166 + vporq $T3,$T1,$T1
3167 + vpandq $mask44,$T1,$T1
3168 +
3169 + jmp .Ltail_vpmadd52_2x
3170 + ud2
3171 +
3172 +.align 32
3173 +.Loop_vpmadd52_4x:
3174 + #vpaddq $T2,$H2,$H2 # accumulate input
3175 + vpaddq $T0,$H0,$H0
3176 + vpaddq $T1,$H1,$H1
3177 +
3178 + vpxorq $D0lo,$D0lo,$D0lo
3179 + vpmadd52luq $H2,$S1,$D0lo
3180 + vpxorq $D0hi,$D0hi,$D0hi
3181 + vpmadd52huq $H2,$S1,$D0hi
3182 + vpxorq $D1lo,$D1lo,$D1lo
3183 + vpmadd52luq $H2,$S2,$D1lo
3184 + vpxorq $D1hi,$D1hi,$D1hi
3185 + vpmadd52huq $H2,$S2,$D1hi
3186 + vpxorq $D2lo,$D2lo,$D2lo
3187 + vpmadd52luq $H2,$R0,$D2lo
3188 + vpxorq $D2hi,$D2hi,$D2hi
3189 + vpmadd52huq $H2,$R0,$D2hi
3190 +
3191 + vmovdqu64 16*0($inp),$T2 # load data
3192 + vmovdqu64 16*2($inp),$T3
3193 + lea 16*4($inp),$inp
3194 + vpmadd52luq $H0,$R0,$D0lo
3195 + vpmadd52huq $H0,$R0,$D0hi
3196 + vpmadd52luq $H0,$R1,$D1lo
3197 + vpmadd52huq $H0,$R1,$D1hi
3198 + vpmadd52luq $H0,$R2,$D2lo
3199 + vpmadd52huq $H0,$R2,$D2hi
3200 +
3201 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3202 + vpunpckhqdq $T3,$T2,$T3
3203 + vpmadd52luq $H1,$S2,$D0lo
3204 + vpmadd52huq $H1,$S2,$D0hi
3205 + vpmadd52luq $H1,$R0,$D1lo
3206 + vpmadd52huq $H1,$R0,$D1hi
3207 + vpmadd52luq $H1,$R1,$D2lo
3208 + vpmadd52huq $H1,$R1,$D2hi
3209 +
3210 + ################################################################
3211 + # partial reduction (interleaved with data splat)
3212 + vpsrlq \$44,$D0lo,$tmp
3213 + vpsllq \$8,$D0hi,$D0hi
3214 + vpandq $mask44,$D0lo,$H0
3215 + vpaddq $tmp,$D0hi,$D0hi
3216 +
3217 + vpsrlq \$24,$T3,$T2
3218 + vporq $PAD,$T2,$T2
3219 + vpaddq $D0hi,$D1lo,$D1lo
3220 +
3221 + vpsrlq \$44,$D1lo,$tmp
3222 + vpsllq \$8,$D1hi,$D1hi
3223 + vpandq $mask44,$D1lo,$H1
3224 + vpaddq $tmp,$D1hi,$D1hi
3225 +
3226 + vpandq $mask44,$T1,$T0
3227 + vpsrlq \$44,$T1,$T1
3228 + vpsllq \$20,$T3,$T3
3229 + vpaddq $D1hi,$D2lo,$D2lo
3230 +
3231 + vpsrlq \$42,$D2lo,$tmp
3232 + vpsllq \$10,$D2hi,$D2hi
3233 + vpandq $mask42,$D2lo,$H2
3234 + vpaddq $tmp,$D2hi,$D2hi
3235 +
3236 + vpaddq $T2,$H2,$H2 # accumulate input
3237 + vpaddq $D2hi,$H0,$H0
3238 + vpsllq \$2,$D2hi,$D2hi
3239 +
3240 + vpaddq $D2hi,$H0,$H0
3241 + vporq $T3,$T1,$T1
3242 + vpandq $mask44,$T1,$T1
3243 +
3244 + vpsrlq \$44,$H0,$tmp # additional step
3245 + vpandq $mask44,$H0,$H0
3246 +
3247 + vpaddq $tmp,$H1,$H1
3248 +
3249 + sub \$4,$len # len-=64
3250 + jnz .Loop_vpmadd52_4x
3251 +
3252 +.Ltail_vpmadd52_4x:
3253 + vmovdqu64 128($ctx),$R2 # load all key powers
3254 + vmovdqu64 160($ctx),$S1
3255 + vmovdqu64 64($ctx),$R0
3256 + vmovdqu64 96($ctx),$R1
3257 +
3258 +.Ltail_vpmadd52_2x:
3259 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3260 + vpaddq $R2,$S2,$S2
3261 + vpsllq \$2,$S2,$S2
3262 +
3263 + #vpaddq $T2,$H2,$H2 # accumulate input
3264 + vpaddq $T0,$H0,$H0
3265 + vpaddq $T1,$H1,$H1
3266 +
3267 + vpxorq $D0lo,$D0lo,$D0lo
3268 + vpmadd52luq $H2,$S1,$D0lo
3269 + vpxorq $D0hi,$D0hi,$D0hi
3270 + vpmadd52huq $H2,$S1,$D0hi
3271 + vpxorq $D1lo,$D1lo,$D1lo
3272 + vpmadd52luq $H2,$S2,$D1lo
3273 + vpxorq $D1hi,$D1hi,$D1hi
3274 + vpmadd52huq $H2,$S2,$D1hi
3275 + vpxorq $D2lo,$D2lo,$D2lo
3276 + vpmadd52luq $H2,$R0,$D2lo
3277 + vpxorq $D2hi,$D2hi,$D2hi
3278 + vpmadd52huq $H2,$R0,$D2hi
3279 +
3280 + vpmadd52luq $H0,$R0,$D0lo
3281 + vpmadd52huq $H0,$R0,$D0hi
3282 + vpmadd52luq $H0,$R1,$D1lo
3283 + vpmadd52huq $H0,$R1,$D1hi
3284 + vpmadd52luq $H0,$R2,$D2lo
3285 + vpmadd52huq $H0,$R2,$D2hi
3286 +
3287 + vpmadd52luq $H1,$S2,$D0lo
3288 + vpmadd52huq $H1,$S2,$D0hi
3289 + vpmadd52luq $H1,$R0,$D1lo
3290 + vpmadd52huq $H1,$R0,$D1hi
3291 + vpmadd52luq $H1,$R1,$D2lo
3292 + vpmadd52huq $H1,$R1,$D2hi
3293 +
3294 + ################################################################
3295 + # horizontal addition
3296 +
3297 + mov \$1,%eax
3298 + kmovw %eax,%k1
3299 + vpsrldq \$8,$D0lo,$T0
3300 + vpsrldq \$8,$D0hi,$H0
3301 + vpsrldq \$8,$D1lo,$T1
3302 + vpsrldq \$8,$D1hi,$H1
3303 + vpaddq $T0,$D0lo,$D0lo
3304 + vpaddq $H0,$D0hi,$D0hi
3305 + vpsrldq \$8,$D2lo,$T2
3306 + vpsrldq \$8,$D2hi,$H2
3307 + vpaddq $T1,$D1lo,$D1lo
3308 + vpaddq $H1,$D1hi,$D1hi
3309 + vpermq \$0x2,$D0lo,$T0
3310 + vpermq \$0x2,$D0hi,$H0
3311 + vpaddq $T2,$D2lo,$D2lo
3312 + vpaddq $H2,$D2hi,$D2hi
3313 +
3314 + vpermq \$0x2,$D1lo,$T1
3315 + vpermq \$0x2,$D1hi,$H1
3316 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3317 + vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3318 + vpermq \$0x2,$D2lo,$T2
3319 + vpermq \$0x2,$D2hi,$H2
3320 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3321 + vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3322 + vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3323 + vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3324 +
3325 + ################################################################
3326 + # partial reduction
3327 + vpsrlq \$44,$D0lo,$tmp
3328 + vpsllq \$8,$D0hi,$D0hi
3329 + vpandq $mask44,$D0lo,$H0
3330 + vpaddq $tmp,$D0hi,$D0hi
3331 +
3332 + vpaddq $D0hi,$D1lo,$D1lo
3333 +
3334 + vpsrlq \$44,$D1lo,$tmp
3335 + vpsllq \$8,$D1hi,$D1hi
3336 + vpandq $mask44,$D1lo,$H1
3337 + vpaddq $tmp,$D1hi,$D1hi
3338 +
3339 + vpaddq $D1hi,$D2lo,$D2lo
3340 +
3341 + vpsrlq \$42,$D2lo,$tmp
3342 + vpsllq \$10,$D2hi,$D2hi
3343 + vpandq $mask42,$D2lo,$H2
3344 + vpaddq $tmp,$D2hi,$D2hi
3345 +
3346 + vpaddq $D2hi,$H0,$H0
3347 + vpsllq \$2,$D2hi,$D2hi
3348 +
3349 + vpaddq $D2hi,$H0,$H0
3350 +
3351 + vpsrlq \$44,$H0,$tmp # additional step
3352 + vpandq $mask44,$H0,$H0
3353 +
3354 + vpaddq $tmp,$H1,$H1
3355 + # at this point $len is
3356 + # either 4*n+2 or 0...
3357 + sub \$2,$len # len-=32
3358 + ja .Lblocks_vpmadd52_4x_do
3359 +
3360 + vmovq %x#$H0,0($ctx)
3361 + vmovq %x#$H1,8($ctx)
3362 + vmovq %x#$H2,16($ctx)
3363 + vzeroall
3364 +
3365 +.Lno_data_vpmadd52_4x:
3366 + ret
3367 +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3368 +___
3369 +}
3370 +{
3371 +########################################################################
3372 +# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
3373 +# This is an intermediate version, as it is used only when the input
3374 +# length is 8*n, 8*n+1 or 8*n+2...
3375 +
3376 +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3377 +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3378 +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3379 +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3380 +
3381 +$code.=<<___;
3382 +.type poly1305_blocks_vpmadd52_8x,\@function,4
3383 +.align 32
3384 +poly1305_blocks_vpmadd52_8x:
3385 + shr \$4,$len
3386 + jz .Lno_data_vpmadd52_8x # too short
3387 +
3388 + shl \$40,$padbit
3389 + mov 64($ctx),%r8 # peek on power of the key
3390 +
3391 + vmovdqa64 .Lx_mask44(%rip),$mask44
3392 + vmovdqa64 .Lx_mask42(%rip),$mask42
3393 +
3394 + test %r8,%r8 # is power value impossible?
3395 + js .Linit_vpmadd52 # if it is, then init R[4]
3396 +
3397 + vmovq 0($ctx),%x#$H0 # load current hash value
3398 + vmovq 8($ctx),%x#$H1
3399 + vmovq 16($ctx),%x#$H2
3400 +
3401 +.Lblocks_vpmadd52_8x:
3402 + ################################################################
3403 +	# first we calculate more key powers
3404 +
3405 + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3406 + vmovdqu64 160($ctx),$S1
3407 + vmovdqu64 64($ctx),$R0
3408 + vmovdqu64 96($ctx),$R1
3409 +
3410 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3411 + vpaddq $R2,$S2,$S2
3412 + vpsllq \$2,$S2,$S2
3413 +
3414 + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3415 + vpbroadcastq %x#$R0,$RR0
3416 + vpbroadcastq %x#$R1,$RR1
3417 +
3418 + vpxorq $D0lo,$D0lo,$D0lo
3419 + vpmadd52luq $RR2,$S1,$D0lo
3420 + vpxorq $D0hi,$D0hi,$D0hi
3421 + vpmadd52huq $RR2,$S1,$D0hi
3422 + vpxorq $D1lo,$D1lo,$D1lo
3423 + vpmadd52luq $RR2,$S2,$D1lo
3424 + vpxorq $D1hi,$D1hi,$D1hi
3425 + vpmadd52huq $RR2,$S2,$D1hi
3426 + vpxorq $D2lo,$D2lo,$D2lo
3427 + vpmadd52luq $RR2,$R0,$D2lo
3428 + vpxorq $D2hi,$D2hi,$D2hi
3429 + vpmadd52huq $RR2,$R0,$D2hi
3430 +
3431 + vpmadd52luq $RR0,$R0,$D0lo
3432 + vpmadd52huq $RR0,$R0,$D0hi
3433 + vpmadd52luq $RR0,$R1,$D1lo
3434 + vpmadd52huq $RR0,$R1,$D1hi
3435 + vpmadd52luq $RR0,$R2,$D2lo
3436 + vpmadd52huq $RR0,$R2,$D2hi
3437 +
3438 + vpmadd52luq $RR1,$S2,$D0lo
3439 + vpmadd52huq $RR1,$S2,$D0hi
3440 + vpmadd52luq $RR1,$R0,$D1lo
3441 + vpmadd52huq $RR1,$R0,$D1hi
3442 + vpmadd52luq $RR1,$R1,$D2lo
3443 + vpmadd52huq $RR1,$R1,$D2hi
3444 +
3445 + ################################################################
3446 + # partial reduction
3447 + vpsrlq \$44,$D0lo,$tmp
3448 + vpsllq \$8,$D0hi,$D0hi
3449 + vpandq $mask44,$D0lo,$RR0
3450 + vpaddq $tmp,$D0hi,$D0hi
3451 +
3452 + vpaddq $D0hi,$D1lo,$D1lo
3453 +
3454 + vpsrlq \$44,$D1lo,$tmp
3455 + vpsllq \$8,$D1hi,$D1hi
3456 + vpandq $mask44,$D1lo,$RR1
3457 + vpaddq $tmp,$D1hi,$D1hi
3458 +
3459 + vpaddq $D1hi,$D2lo,$D2lo
3460 +
3461 + vpsrlq \$42,$D2lo,$tmp
3462 + vpsllq \$10,$D2hi,$D2hi
3463 + vpandq $mask42,$D2lo,$RR2
3464 + vpaddq $tmp,$D2hi,$D2hi
3465 +
3466 + vpaddq $D2hi,$RR0,$RR0
3467 + vpsllq \$2,$D2hi,$D2hi
3468 +
3469 + vpaddq $D2hi,$RR0,$RR0
3470 +
3471 + vpsrlq \$44,$RR0,$tmp # additional step
3472 + vpandq $mask44,$RR0,$RR0
3473 +
3474 + vpaddq $tmp,$RR1,$RR1
3475 +
3476 + ################################################################
3477 +	# At this point Rx holds the 1324 powers and RRx holds 5768; the goal
3478 +	# is 15263748, which reflects how the data is loaded...
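+	# (the unpack/shuffle sequence below interleaves the two sets of powers
+	#  so that each 64-bit lane ends up with the power matching the age of
+	#  the block it will multiply, given the 73625140 order in which the
+	#  vpunpck{l,h}qdq pair leaves the loaded blocks)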
3479 +
3480 + vpunpcklqdq $R2,$RR2,$T2 # 3748
3481 + vpunpckhqdq $R2,$RR2,$R2 # 1526
3482 + vpunpcklqdq $R0,$RR0,$T0
3483 + vpunpckhqdq $R0,$RR0,$R0
3484 + vpunpcklqdq $R1,$RR1,$T1
3485 + vpunpckhqdq $R1,$RR1,$R1
3486 +___
3487 +######## switch to %zmm
3488 +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3489 +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3490 +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3491 +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3492 +
3493 +$code.=<<___;
3494 + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3495 + vshufi64x2 \$0x44,$R0,$T0,$RR0
3496 + vshufi64x2 \$0x44,$R1,$T1,$RR1
3497 +
3498 + vmovdqu64 16*0($inp),$T2 # load data
3499 + vmovdqu64 16*4($inp),$T3
3500 + lea 16*8($inp),$inp
3501 +
3502 + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3503 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3504 + vpaddq $RR2,$SS2,$SS2
3505 + vpaddq $RR1,$SS1,$SS1
3506 + vpsllq \$2,$SS2,$SS2
3507 + vpsllq \$2,$SS1,$SS1
3508 +
3509 + vpbroadcastq $padbit,$PAD
3510 + vpbroadcastq %x#$mask44,$mask44
3511 + vpbroadcastq %x#$mask42,$mask42
3512 +
3513 + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3514 + vpbroadcastq %x#$SS2,$S2
3515 + vpbroadcastq %x#$RR0,$R0
3516 + vpbroadcastq %x#$RR1,$R1
3517 + vpbroadcastq %x#$RR2,$R2
3518 +
3519 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3520 + vpunpckhqdq $T3,$T2,$T3
3521 +
3522 + # at this point 64-bit lanes are ordered as 73625140
3523 +
3524 + vpsrlq \$24,$T3,$T2 # splat the data
3525 + vporq $PAD,$T2,$T2
3526 + vpaddq $T2,$H2,$H2 # accumulate input
3527 + vpandq $mask44,$T1,$T0
3528 + vpsrlq \$44,$T1,$T1
3529 + vpsllq \$20,$T3,$T3
3530 + vporq $T3,$T1,$T1
3531 + vpandq $mask44,$T1,$T1
3532 +
3533 + sub \$8,$len
3534 + jz .Ltail_vpmadd52_8x
3535 + jmp .Loop_vpmadd52_8x
3536 +
3537 +.align 32
3538 +.Loop_vpmadd52_8x:
3539 + #vpaddq $T2,$H2,$H2 # accumulate input
3540 + vpaddq $T0,$H0,$H0
3541 + vpaddq $T1,$H1,$H1
3542 +
3543 + vpxorq $D0lo,$D0lo,$D0lo
3544 + vpmadd52luq $H2,$S1,$D0lo
3545 + vpxorq $D0hi,$D0hi,$D0hi
3546 + vpmadd52huq $H2,$S1,$D0hi
3547 + vpxorq $D1lo,$D1lo,$D1lo
3548 + vpmadd52luq $H2,$S2,$D1lo
3549 + vpxorq $D1hi,$D1hi,$D1hi
3550 + vpmadd52huq $H2,$S2,$D1hi
3551 + vpxorq $D2lo,$D2lo,$D2lo
3552 + vpmadd52luq $H2,$R0,$D2lo
3553 + vpxorq $D2hi,$D2hi,$D2hi
3554 + vpmadd52huq $H2,$R0,$D2hi
3555 +
3556 + vmovdqu64 16*0($inp),$T2 # load data
3557 + vmovdqu64 16*4($inp),$T3
3558 + lea 16*8($inp),$inp
3559 + vpmadd52luq $H0,$R0,$D0lo
3560 + vpmadd52huq $H0,$R0,$D0hi
3561 + vpmadd52luq $H0,$R1,$D1lo
3562 + vpmadd52huq $H0,$R1,$D1hi
3563 + vpmadd52luq $H0,$R2,$D2lo
3564 + vpmadd52huq $H0,$R2,$D2hi
3565 +
3566 + vpunpcklqdq $T3,$T2,$T1 # transpose data
3567 + vpunpckhqdq $T3,$T2,$T3
3568 + vpmadd52luq $H1,$S2,$D0lo
3569 + vpmadd52huq $H1,$S2,$D0hi
3570 + vpmadd52luq $H1,$R0,$D1lo
3571 + vpmadd52huq $H1,$R0,$D1hi
3572 + vpmadd52luq $H1,$R1,$D2lo
3573 + vpmadd52huq $H1,$R1,$D2hi
3574 +
3575 + ################################################################
3576 + # partial reduction (interleaved with data splat)
3577 + vpsrlq \$44,$D0lo,$tmp
3578 + vpsllq \$8,$D0hi,$D0hi
3579 + vpandq $mask44,$D0lo,$H0
3580 + vpaddq $tmp,$D0hi,$D0hi
3581 +
3582 + vpsrlq \$24,$T3,$T2
3583 + vporq $PAD,$T2,$T2
3584 + vpaddq $D0hi,$D1lo,$D1lo
3585 +
3586 + vpsrlq \$44,$D1lo,$tmp
3587 + vpsllq \$8,$D1hi,$D1hi
3588 + vpandq $mask44,$D1lo,$H1
3589 + vpaddq $tmp,$D1hi,$D1hi
3590 +
3591 + vpandq $mask44,$T1,$T0
3592 + vpsrlq \$44,$T1,$T1
3593 + vpsllq \$20,$T3,$T3
3594 + vpaddq $D1hi,$D2lo,$D2lo
3595 +
3596 + vpsrlq \$42,$D2lo,$tmp
3597 + vpsllq \$10,$D2hi,$D2hi
3598 + vpandq $mask42,$D2lo,$H2
3599 + vpaddq $tmp,$D2hi,$D2hi
3600 +
3601 + vpaddq $T2,$H2,$H2 # accumulate input
3602 + vpaddq $D2hi,$H0,$H0
3603 + vpsllq \$2,$D2hi,$D2hi
3604 +
3605 + vpaddq $D2hi,$H0,$H0
3606 + vporq $T3,$T1,$T1
3607 + vpandq $mask44,$T1,$T1
3608 +
3609 + vpsrlq \$44,$H0,$tmp # additional step
3610 + vpandq $mask44,$H0,$H0
3611 +
3612 + vpaddq $tmp,$H1,$H1
3613 +
3614 + sub \$8,$len # len-=128
3615 + jnz .Loop_vpmadd52_8x
3616 +
3617 +.Ltail_vpmadd52_8x:
3618 + #vpaddq $T2,$H2,$H2 # accumulate input
3619 + vpaddq $T0,$H0,$H0
3620 + vpaddq $T1,$H1,$H1
3621 +
3622 + vpxorq $D0lo,$D0lo,$D0lo
3623 + vpmadd52luq $H2,$SS1,$D0lo
3624 + vpxorq $D0hi,$D0hi,$D0hi
3625 + vpmadd52huq $H2,$SS1,$D0hi
3626 + vpxorq $D1lo,$D1lo,$D1lo
3627 + vpmadd52luq $H2,$SS2,$D1lo
3628 + vpxorq $D1hi,$D1hi,$D1hi
3629 + vpmadd52huq $H2,$SS2,$D1hi
3630 + vpxorq $D2lo,$D2lo,$D2lo
3631 + vpmadd52luq $H2,$RR0,$D2lo
3632 + vpxorq $D2hi,$D2hi,$D2hi
3633 + vpmadd52huq $H2,$RR0,$D2hi
3634 +
3635 + vpmadd52luq $H0,$RR0,$D0lo
3636 + vpmadd52huq $H0,$RR0,$D0hi
3637 + vpmadd52luq $H0,$RR1,$D1lo
3638 + vpmadd52huq $H0,$RR1,$D1hi
3639 + vpmadd52luq $H0,$RR2,$D2lo
3640 + vpmadd52huq $H0,$RR2,$D2hi
3641 +
3642 + vpmadd52luq $H1,$SS2,$D0lo
3643 + vpmadd52huq $H1,$SS2,$D0hi
3644 + vpmadd52luq $H1,$RR0,$D1lo
3645 + vpmadd52huq $H1,$RR0,$D1hi
3646 + vpmadd52luq $H1,$RR1,$D2lo
3647 + vpmadd52huq $H1,$RR1,$D2hi
3648 +
3649 + ################################################################
3650 + # horizontal addition
3651 +
3652 + mov \$1,%eax
3653 + kmovw %eax,%k1
3654 + vpsrldq \$8,$D0lo,$T0
3655 + vpsrldq \$8,$D0hi,$H0
3656 + vpsrldq \$8,$D1lo,$T1
3657 + vpsrldq \$8,$D1hi,$H1
3658 + vpaddq $T0,$D0lo,$D0lo
3659 + vpaddq $H0,$D0hi,$D0hi
3660 + vpsrldq \$8,$D2lo,$T2
3661 + vpsrldq \$8,$D2hi,$H2
3662 + vpaddq $T1,$D1lo,$D1lo
3663 + vpaddq $H1,$D1hi,$D1hi
3664 + vpermq \$0x2,$D0lo,$T0
3665 + vpermq \$0x2,$D0hi,$H0
3666 + vpaddq $T2,$D2lo,$D2lo
3667 + vpaddq $H2,$D2hi,$D2hi
3668 +
3669 + vpermq \$0x2,$D1lo,$T1
3670 + vpermq \$0x2,$D1hi,$H1
3671 + vpaddq $T0,$D0lo,$D0lo
3672 + vpaddq $H0,$D0hi,$D0hi
3673 + vpermq \$0x2,$D2lo,$T2
3674 + vpermq \$0x2,$D2hi,$H2
3675 + vpaddq $T1,$D1lo,$D1lo
3676 + vpaddq $H1,$D1hi,$D1hi
3677 + vextracti64x4 \$1,$D0lo,%y#$T0
3678 + vextracti64x4 \$1,$D0hi,%y#$H0
3679 + vpaddq $T2,$D2lo,$D2lo
3680 + vpaddq $H2,$D2hi,$D2hi
3681 +
3682 + vextracti64x4 \$1,$D1lo,%y#$T1
3683 + vextracti64x4 \$1,$D1hi,%y#$H1
3684 + vextracti64x4 \$1,$D2lo,%y#$T2
3685 + vextracti64x4 \$1,$D2hi,%y#$H2
3686 +___
3687 +######## switch back to %ymm
3688 +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3689 +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3690 +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3691 +
3692 +$code.=<<___;
3693 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3694 + vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3695 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3696 + vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3697 + vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3698 + vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3699 +
3700 + ################################################################
3701 + # partial reduction
3702 + vpsrlq \$44,$D0lo,$tmp
3703 + vpsllq \$8,$D0hi,$D0hi
3704 + vpandq $mask44,$D0lo,$H0
3705 + vpaddq $tmp,$D0hi,$D0hi
3706 +
3707 + vpaddq $D0hi,$D1lo,$D1lo
3708 +
3709 + vpsrlq \$44,$D1lo,$tmp
3710 + vpsllq \$8,$D1hi,$D1hi
3711 + vpandq $mask44,$D1lo,$H1
3712 + vpaddq $tmp,$D1hi,$D1hi
3713 +
3714 + vpaddq $D1hi,$D2lo,$D2lo
3715 +
3716 + vpsrlq \$42,$D2lo,$tmp
3717 + vpsllq \$10,$D2hi,$D2hi
3718 + vpandq $mask42,$D2lo,$H2
3719 + vpaddq $tmp,$D2hi,$D2hi
3720 +
3721 + vpaddq $D2hi,$H0,$H0
3722 + vpsllq \$2,$D2hi,$D2hi
3723 +
3724 + vpaddq $D2hi,$H0,$H0
3725 +
3726 + vpsrlq \$44,$H0,$tmp # additional step
3727 + vpandq $mask44,$H0,$H0
3728 +
3729 + vpaddq $tmp,$H1,$H1
3730 +
3731 + ################################################################
3732 +
3733 + vmovq %x#$H0,0($ctx)
3734 + vmovq %x#$H1,8($ctx)
3735 + vmovq %x#$H2,16($ctx)
3736 + vzeroall
3737 +
3738 +.Lno_data_vpmadd52_8x:
3739 + ret
3740 +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3741 +___
3742 +}
3743 +$code.=<<___;
3744 +.type poly1305_emit_base2_44,\@function,3
3745 +.align 32
3746 +poly1305_emit_base2_44:
3747 + mov 0($ctx),%r8 # load hash value
3748 + mov 8($ctx),%r9
3749 + mov 16($ctx),%r10
3750 +
3751 + mov %r9,%rax
3752 + shr \$20,%r9
3753 + shl \$44,%rax
3754 + mov %r10,%rcx
3755 + shr \$40,%r10
3756 + shl \$24,%rcx
3757 +
3758 + add %rax,%r8
3759 + adc %rcx,%r9
3760 + adc \$0,%r10
3761 +
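+	# (final reduction: h >= p = 2^130-5 exactly when h+5 carries into bit
+	#  130, so the code adds 5, checks bits 130 and up in %r10, and if they
+	#  are set keeps the low 128 bits of h+5, i.e. h-p, for the nonce add)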
3762 + mov %r8,%rax
3763 + add \$5,%r8 # compare to modulus
3764 + mov %r9,%rcx
3765 + adc \$0,%r9
3766 + adc \$0,%r10
3767 + shr \$2,%r10 # did 130-bit value overflow?
3768 + cmovnz %r8,%rax
3769 + cmovnz %r9,%rcx
3770 +
3771 + add 0($nonce),%rax # accumulate nonce
3772 + adc 8($nonce),%rcx
3773 + mov %rax,0($mac) # write result
3774 + mov %rcx,8($mac)
3775 +
3776 + ret
3777 +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3778 +___
3779 +} } }
3780 +$code.=<<___;
3781 +.align 64
3782 +.Lconst:
3783 +.Lmask24:
3784 +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3785 +.L129:
3786 +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3787 +.Lmask26:
3788 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3789 +.Lpermd_avx2:
3790 +.long 2,2,2,3,2,0,2,1
3791 +.Lpermd_avx512:
3792 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3793 +
3794 +.L2_44_inp_permd:
3795 +.long 0,1,1,2,2,3,7,7
3796 +.L2_44_inp_shift:
3797 +.quad 0,12,24,64
3798 +.L2_44_mask:
3799 +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3800 +.L2_44_shift_rgt:
3801 +.quad 44,44,42,64
3802 +.L2_44_shift_lft:
3803 +.quad 8,8,10,64
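+# (the .L2_44_* tables drive the base 2^44 code: the dword permute together
+#  with the 0/12/24-bit variable shifts and the masks split each 16-byte
+#  block into 44/44/42-bit limbs, the shift-by-64 lane keeps the unused top
+#  lane zero, and the 44/44/42 right shifts with the matching 8/8/10 left
+#  shifts align the low and high halves of the vpmadd52 products during
+#  carry propagation)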
3804 +
3805 +.align 64
3806 +.Lx_mask44:
3807 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3808 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3809 +.Lx_mask42:
3810 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3811 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3812 +___
3813 +}
3814 +$code.=<<___;
3815 +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3816 +.align 16
3817 +___
3818 +
3819 +{ # chacha20-poly1305 helpers
3820 +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3821 + ("%rdi","%rsi","%rdx","%rcx"); # Unix order
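+# Both helpers XOR the input with the keystream block held in the ($otp)
+# buffer, store the result at the output pointer, leave a zero-padded copy
+# of the ciphertext in the ($otp) buffer (presumably for a subsequent
+# Poly1305 pass over it), and return the advanced ($otp) pointer in %rax.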
3822 +$code.=<<___;
3823 +.globl xor128_encrypt_n_pad
3824 +.type xor128_encrypt_n_pad,\@abi-omnipotent
3825 +.align 16
3826 +xor128_encrypt_n_pad:
3827 + sub $otp,$inp
3828 + sub $otp,$out
3829 + mov $len,%r10 # put len aside
3830 + shr \$4,$len # len / 16
3831 + jz .Ltail_enc
3832 + nop
3833 +.Loop_enc_xmm:
3834 + movdqu ($inp,$otp),%xmm0
3835 + pxor ($otp),%xmm0
3836 + movdqu %xmm0,($out,$otp)
3837 + movdqa %xmm0,($otp)
3838 + lea 16($otp),$otp
3839 + dec $len
3840 + jnz .Loop_enc_xmm
3841 +
3842 + and \$15,%r10 # len % 16
3843 + jz .Ldone_enc
3844 +
3845 +.Ltail_enc:
3846 + mov \$16,$len
3847 + sub %r10,$len
3848 + xor %eax,%eax
3849 +.Loop_enc_byte:
3850 + mov ($inp,$otp),%al
3851 + xor ($otp),%al
3852 + mov %al,($out,$otp)
3853 + mov %al,($otp)
3854 + lea 1($otp),$otp
3855 + dec %r10
3856 + jnz .Loop_enc_byte
3857 +
3858 + xor %eax,%eax
3859 +.Loop_enc_pad:
3860 + mov %al,($otp)
3861 + lea 1($otp),$otp
3862 + dec $len
3863 + jnz .Loop_enc_pad
3864 +
3865 +.Ldone_enc:
3866 + mov $otp,%rax
3867 + ret
3868 +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3869 +
3870 +.globl xor128_decrypt_n_pad
3871 +.type xor128_decrypt_n_pad,\@abi-omnipotent
3872 +.align 16
3873 +xor128_decrypt_n_pad:
3874 + sub $otp,$inp
3875 + sub $otp,$out
3876 + mov $len,%r10 # put len aside
3877 + shr \$4,$len # len / 16
3878 + jz .Ltail_dec
3879 + nop
3880 +.Loop_dec_xmm:
3881 + movdqu ($inp,$otp),%xmm0
3882 + movdqa ($otp),%xmm1
3883 + pxor %xmm0,%xmm1
3884 + movdqu %xmm1,($out,$otp)
3885 + movdqa %xmm0,($otp)
3886 + lea 16($otp),$otp
3887 + dec $len
3888 + jnz .Loop_dec_xmm
3889 +
3890 + pxor %xmm1,%xmm1
3891 + and \$15,%r10 # len % 16
3892 + jz .Ldone_dec
3893 +
3894 +.Ltail_dec:
3895 + mov \$16,$len
3896 + sub %r10,$len
3897 + xor %eax,%eax
3898 + xor %r11,%r11
3899 +.Loop_dec_byte:
3900 + mov ($inp,$otp),%r11b
3901 + mov ($otp),%al
3902 + xor %r11b,%al
3903 + mov %al,($out,$otp)
3904 + mov %r11b,($otp)
3905 + lea 1($otp),$otp
3906 + dec %r10
3907 + jnz .Loop_dec_byte
3908 +
3909 + xor %eax,%eax
3910 +.Loop_dec_pad:
3911 + mov %al,($otp)
3912 + lea 1($otp),$otp
3913 + dec $len
3914 + jnz .Loop_dec_pad
3915 +
3916 +.Ldone_dec:
3917 + mov $otp,%rax
3918 + ret
3919 +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3920 +___
3921 +}
3922 +
3923 +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3924 +# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3925 +if ($win64) {
3926 +$rec="%rcx";
3927 +$frame="%rdx";
3928 +$context="%r8";
3929 +$disp="%r9";
3930 +
3931 +$code.=<<___;
3932 +.extern __imp_RtlVirtualUnwind
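+# se_handler: unwind handler for the scalar entry points. If the fault lies
+# between the prologue and epilogue labels recorded in HandlerData, it recovers
+# the stack pointer and restores the non-volatile GPRs (%rbx, %rbp, %r12-%r15)
+# saved by the prologue before falling through to the common tail.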
3933 +.type se_handler,\@abi-omnipotent
3934 +.align 16
3935 +se_handler:
3936 + push %rsi
3937 + push %rdi
3938 + push %rbx
3939 + push %rbp
3940 + push %r12
3941 + push %r13
3942 + push %r14
3943 + push %r15
3944 + pushfq
3945 + sub \$64,%rsp
3946 +
3947 + mov 120($context),%rax # pull context->Rax
3948 + mov 248($context),%rbx # pull context->Rip
3949 +
3950 + mov 8($disp),%rsi # disp->ImageBase
3951 + mov 56($disp),%r11 # disp->HandlerData
3952 +
3953 + mov 0(%r11),%r10d # HandlerData[0]
3954 + lea (%rsi,%r10),%r10 # prologue label
3955 + cmp %r10,%rbx # context->Rip<.Lprologue
3956 + jb .Lcommon_seh_tail
3957 +
3958 + mov 152($context),%rax # pull context->Rsp
3959 +
3960 + mov 4(%r11),%r10d # HandlerData[1]
3961 + lea (%rsi,%r10),%r10 # epilogue label
3962 + cmp %r10,%rbx # context->Rip>=.Lepilogue
3963 + jae .Lcommon_seh_tail
3964 +
3965 + lea 48(%rax),%rax
3966 +
3967 + mov -8(%rax),%rbx
3968 + mov -16(%rax),%rbp
3969 + mov -24(%rax),%r12
3970 + mov -32(%rax),%r13
3971 + mov -40(%rax),%r14
3972 + mov -48(%rax),%r15
3973 + mov %rbx,144($context) # restore context->Rbx
3974 + mov %rbp,160($context) # restore context->Rbp
3975 + mov %r12,216($context) # restore context->R12
3976 + mov %r13,224($context) # restore context->R13
3977 + mov %r14,232($context) # restore context->R14
3978 + mov %r15,240($context) # restore context->R15
3979 +
3980 + jmp .Lcommon_seh_tail
3981 +.size se_handler,.-se_handler
3982 +
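+# avx_handler: unwind handler for the AVX/AVX2/AVX-512 paths. Besides the
+# common tail below, it copies the ten saved %xmm6-%xmm15 registers (20
+# quadwords) from the stack frame recorded in context->R11 back into the
+# CONTEXT record.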
3983 +.type avx_handler,\@abi-omnipotent
3984 +.align 16
3985 +avx_handler:
3986 + push %rsi
3987 + push %rdi
3988 + push %rbx
3989 + push %rbp
3990 + push %r12
3991 + push %r13
3992 + push %r14
3993 + push %r15
3994 + pushfq
3995 + sub \$64,%rsp
3996 +
3997 + mov 120($context),%rax # pull context->Rax
3998 + mov 248($context),%rbx # pull context->Rip
3999 +
4000 + mov 8($disp),%rsi # disp->ImageBase
4001 + mov 56($disp),%r11 # disp->HandlerData
4002 +
4003 + mov 0(%r11),%r10d # HandlerData[0]
4004 + lea (%rsi,%r10),%r10 # prologue label
4005 + cmp %r10,%rbx # context->Rip<prologue label
4006 + jb .Lcommon_seh_tail
4007 +
4008 + mov 152($context),%rax # pull context->Rsp
4009 +
4010 + mov 4(%r11),%r10d # HandlerData[1]
4011 + lea (%rsi,%r10),%r10 # epilogue label
4012 + cmp %r10,%rbx # context->Rip>=epilogue label
4013 + jae .Lcommon_seh_tail
4014 +
4015 + mov 208($context),%rax # pull context->R11
4016 +
4017 + lea 0x50(%rax),%rsi
4018 + lea 0xf8(%rax),%rax
4019 + lea 512($context),%rdi # &context.Xmm6
4020 + mov \$20,%ecx
4021 + .long 0xa548f3fc # cld; rep movsq
4022 +
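+# Common tail: update Rsp/Rsi/Rdi in the CONTEXT, copy the adjusted CONTEXT
+# into disp->ContextRecord, let RtlVirtualUnwind continue the unwind and
+# return ExceptionContinueSearch.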
4023 +.Lcommon_seh_tail:
4024 + mov 8(%rax),%rdi
4025 + mov 16(%rax),%rsi
4026 + mov %rax,152($context) # restore context->Rsp
4027 + mov %rsi,168($context) # restore context->Rsi
4028 + mov %rdi,176($context) # restore context->Rdi
4029 +
4030 + mov 40($disp),%rdi # disp->ContextRecord
4031 + mov $context,%rsi # context
4032 + mov \$154,%ecx # sizeof(CONTEXT) in quadwords (rep movsq count)
4033 + .long 0xa548f3fc # cld; rep movsq
4034 +
4035 + mov $disp,%rsi
4036 + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4037 + mov 8(%rsi),%rdx # arg2, disp->ImageBase
4038 + mov 0(%rsi),%r8 # arg3, disp->ControlPc
4039 + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4040 + mov 40(%rsi),%r10 # disp->ContextRecord
4041 + lea 56(%rsi),%r11 # &disp->HandlerData
4042 + lea 24(%rsi),%r12 # &disp->EstablisherFrame
4043 + mov %r10,32(%rsp) # arg5
4044 + mov %r11,40(%rsp) # arg6
4045 + mov %r12,48(%rsp) # arg7
4046 + mov %rcx,56(%rsp) # arg8, (NULL)
4047 + call *__imp_RtlVirtualUnwind(%rip)
4048 +
4049 + mov \$1,%eax # ExceptionContinueSearch
4050 + add \$64,%rsp
4051 + popfq
4052 + pop %r15
4053 + pop %r14
4054 + pop %r13
4055 + pop %r12
4056 + pop %rbp
4057 + pop %rbx
4058 + pop %rdi
4059 + pop %rsi
4060 + ret
4061 +.size avx_handler,.-avx_handler
4062 +
4063 +.section .pdata
4064 +.align 4
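+# RUNTIME_FUNCTION table: each .rva triplet maps a code range to its
+# UNWIND_INFO entry in .xdata below.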
4065 + .rva .LSEH_begin_poly1305_init
4066 + .rva .LSEH_end_poly1305_init
4067 + .rva .LSEH_info_poly1305_init
4068 +
4069 + .rva .LSEH_begin_poly1305_blocks
4070 + .rva .LSEH_end_poly1305_blocks
4071 + .rva .LSEH_info_poly1305_blocks
4072 +
4073 + .rva .LSEH_begin_poly1305_emit
4074 + .rva .LSEH_end_poly1305_emit
4075 + .rva .LSEH_info_poly1305_emit
4076 +___
4077 +$code.=<<___ if ($avx);
4078 + .rva .LSEH_begin_poly1305_blocks_avx
4079 + .rva .Lbase2_64_avx
4080 + .rva .LSEH_info_poly1305_blocks_avx_1
4081 +
4082 + .rva .Lbase2_64_avx
4083 + .rva .Leven_avx
4084 + .rva .LSEH_info_poly1305_blocks_avx_2
4085 +
4086 + .rva .Leven_avx
4087 + .rva .LSEH_end_poly1305_blocks_avx
4088 + .rva .LSEH_info_poly1305_blocks_avx_3
4089 +
4090 + .rva .LSEH_begin_poly1305_emit_avx
4091 + .rva .LSEH_end_poly1305_emit_avx
4092 + .rva .LSEH_info_poly1305_emit_avx
4093 +___
4094 +$code.=<<___ if ($avx>1);
4095 + .rva .LSEH_begin_poly1305_blocks_avx2
4096 + .rva .Lbase2_64_avx2
4097 + .rva .LSEH_info_poly1305_blocks_avx2_1
4098 +
4099 + .rva .Lbase2_64_avx2
4100 + .rva .Leven_avx2
4101 + .rva .LSEH_info_poly1305_blocks_avx2_2
4102 +
4103 + .rva .Leven_avx2
4104 + .rva .LSEH_end_poly1305_blocks_avx2
4105 + .rva .LSEH_info_poly1305_blocks_avx2_3
4106 +___
4107 +$code.=<<___ if ($avx>2);
4108 + .rva .LSEH_begin_poly1305_blocks_avx512
4109 + .rva .LSEH_end_poly1305_blocks_avx512
4110 + .rva .LSEH_info_poly1305_blocks_avx512
4111 +___
4112 +$code.=<<___;
4113 +.section .xdata
4114 +.align 8
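+# UNWIND_INFO entries: the leading .byte 9 encodes version 1 with
+# UNW_FLAG_EHANDLER, followed by the handler RVA and its HandlerData
+# (the prologue/epilogue label pair the handlers check above).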
4115 +.LSEH_info_poly1305_init:
4116 + .byte 9,0,0,0
4117 + .rva se_handler
4118 + .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
4119 +
4120 +.LSEH_info_poly1305_blocks:
4121 + .byte 9,0,0,0
4122 + .rva se_handler
4123 + .rva .Lblocks_body,.Lblocks_epilogue
4124 +
4125 +.LSEH_info_poly1305_emit:
4126 + .byte 9,0,0,0
4127 + .rva se_handler
4128 + .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
4129 +___
4130 +$code.=<<___ if ($avx);
4131 +.LSEH_info_poly1305_blocks_avx_1:
4132 + .byte 9,0,0,0
4133 + .rva se_handler
4134 + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4135 +
4136 +.LSEH_info_poly1305_blocks_avx_2:
4137 + .byte 9,0,0,0
4138 + .rva se_handler
4139 + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4140 +
4141 +.LSEH_info_poly1305_blocks_avx_3:
4142 + .byte 9,0,0,0
4143 + .rva avx_handler
4144 + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4145 +
4146 +.LSEH_info_poly1305_emit_avx:
4147 + .byte 9,0,0,0
4148 + .rva se_handler
4149 + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4150 +___
4151 +$code.=<<___ if ($avx>1);
4152 +.LSEH_info_poly1305_blocks_avx2_1:
4153 + .byte 9,0,0,0
4154 + .rva se_handler
4155 + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4156 +
4157 +.LSEH_info_poly1305_blocks_avx2_2:
4158 + .byte 9,0,0,0
4159 + .rva se_handler
4160 + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4161 +
4162 +.LSEH_info_poly1305_blocks_avx2_3:
4163 + .byte 9,0,0,0
4164 + .rva avx_handler
4165 + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4166 +___
4167 +$code.=<<___ if ($avx>2);
4168 +.LSEH_info_poly1305_blocks_avx512:
4169 + .byte 9,0,0,0
4170 + .rva avx_handler
4171 + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
4172 +___
4173 +}
4174 +
4175 +foreach (split('\n',$code)) { # post-process and print the generated assembly
4176 + s/\`([^\`]*)\`/eval($1)/ge; # evaluate constant expressions in backticks, e.g. 1<<24
4177 + s/%r([a-z]+)#d/%e$1/g; # %rax#d -> %eax (32-bit form of legacy registers)
4178 + s/%r([0-9]+)#d/%r$1d/g; # %r10#d -> %r10d (32-bit form of extended registers)
4179 + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; # collapse paired xmm/ymm/zmm notation to one register width
4180 +
4181 + print $_,"\n";
4182 +}
4183 +close STDOUT;