kernel: 5.4: import wireguard backport
target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:26 +0100
4 Subject: [PATCH] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS
5 optimized implementation
6 MIME-Version: 1.0
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
9
10 commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
11
12 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
13 MIPS authored by Andy Polyakov, a prior 64-bit-only version of which he had
14 contributed to the OpenSSL project. The file 'poly1305-mips.pl' is taken
15 verbatim from the upstream GitHub repository [0] at commit
16 d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
17 required to build it as part of a Linux kernel module.
18
19 [0] https://github.com/dot-asm/cryptogams
20
21 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
22 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
23 Co-developed-by: René van Dorst <opensource@vdorst.com>
24 Signed-off-by: René van Dorst <opensource@vdorst.com>
25 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
26 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
27 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
28 ---
29 arch/mips/crypto/Makefile | 14 +
30 arch/mips/crypto/poly1305-glue.c | 203 +++++
31 arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
32 crypto/Kconfig | 5 +
33 lib/crypto/Kconfig | 1 +
34 5 files changed, 1496 insertions(+)
35 create mode 100644 arch/mips/crypto/poly1305-glue.c
36 create mode 100644 arch/mips/crypto/poly1305-mips.pl
37
38 --- a/arch/mips/crypto/Makefile
39 +++ b/arch/mips/crypto/Makefile
40 @@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
41 obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
42 chacha-mips-y := chacha-core.o chacha-glue.o
43 AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
44 +
45 +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
46 +poly1305-mips-y := poly1305-core.o poly1305-glue.o
47 +
48 +perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
49 +perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
50 +
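+# Generate poly1305-core.S from the CRYPTOGAMS perlasm source at build time;
+# the flavour argument selects the o32 or 64-bit ABI code path.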
51 +quiet_cmd_perlasm = PERLASM $@
52 + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
53 +
54 +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
55 + $(call if_changed,perlasm)
56 +
57 +targets += poly1305-core.S
58 --- /dev/null
59 +++ b/arch/mips/crypto/poly1305-glue.c
60 @@ -0,0 +1,203 @@
61 +// SPDX-License-Identifier: GPL-2.0
62 +/*
63 + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
64 + *
65 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
66 + */
67 +
68 +#include <asm/unaligned.h>
69 +#include <crypto/algapi.h>
70 +#include <crypto/internal/hash.h>
71 +#include <crypto/internal/poly1305.h>
72 +#include <linux/cpufeature.h>
73 +#include <linux/crypto.h>
74 +#include <linux/module.h>
75 +
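+/*
+ * Entry points into the CRYPTOGAMS-generated assembly. 'state' points at
+ * the 130-bit accumulator; poly1305_init_mips() also stores the clamped
+ * r key behind it. 'hibit' is the 2^128 pad bit: 1 for full blocks,
+ * 0 for the zero-padded final block.
+ */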
76 +asmlinkage void poly1305_init_mips(void *state, const u8 *key);
77 +asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
78 +asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
79 +
80 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
81 +{
82 + poly1305_init_mips(&dctx->h, key);
83 + dctx->s[0] = get_unaligned_le32(key + 16);
84 + dctx->s[1] = get_unaligned_le32(key + 20);
85 + dctx->s[2] = get_unaligned_le32(key + 24);
86 + dctx->s[3] = get_unaligned_le32(key + 28);
87 + dctx->buflen = 0;
88 +}
89 +EXPORT_SYMBOL(poly1305_init_arch);
90 +
91 +static int mips_poly1305_init(struct shash_desc *desc)
92 +{
93 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
94 +
95 + dctx->buflen = 0;
96 + dctx->rset = 0;
97 + dctx->sset = false;
98 +
99 + return 0;
100 +}
101 +
102 +static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
103 + u32 len, u32 hibit)
104 +{
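+	/*
+	 * For the shash interface the 32-byte one-time key arrives in-band
+	 * as the first two "blocks" of data: the clamped r half first,
+	 * then the s half.
+	 */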
105 + if (unlikely(!dctx->sset)) {
106 + if (!dctx->rset) {
107 + poly1305_init_mips(&dctx->h, src);
108 + src += POLY1305_BLOCK_SIZE;
109 + len -= POLY1305_BLOCK_SIZE;
110 + dctx->rset = 1;
111 + }
112 + if (len >= POLY1305_BLOCK_SIZE) {
113 + dctx->s[0] = get_unaligned_le32(src + 0);
114 + dctx->s[1] = get_unaligned_le32(src + 4);
115 + dctx->s[2] = get_unaligned_le32(src + 8);
116 + dctx->s[3] = get_unaligned_le32(src + 12);
117 + src += POLY1305_BLOCK_SIZE;
118 + len -= POLY1305_BLOCK_SIZE;
119 + dctx->sset = true;
120 + }
121 + if (len < POLY1305_BLOCK_SIZE)
122 + return;
123 + }
124 +
125 + len &= ~(POLY1305_BLOCK_SIZE - 1);
126 +
127 + poly1305_blocks_mips(&dctx->h, src, len, hibit);
128 +}
129 +
130 +static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
131 + unsigned int len)
132 +{
133 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
134 +
135 + if (unlikely(dctx->buflen)) {
136 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
137 +
138 + memcpy(dctx->buf + dctx->buflen, src, bytes);
139 + src += bytes;
140 + len -= bytes;
141 + dctx->buflen += bytes;
142 +
143 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
144 + mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
145 + dctx->buflen = 0;
146 + }
147 + }
148 +
149 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
150 + mips_poly1305_blocks(dctx, src, len, 1);
151 + src += round_down(len, POLY1305_BLOCK_SIZE);
152 + len %= POLY1305_BLOCK_SIZE;
153 + }
154 +
155 + if (unlikely(len)) {
156 + dctx->buflen = len;
157 + memcpy(dctx->buf, src, len);
158 + }
159 + return 0;
160 +}
161 +
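+/*
+ * Library interface (poly1305_*_arch), used by e.g. the ChaCha20-Poly1305
+ * construction; here the key is set explicitly via poly1305_init_arch()
+ * instead of being streamed in as data like in the shash path above.
+ */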
162 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
163 + unsigned int nbytes)
164 +{
165 + if (unlikely(dctx->buflen)) {
166 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
167 +
168 + memcpy(dctx->buf + dctx->buflen, src, bytes);
169 + src += bytes;
170 + nbytes -= bytes;
171 + dctx->buflen += bytes;
172 +
173 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
174 + poly1305_blocks_mips(&dctx->h, dctx->buf,
175 + POLY1305_BLOCK_SIZE, 1);
176 + dctx->buflen = 0;
177 + }
178 + }
179 +
180 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
181 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
182 +
183 + poly1305_blocks_mips(&dctx->h, src, len, 1);
184 + src += len;
185 + nbytes %= POLY1305_BLOCK_SIZE;
186 + }
187 +
188 + if (unlikely(nbytes)) {
189 + dctx->buflen = nbytes;
190 + memcpy(dctx->buf, src, nbytes);
191 + }
192 +}
193 +EXPORT_SYMBOL(poly1305_update_arch);
194 +
195 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
196 +{
197 + __le32 digest[4];
198 + u64 f = 0;
199 +
200 + if (unlikely(dctx->buflen)) {
201 + dctx->buf[dctx->buflen++] = 1;
202 + memset(dctx->buf + dctx->buflen, 0,
203 + POLY1305_BLOCK_SIZE - dctx->buflen);
204 + poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
205 + }
206 +
207 + poly1305_emit_mips(&dctx->h, digest, dctx->s);
208 +
209 + /* mac = (h + s) % (2^128) */
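+	/* f carries: each addition emits its low 32 bits and propagates
+	 * the high bits into the next limb. */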
210 + f = (f >> 32) + le32_to_cpu(digest[0]);
211 + put_unaligned_le32(f, dst);
212 + f = (f >> 32) + le32_to_cpu(digest[1]);
213 + put_unaligned_le32(f, dst + 4);
214 + f = (f >> 32) + le32_to_cpu(digest[2]);
215 + put_unaligned_le32(f, dst + 8);
216 + f = (f >> 32) + le32_to_cpu(digest[3]);
217 + put_unaligned_le32(f, dst + 12);
218 +
219 + *dctx = (struct poly1305_desc_ctx){};
220 +}
221 +EXPORT_SYMBOL(poly1305_final_arch);
222 +
223 +static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
224 +{
225 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
226 +
227 + if (unlikely(!dctx->sset))
228 + return -ENOKEY;
229 +
230 + poly1305_final_arch(dctx, dst);
231 + return 0;
232 +}
233 +
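+/* cra_priority 200 outranks the generic C implementation (priority 100). */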
234 +static struct shash_alg mips_poly1305_alg = {
235 + .init = mips_poly1305_init,
236 + .update = mips_poly1305_update,
237 + .final = mips_poly1305_final,
238 + .digestsize = POLY1305_DIGEST_SIZE,
239 + .descsize = sizeof(struct poly1305_desc_ctx),
240 +
241 + .base.cra_name = "poly1305",
242 + .base.cra_driver_name = "poly1305-mips",
243 + .base.cra_priority = 200,
244 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
245 + .base.cra_module = THIS_MODULE,
246 +};
247 +
248 +static int __init mips_poly1305_mod_init(void)
249 +{
250 + return crypto_register_shash(&mips_poly1305_alg);
251 +}
252 +
253 +static void __exit mips_poly1305_mod_exit(void)
254 +{
255 + crypto_unregister_shash(&mips_poly1305_alg);
256 +}
257 +
258 +module_init(mips_poly1305_mod_init);
259 +module_exit(mips_poly1305_mod_exit);
260 +
261 +MODULE_LICENSE("GPL v2");
262 +MODULE_ALIAS_CRYPTO("poly1305");
263 +MODULE_ALIAS_CRYPTO("poly1305-mips");
264 --- /dev/null
265 +++ b/arch/mips/crypto/poly1305-mips.pl
266 @@ -0,0 +1,1273 @@
267 +#!/usr/bin/env perl
268 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
269 +#
270 +# ====================================================================
271 +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
272 +# project.
273 +# ====================================================================
274 +
275 +# Poly1305 hash for MIPS.
276 +#
277 +# May 2016
278 +#
279 +# Numbers are cycles per processed byte with poly1305_blocks alone.
280 +#
281 +# IALU/gcc
282 +# R1x000 ~5.5/+130% (big-endian)
283 +# Octeon II 2.50/+70% (little-endian)
284 +#
285 +# March 2019
286 +#
287 +# Add 32-bit code path.
288 +#
289 +# October 2019
290 +#
291 +# Modulo-scheduled reduction makes it possible to omit the dependency chain
292 +# at the end of the inner loop, improving performance. Also optimize the
293 +# MIPS32R2 code path for the MIPS 1004K core, per René van Dorst's suggestions.
294 +#
295 +# IALU/gcc
296 +# R1x000 ~9.8/? (big-endian)
297 +# Octeon II 3.65/+140% (little-endian)
298 +# MT7621/1004K 4.75/? (little-endian)
299 +#
300 +######################################################################
301 +# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
302 +# widely used. Then there is a new contender: NUBI. It appears that if
303 +# one picks the latter, it's possible to arrange code in an ABI-neutral
304 +# manner. Therefore let's stick to the NUBI register layout:
305 +#
306 +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
307 +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
308 +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
309 +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
310 +#
311 +# The return value is placed in $a0. The following coding rules facilitate
312 +# interoperability:
313 +#
314 +# - never ever touch $tp, "thread pointer", former $gp [o32 can be
315 +# excluded from the rule, because it's specified volatile];
316 +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
317 +# old code];
318 +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
319 +#
320 +# For reference here is register layout for N32/64 MIPS ABIs:
321 +#
322 +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
323 +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
324 +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
325 +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
326 +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
327 +#
328 +# <appro@openssl.org>
329 +#
330 +######################################################################
331 +
332 +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
333 +
334 +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
335 +
336 +if ($flavour =~ /64|n32/i) {{{
337 +######################################################################
338 +# 64-bit code path
339 +#
340 +
341 +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
342 +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
343 +
344 +$code.=<<___;
345 +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
346 + defined(_MIPS_ARCH_MIPS64R6)) \\
347 + && !defined(_MIPS_ARCH_MIPS64R2)
348 +# define _MIPS_ARCH_MIPS64R2
349 +#endif
350 +
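+# MIPS64 R6 removed the hi/lo register pair, so the macros below map the
+# classic dmultu/mflo/mfhi sequence onto the three-operand dmulu/dmuhu
+# forms; on pre-R6 they expand to the traditional instructions.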
351 +#if defined(_MIPS_ARCH_MIPS64R6)
352 +# define dmultu(rs,rt)
353 +# define mflo(rd,rs,rt) dmulu rd,rs,rt
354 +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
355 +#else
356 +# define dmultu(rs,rt) dmultu rs,rt
357 +# define mflo(rd,rs,rt) mflo rd
358 +# define mfhi(rd,rs,rt) mfhi rd
359 +#endif
360 +
361 +#ifdef __KERNEL__
362 +# define poly1305_init poly1305_init_mips
363 +# define poly1305_blocks poly1305_blocks_mips
364 +# define poly1305_emit poly1305_emit_mips
365 +#endif
366 +
367 +#if defined(__MIPSEB__) && !defined(MIPSEB)
368 +# define MIPSEB
369 +#endif
370 +
371 +#ifdef MIPSEB
372 +# define MSB 0
373 +# define LSB 7
374 +#else
375 +# define MSB 7
376 +# define LSB 0
377 +#endif
378 +
379 +.text
380 +.set noat
381 +.set noreorder
382 +
383 +.align 5
384 +.globl poly1305_init
385 +.ent poly1305_init
386 +poly1305_init:
387 + .frame $sp,0,$ra
388 + .set reorder
389 +
390 + sd $zero,0($ctx)
391 + sd $zero,8($ctx)
392 + sd $zero,16($ctx)
393 +
394 + beqz $inp,.Lno_key
395 +
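+# R6 also dropped the ldl/ldr unaligned-load pairs, so the key is read
+# with aligned loads and re-assembled with variable shifts instead.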
396 +#if defined(_MIPS_ARCH_MIPS64R6)
397 + andi $tmp0,$inp,7 # $inp % 8
398 + dsubu $inp,$inp,$tmp0 # align $inp
399 + sll $tmp0,$tmp0,3 # byte to bit offset
400 + ld $in0,0($inp)
401 + ld $in1,8($inp)
402 + beqz $tmp0,.Laligned_key
403 + ld $tmp2,16($inp)
404 +
405 + subu $tmp1,$zero,$tmp0
406 +# ifdef MIPSEB
407 + dsllv $in0,$in0,$tmp0
408 + dsrlv $tmp3,$in1,$tmp1
409 + dsllv $in1,$in1,$tmp0
410 + dsrlv $tmp2,$tmp2,$tmp1
411 +# else
412 + dsrlv $in0,$in0,$tmp0
413 + dsllv $tmp3,$in1,$tmp1
414 + dsrlv $in1,$in1,$tmp0
415 + dsllv $tmp2,$tmp2,$tmp1
416 +# endif
417 + or $in0,$in0,$tmp3
418 + or $in1,$in1,$tmp2
419 +.Laligned_key:
420 +#else
421 + ldl $in0,0+MSB($inp)
422 + ldl $in1,8+MSB($inp)
423 + ldr $in0,0+LSB($inp)
424 + ldr $in1,8+LSB($inp)
425 +#endif
426 +#ifdef MIPSEB
427 +# if defined(_MIPS_ARCH_MIPS64R2)
428 + dsbh $in0,$in0 # byte swap
429 + dsbh $in1,$in1
430 + dshd $in0,$in0
431 + dshd $in1,$in1
432 +# else
433 + ori $tmp0,$zero,0xFF
434 + dsll $tmp2,$tmp0,32
435 + or $tmp0,$tmp2 # 0x000000FF000000FF
436 +
437 + and $tmp1,$in0,$tmp0 # byte swap
438 + and $tmp3,$in1,$tmp0
439 + dsrl $tmp2,$in0,24
440 + dsrl $tmp4,$in1,24
441 + dsll $tmp1,24
442 + dsll $tmp3,24
443 + and $tmp2,$tmp0
444 + and $tmp4,$tmp0
445 + dsll $tmp0,8 # 0x0000FF000000FF00
446 + or $tmp1,$tmp2
447 + or $tmp3,$tmp4
448 + and $tmp2,$in0,$tmp0
449 + and $tmp4,$in1,$tmp0
450 + dsrl $in0,8
451 + dsrl $in1,8
452 + dsll $tmp2,8
453 + dsll $tmp4,8
454 + and $in0,$tmp0
455 + and $in1,$tmp0
456 + or $tmp1,$tmp2
457 + or $tmp3,$tmp4
458 + or $in0,$tmp1
459 + or $in1,$tmp3
460 + dsrl $tmp1,$in0,32
461 + dsrl $tmp3,$in1,32
462 + dsll $in0,32
463 + dsll $in1,32
464 + or $in0,$tmp1
465 + or $in1,$tmp3
466 +# endif
467 +#endif
468 + li $tmp0,1
469 + dsll $tmp0,32 # 0x0000000100000000
470 + daddiu $tmp0,-63 # 0x00000000ffffffc1
471 + dsll $tmp0,28 # 0x0ffffffc10000000
472 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff
473 +
474 + and $in0,$tmp0
475 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
476 + and $in1,$tmp0
477 +
478 + sd $in0,24($ctx)
479 + dsrl $tmp0,$in1,2
480 + sd $in1,32($ctx)
481 + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
482 + sd $tmp0,40($ctx)
483 +
484 +.Lno_key:
485 + li $v0,0 # return 0
486 + jr $ra
487 +.end poly1305_init
488 +___
489 +{
490 +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
491 +
492 +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
493 + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
494 +my ($shr,$shl) = ($s6,$s7); # used on R6
495 +
496 +$code.=<<___;
497 +.align 5
498 +.globl poly1305_blocks
499 +.ent poly1305_blocks
500 +poly1305_blocks:
501 + .set noreorder
502 + dsrl $len,4 # number of complete blocks
503 + bnez $len,poly1305_blocks_internal
504 + nop
505 + jr $ra
506 + nop
507 +.end poly1305_blocks
508 +
509 +.align 5
510 +.ent poly1305_blocks_internal
511 +poly1305_blocks_internal:
512 + .set noreorder
513 +#if defined(_MIPS_ARCH_MIPS64R6)
514 + .frame $sp,8*8,$ra
515 + .mask $SAVED_REGS_MASK|0x000c0000,-8
516 + dsubu $sp,8*8
517 + sd $s7,56($sp)
518 + sd $s6,48($sp)
519 +#else
520 + .frame $sp,6*8,$ra
521 + .mask $SAVED_REGS_MASK,-8
522 + dsubu $sp,6*8
523 +#endif
524 + sd $s5,40($sp)
525 + sd $s4,32($sp)
526 +___
527 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
528 + sd $s3,24($sp)
529 + sd $s2,16($sp)
530 + sd $s1,8($sp)
531 + sd $s0,0($sp)
532 +___
533 +$code.=<<___;
534 + .set reorder
535 +
536 +#if defined(_MIPS_ARCH_MIPS64R6)
537 + andi $shr,$inp,7
538 + dsubu $inp,$inp,$shr # align $inp
539 + sll $shr,$shr,3 # byte to bit offset
540 + subu $shl,$zero,$shr
541 +#endif
542 +
543 + ld $h0,0($ctx) # load hash value
544 + ld $h1,8($ctx)
545 + ld $h2,16($ctx)
546 +
547 + ld $r0,24($ctx) # load key
548 + ld $r1,32($ctx)
549 + ld $rs1,40($ctx)
550 +
551 + dsll $len,4
552 + daddu $len,$inp # end of buffer
553 + b .Loop
554 +
555 +.align 4
556 +.Loop:
557 +#if defined(_MIPS_ARCH_MIPS64R6)
558 + ld $in0,0($inp) # load input
559 + ld $in1,8($inp)
560 + beqz $shr,.Laligned_inp
561 +
562 + ld $tmp2,16($inp)
563 +# ifdef MIPSEB
564 + dsllv $in0,$in0,$shr
565 + dsrlv $tmp3,$in1,$shl
566 + dsllv $in1,$in1,$shr
567 + dsrlv $tmp2,$tmp2,$shl
568 +# else
569 + dsrlv $in0,$in0,$shr
570 + dsllv $tmp3,$in1,$shl
571 + dsrlv $in1,$in1,$shr
572 + dsllv $tmp2,$tmp2,$shl
573 +# endif
574 + or $in0,$in0,$tmp3
575 + or $in1,$in1,$tmp2
576 +.Laligned_inp:
577 +#else
578 + ldl $in0,0+MSB($inp) # load input
579 + ldl $in1,8+MSB($inp)
580 + ldr $in0,0+LSB($inp)
581 + ldr $in1,8+LSB($inp)
582 +#endif
583 + daddiu $inp,16
584 +#ifdef MIPSEB
585 +# if defined(_MIPS_ARCH_MIPS64R2)
586 + dsbh $in0,$in0 # byte swap
587 + dsbh $in1,$in1
588 + dshd $in0,$in0
589 + dshd $in1,$in1
590 +# else
591 + ori $tmp0,$zero,0xFF
592 + dsll $tmp2,$tmp0,32
593 + or $tmp0,$tmp2 # 0x000000FF000000FF
594 +
595 + and $tmp1,$in0,$tmp0 # byte swap
596 + and $tmp3,$in1,$tmp0
597 + dsrl $tmp2,$in0,24
598 + dsrl $tmp4,$in1,24
599 + dsll $tmp1,24
600 + dsll $tmp3,24
601 + and $tmp2,$tmp0
602 + and $tmp4,$tmp0
603 + dsll $tmp0,8 # 0x0000FF000000FF00
604 + or $tmp1,$tmp2
605 + or $tmp3,$tmp4
606 + and $tmp2,$in0,$tmp0
607 + and $tmp4,$in1,$tmp0
608 + dsrl $in0,8
609 + dsrl $in1,8
610 + dsll $tmp2,8
611 + dsll $tmp4,8
612 + and $in0,$tmp0
613 + and $in1,$tmp0
614 + or $tmp1,$tmp2
615 + or $tmp3,$tmp4
616 + or $in0,$tmp1
617 + or $in1,$tmp3
618 + dsrl $tmp1,$in0,32
619 + dsrl $tmp3,$in1,32
620 + dsll $in0,32
621 + dsll $in1,32
622 + or $in0,$tmp1
623 + or $in1,$tmp3
624 +# endif
625 +#endif
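+	# 2^130 == 5 mod (2^130 - 5): the bits of $h2 above bit 1 are folded
+	# back into the low limbs multiplied by 5, computed below as
+	# (h2>>2) + ((h2>>2)<<2).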
626 + dsrl $tmp1,$h2,2 # modulo-scheduled reduction
627 + andi $h2,$h2,3
628 + dsll $tmp0,$tmp1,2
629 +
630 + daddu $d0,$h0,$in0 # accumulate input
631 + daddu $tmp1,$tmp0
632 + sltu $tmp0,$d0,$h0
633 + daddu $d0,$d0,$tmp1 # ... and residue
634 + sltu $tmp1,$d0,$tmp1
635 + daddu $d1,$h1,$in1
636 + daddu $tmp0,$tmp1
637 + sltu $tmp1,$d1,$h1
638 + daddu $d1,$tmp0
639 +
640 + dmultu ($r0,$d0) # h0*r0
641 + daddu $d2,$h2,$padbit
642 + sltu $tmp0,$d1,$tmp0
643 + mflo ($h0,$r0,$d0)
644 + mfhi ($h1,$r0,$d0)
645 +
646 + dmultu ($rs1,$d1) # h1*5*r1
647 + daddu $d2,$tmp1
648 + daddu $d2,$tmp0
649 + mflo ($tmp0,$rs1,$d1)
650 + mfhi ($tmp1,$rs1,$d1)
651 +
652 + dmultu ($r1,$d0) # h0*r1
653 + mflo ($tmp2,$r1,$d0)
654 + mfhi ($h2,$r1,$d0)
655 + daddu $h0,$tmp0
656 + daddu $h1,$tmp1
657 + sltu $tmp0,$h0,$tmp0
658 +
659 + dmultu ($r0,$d1) # h1*r0
660 + daddu $h1,$tmp0
661 + daddu $h1,$tmp2
662 + mflo ($tmp0,$r0,$d1)
663 + mfhi ($tmp1,$r0,$d1)
664 +
665 + dmultu ($rs1,$d2) # h2*5*r1
666 + sltu $tmp2,$h1,$tmp2
667 + daddu $h2,$tmp2
668 + mflo ($tmp2,$rs1,$d2)
669 +
670 + dmultu ($r0,$d2) # h2*r0
671 + daddu $h1,$tmp0
672 + daddu $h2,$tmp1
673 + mflo ($tmp3,$r0,$d2)
674 + sltu $tmp0,$h1,$tmp0
675 + daddu $h2,$tmp0
676 +
677 + daddu $h1,$tmp2
678 + sltu $tmp2,$h1,$tmp2
679 + daddu $h2,$tmp2
680 + daddu $h2,$tmp3
681 +
682 + bne $inp,$len,.Loop
683 +
684 + sd $h0,0($ctx) # store hash value
685 + sd $h1,8($ctx)
686 + sd $h2,16($ctx)
687 +
688 + .set noreorder
689 +#if defined(_MIPS_ARCH_MIPS64R6)
690 + ld $s7,56($sp)
691 + ld $s6,48($sp)
692 +#endif
693 + ld $s5,40($sp) # epilogue
694 + ld $s4,32($sp)
695 +___
696 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
697 + ld $s3,24($sp)
698 + ld $s2,16($sp)
699 + ld $s1,8($sp)
700 + ld $s0,0($sp)
701 +___
702 +$code.=<<___;
703 + jr $ra
704 +#if defined(_MIPS_ARCH_MIPS64R6)
705 + daddu $sp,8*8
706 +#else
707 + daddu $sp,6*8
708 +#endif
709 +.end poly1305_blocks_internal
710 +___
711 +}
712 +{
713 +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
714 +
715 +$code.=<<___;
716 +.align 5
717 +.globl poly1305_emit
718 +.ent poly1305_emit
719 +poly1305_emit:
720 + .frame $sp,0,$ra
721 + .set reorder
722 +
723 + ld $tmp2,16($ctx)
724 + ld $tmp0,0($ctx)
725 + ld $tmp1,8($ctx)
726 +
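+	# Fully reduce h mod 2^130-5: fold the top bits back in, add 5 to
+	# probe for a carry out of bit 130, then select h or h + 5 - 2^130
+	# with a constant-time xor/and/xor mask instead of a branch.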
727 + li $in0,-4 # final reduction
728 + dsrl $in1,$tmp2,2
729 + and $in0,$tmp2
730 + andi $tmp2,$tmp2,3
731 + daddu $in0,$in1
732 +
733 + daddu $tmp0,$tmp0,$in0
734 + sltu $in1,$tmp0,$in0
735 + daddiu $in0,$tmp0,5 # compare to modulus
736 + daddu $tmp1,$tmp1,$in1
737 + sltiu $tmp3,$in0,5
738 + sltu $tmp4,$tmp1,$in1
739 + daddu $in1,$tmp1,$tmp3
740 + daddu $tmp2,$tmp2,$tmp4
741 + sltu $tmp3,$in1,$tmp3
742 + daddu $tmp2,$tmp2,$tmp3
743 +
744 + dsrl $tmp2,2 # see if it carried/borrowed
745 + dsubu $tmp2,$zero,$tmp2
746 +
747 + xor $in0,$tmp0
748 + xor $in1,$tmp1
749 + and $in0,$tmp2
750 + and $in1,$tmp2
751 + xor $in0,$tmp0
752 + xor $in1,$tmp1
753 +
754 + lwu $tmp0,0($nonce) # load nonce
755 + lwu $tmp1,4($nonce)
756 + lwu $tmp2,8($nonce)
757 + lwu $tmp3,12($nonce)
758 + dsll $tmp1,32
759 + dsll $tmp3,32
760 + or $tmp0,$tmp1
761 + or $tmp2,$tmp3
762 +
763 + daddu $in0,$tmp0 # accumulate nonce
764 + daddu $in1,$tmp2
765 + sltu $tmp0,$in0,$tmp0
766 + daddu $in1,$tmp0
767 +
768 + dsrl $tmp0,$in0,8 # write mac value
769 + dsrl $tmp1,$in0,16
770 + dsrl $tmp2,$in0,24
771 + sb $in0,0($mac)
772 + dsrl $tmp3,$in0,32
773 + sb $tmp0,1($mac)
774 + dsrl $tmp0,$in0,40
775 + sb $tmp1,2($mac)
776 + dsrl $tmp1,$in0,48
777 + sb $tmp2,3($mac)
778 + dsrl $tmp2,$in0,56
779 + sb $tmp3,4($mac)
780 + dsrl $tmp3,$in1,8
781 + sb $tmp0,5($mac)
782 + dsrl $tmp0,$in1,16
783 + sb $tmp1,6($mac)
784 + dsrl $tmp1,$in1,24
785 + sb $tmp2,7($mac)
786 +
787 + sb $in1,8($mac)
788 + dsrl $tmp2,$in1,32
789 + sb $tmp3,9($mac)
790 + dsrl $tmp3,$in1,40
791 + sb $tmp0,10($mac)
792 + dsrl $tmp0,$in1,48
793 + sb $tmp1,11($mac)
794 + dsrl $tmp1,$in1,56
795 + sb $tmp2,12($mac)
796 + sb $tmp3,13($mac)
797 + sb $tmp0,14($mac)
798 + sb $tmp1,15($mac)
799 +
800 + jr $ra
801 +.end poly1305_emit
802 +.rdata
803 +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
804 +.align 2
805 +___
806 +}
807 +}}} else {{{
808 +######################################################################
809 +# 32-bit code path
810 +#
811 +
812 +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
813 +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
814 + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
815 +
816 +$code.=<<___;
817 +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
818 + defined(_MIPS_ARCH_MIPS32R6)) \\
819 + && !defined(_MIPS_ARCH_MIPS32R2)
820 +# define _MIPS_ARCH_MIPS32R2
821 +#endif
822 +
823 +#if defined(_MIPS_ARCH_MIPS32R6)
824 +# define multu(rs,rt)
825 +# define mflo(rd,rs,rt) mulu rd,rs,rt
826 +# define mfhi(rd,rs,rt) muhu rd,rs,rt
827 +#else
828 +# define multu(rs,rt) multu rs,rt
829 +# define mflo(rd,rs,rt) mflo rd
830 +# define mfhi(rd,rs,rt) mfhi rd
831 +#endif
832 +
833 +#ifdef __KERNEL__
834 +# define poly1305_init poly1305_init_mips
835 +# define poly1305_blocks poly1305_blocks_mips
836 +# define poly1305_emit poly1305_emit_mips
837 +#endif
838 +
839 +#if defined(__MIPSEB__) && !defined(MIPSEB)
840 +# define MIPSEB
841 +#endif
842 +
843 +#ifdef MIPSEB
844 +# define MSB 0
845 +# define LSB 3
846 +#else
847 +# define MSB 3
848 +# define LSB 0
849 +#endif
850 +
851 +.text
852 +.set noat
853 +.set noreorder
854 +
855 +.align 5
856 +.globl poly1305_init
857 +.ent poly1305_init
858 +poly1305_init:
859 + .frame $sp,0,$ra
860 + .set reorder
861 +
862 + sw $zero,0($ctx)
863 + sw $zero,4($ctx)
864 + sw $zero,8($ctx)
865 + sw $zero,12($ctx)
866 + sw $zero,16($ctx)
867 +
868 + beqz $inp,.Lno_key
869 +
870 +#if defined(_MIPS_ARCH_MIPS32R6)
871 + andi $tmp0,$inp,3 # $inp % 4
872 + subu $inp,$inp,$tmp0 # align $inp
873 + sll $tmp0,$tmp0,3 # byte to bit offset
874 + lw $in0,0($inp)
875 + lw $in1,4($inp)
876 + lw $in2,8($inp)
877 + lw $in3,12($inp)
878 + beqz $tmp0,.Laligned_key
879 +
880 + lw $tmp2,16($inp)
881 + subu $tmp1,$zero,$tmp0
882 +# ifdef MIPSEB
883 + sllv $in0,$in0,$tmp0
884 + srlv $tmp3,$in1,$tmp1
885 + sllv $in1,$in1,$tmp0
886 + or $in0,$in0,$tmp3
887 + srlv $tmp3,$in2,$tmp1
888 + sllv $in2,$in2,$tmp0
889 + or $in1,$in1,$tmp3
890 + srlv $tmp3,$in3,$tmp1
891 + sllv $in3,$in3,$tmp0
892 + or $in2,$in2,$tmp3
893 + srlv $tmp2,$tmp2,$tmp1
894 + or $in3,$in3,$tmp2
895 +# else
896 + srlv $in0,$in0,$tmp0
897 + sllv $tmp3,$in1,$tmp1
898 + srlv $in1,$in1,$tmp0
899 + or $in0,$in0,$tmp3
900 + sllv $tmp3,$in2,$tmp1
901 + srlv $in2,$in2,$tmp0
902 + or $in1,$in1,$tmp3
903 + sllv $tmp3,$in3,$tmp1
904 + srlv $in3,$in3,$tmp0
905 + or $in2,$in2,$tmp3
906 + sllv $tmp2,$tmp2,$tmp1
907 + or $in3,$in3,$tmp2
908 +# endif
909 +.Laligned_key:
910 +#else
911 + lwl $in0,0+MSB($inp)
912 + lwl $in1,4+MSB($inp)
913 + lwl $in2,8+MSB($inp)
914 + lwl $in3,12+MSB($inp)
915 + lwr $in0,0+LSB($inp)
916 + lwr $in1,4+LSB($inp)
917 + lwr $in2,8+LSB($inp)
918 + lwr $in3,12+LSB($inp)
919 +#endif
920 +#ifdef MIPSEB
921 +# if defined(_MIPS_ARCH_MIPS32R2)
922 + wsbh $in0,$in0 # byte swap
923 + wsbh $in1,$in1
924 + wsbh $in2,$in2
925 + wsbh $in3,$in3
926 + rotr $in0,$in0,16
927 + rotr $in1,$in1,16
928 + rotr $in2,$in2,16
929 + rotr $in3,$in3,16
930 +# else
931 + srl $tmp0,$in0,24 # byte swap
932 + srl $tmp1,$in0,8
933 + andi $tmp2,$in0,0xFF00
934 + sll $in0,$in0,24
935 + andi $tmp1,0xFF00
936 + sll $tmp2,$tmp2,8
937 + or $in0,$tmp0
938 + srl $tmp0,$in1,24
939 + or $tmp1,$tmp2
940 + srl $tmp2,$in1,8
941 + or $in0,$tmp1
942 + andi $tmp1,$in1,0xFF00
943 + sll $in1,$in1,24
944 + andi $tmp2,0xFF00
945 + sll $tmp1,$tmp1,8
946 + or $in1,$tmp0
947 + srl $tmp0,$in2,24
948 + or $tmp2,$tmp1
949 + srl $tmp1,$in2,8
950 + or $in1,$tmp2
951 + andi $tmp2,$in2,0xFF00
952 + sll $in2,$in2,24
953 + andi $tmp1,0xFF00
954 + sll $tmp2,$tmp2,8
955 + or $in2,$tmp0
956 + srl $tmp0,$in3,24
957 + or $tmp1,$tmp2
958 + srl $tmp2,$in3,8
959 + or $in2,$tmp1
960 + andi $tmp1,$in3,0xFF00
961 + sll $in3,$in3,24
962 + andi $tmp2,0xFF00
963 + sll $tmp1,$tmp1,8
964 + or $in3,$tmp0
965 + or $tmp2,$tmp1
966 + or $in3,$tmp2
967 +# endif
968 +#endif
969 + lui $tmp0,0x0fff
970 + ori $tmp0,0xffff # 0x0fffffff
971 + and $in0,$in0,$tmp0
972 + subu $tmp0,3 # 0x0ffffffc
973 + and $in1,$in1,$tmp0
974 + and $in2,$in2,$tmp0
975 + and $in3,$in3,$tmp0
976 +
977 + sw $in0,20($ctx)
978 + sw $in1,24($ctx)
979 + sw $in2,28($ctx)
980 + sw $in3,32($ctx)
981 +
982 + srl $tmp1,$in1,2
983 + srl $tmp2,$in2,2
984 + srl $tmp3,$in3,2
985 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
986 + addu $in2,$in2,$tmp2
987 + addu $in3,$in3,$tmp3
988 + sw $in1,36($ctx)
989 + sw $in2,40($ctx)
990 + sw $in3,44($ctx)
991 +.Lno_key:
992 + li $v0,0
993 + jr $ra
994 +.end poly1305_init
995 +___
996 +{
997 +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
998 +
999 +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
1000 + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
1001 +my ($d0,$d1,$d2,$d3) =
1002 + ($a4,$a5,$a6,$a7);
1003 +my $shr = $t2; # used on R6
1004 +my $one = $t2; # used on R2
1005 +
1006 +$code.=<<___;
1007 +.globl poly1305_blocks
1008 +.align 5
1009 +.ent poly1305_blocks
1010 +poly1305_blocks:
1011 + .frame $sp,16*4,$ra
1012 + .mask $SAVED_REGS_MASK,-4
1013 + .set noreorder
1014 + subu $sp, $sp,4*12
1015 + sw $s11,4*11($sp)
1016 + sw $s10,4*10($sp)
1017 + sw $s9, 4*9($sp)
1018 + sw $s8, 4*8($sp)
1019 + sw $s7, 4*7($sp)
1020 + sw $s6, 4*6($sp)
1021 + sw $s5, 4*5($sp)
1022 + sw $s4, 4*4($sp)
1023 +___
1024 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1025 + sw $s3, 4*3($sp)
1026 + sw $s2, 4*2($sp)
1027 + sw $s1, 4*1($sp)
1028 + sw $s0, 4*0($sp)
1029 +___
1030 +$code.=<<___;
1031 + .set reorder
1032 +
1033 + srl $len,4 # number of complete blocks
1034 + li $one,1
1035 + beqz $len,.Labort
1036 +
1037 +#if defined(_MIPS_ARCH_MIPS32R6)
1038 + andi $shr,$inp,3
1039 + subu $inp,$inp,$shr # align $inp
1040 + sll $shr,$shr,3 # byte to bit offset
1041 +#endif
1042 +
1043 + lw $h0,0($ctx) # load hash value
1044 + lw $h1,4($ctx)
1045 + lw $h2,8($ctx)
1046 + lw $h3,12($ctx)
1047 + lw $h4,16($ctx)
1048 +
1049 + lw $r0,20($ctx) # load key
1050 + lw $r1,24($ctx)
1051 + lw $r2,28($ctx)
1052 + lw $r3,32($ctx)
1053 + lw $rs1,36($ctx)
1054 + lw $rs2,40($ctx)
1055 + lw $rs3,44($ctx)
1056 +
1057 + sll $len,4
1058 + addu $len,$len,$inp # end of buffer
1059 + b .Loop
1060 +
1061 +.align 4
1062 +.Loop:
1063 +#if defined(_MIPS_ARCH_MIPS32R6)
1064 + lw $d0,0($inp) # load input
1065 + lw $d1,4($inp)
1066 + lw $d2,8($inp)
1067 + lw $d3,12($inp)
1068 + beqz $shr,.Laligned_inp
1069 +
1070 + lw $t0,16($inp)
1071 + subu $t1,$zero,$shr
1072 +# ifdef MIPSEB
1073 + sllv $d0,$d0,$shr
1074 + srlv $at,$d1,$t1
1075 + sllv $d1,$d1,$shr
1076 + or $d0,$d0,$at
1077 + srlv $at,$d2,$t1
1078 + sllv $d2,$d2,$shr
1079 + or $d1,$d1,$at
1080 + srlv $at,$d3,$t1
1081 + sllv $d3,$d3,$shr
1082 + or $d2,$d2,$at
1083 + srlv $t0,$t0,$t1
1084 + or $d3,$d3,$t0
1085 +# else
1086 + srlv $d0,$d0,$shr
1087 + sllv $at,$d1,$t1
1088 + srlv $d1,$d1,$shr
1089 + or $d0,$d0,$at
1090 + sllv $at,$d2,$t1
1091 + srlv $d2,$d2,$shr
1092 + or $d1,$d1,$at
1093 + sllv $at,$d3,$t1
1094 + srlv $d3,$d3,$shr
1095 + or $d2,$d2,$at
1096 + sllv $t0,$t0,$t1
1097 + or $d3,$d3,$t0
1098 +# endif
1099 +.Laligned_inp:
1100 +#else
1101 + lwl $d0,0+MSB($inp) # load input
1102 + lwl $d1,4+MSB($inp)
1103 + lwl $d2,8+MSB($inp)
1104 + lwl $d3,12+MSB($inp)
1105 + lwr $d0,0+LSB($inp)
1106 + lwr $d1,4+LSB($inp)
1107 + lwr $d2,8+LSB($inp)
1108 + lwr $d3,12+LSB($inp)
1109 +#endif
1110 +#ifdef MIPSEB
1111 +# if defined(_MIPS_ARCH_MIPS32R2)
1112 + wsbh $d0,$d0 # byte swap
1113 + wsbh $d1,$d1
1114 + wsbh $d2,$d2
1115 + wsbh $d3,$d3
1116 + rotr $d0,$d0,16
1117 + rotr $d1,$d1,16
1118 + rotr $d2,$d2,16
1119 + rotr $d3,$d3,16
1120 +# else
1121 + srl $at,$d0,24 # byte swap
1122 + srl $t0,$d0,8
1123 + andi $t1,$d0,0xFF00
1124 + sll $d0,$d0,24
1125 + andi $t0,0xFF00
1126 + sll $t1,$t1,8
1127 + or $d0,$at
1128 + srl $at,$d1,24
1129 + or $t0,$t1
1130 + srl $t1,$d1,8
1131 + or $d0,$t0
1132 + andi $t0,$d1,0xFF00
1133 + sll $d1,$d1,24
1134 + andi $t1,0xFF00
1135 + sll $t0,$t0,8
1136 + or $d1,$at
1137 + srl $at,$d2,24
1138 + or $t1,$t0
1139 + srl $t0,$d2,8
1140 + or $d1,$t1
1141 + andi $t1,$d2,0xFF00
1142 + sll $d2,$d2,24
1143 + andi $t0,0xFF00
1144 + sll $t1,$t1,8
1145 + or $d2,$at
1146 + srl $at,$d3,24
1147 + or $t0,$t1
1148 + srl $t1,$d3,8
1149 + or $d2,$t0
1150 + andi $t0,$d3,0xFF00
1151 + sll $d3,$d3,24
1152 + andi $t1,0xFF00
1153 + sll $t0,$t0,8
1154 + or $d3,$at
1155 + or $t1,$t0
1156 + or $d3,$t1
1157 +# endif
1158 +#endif
1159 + srl $t0,$h4,2 # modulo-scheduled reduction
1160 + andi $h4,$h4,3
1161 + sll $at,$t0,2
1162 +
1163 + addu $d0,$d0,$h0 # accumulate input
1164 + addu $t0,$t0,$at
1165 + sltu $h0,$d0,$h0
1166 + addu $d0,$d0,$t0 # ... and residue
1167 + sltu $at,$d0,$t0
1168 +
1169 + addu $d1,$d1,$h1
1170 + addu $h0,$h0,$at # carry
1171 + sltu $h1,$d1,$h1
1172 + addu $d1,$d1,$h0
1173 + sltu $h0,$d1,$h0
1174 +
1175 + addu $d2,$d2,$h2
1176 + addu $h1,$h1,$h0 # carry
1177 + sltu $h2,$d2,$h2
1178 + addu $d2,$d2,$h1
1179 + sltu $h1,$d2,$h1
1180 +
1181 + addu $d3,$d3,$h3
1182 + addu $h2,$h2,$h1 # carry
1183 + sltu $h3,$d3,$h3
1184 + addu $d3,$d3,$h2
1185 +
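+	# On MIPS32R2 (pre-R6) the hi/lo accumulator and maddu let the four
+	# column products be summed with multiply-accumulate chains; the
+	# generic path in the #else branch uses discrete multu/mflo/mfhi
+	# with manual carry propagation.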
1186 +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
1187 + multu $r0,$d0 # d0*r0
1188 + sltu $h2,$d3,$h2
1189 + maddu $rs3,$d1 # d1*s3
1190 + addu $h3,$h3,$h2 # carry
1191 + maddu $rs2,$d2 # d2*s2
1192 + addu $h4,$h4,$padbit
1193 + maddu $rs1,$d3 # d3*s1
1194 + addu $h4,$h4,$h3
1195 + mfhi $at
1196 + mflo $h0
1197 +
1198 + multu $r1,$d0 # d0*r1
1199 + maddu $r0,$d1 # d1*r0
1200 + maddu $rs3,$d2 # d2*s3
1201 + maddu $rs2,$d3 # d3*s2
1202 + maddu $rs1,$h4 # h4*s1
1203 + maddu $at,$one # hi*1
1204 + mfhi $at
1205 + mflo $h1
1206 +
1207 + multu $r2,$d0 # d0*r2
1208 + maddu $r1,$d1 # d1*r1
1209 + maddu $r0,$d2 # d2*r0
1210 + maddu $rs3,$d3 # d3*s3
1211 + maddu $rs2,$h4 # h4*s2
1212 + maddu $at,$one # hi*1
1213 + mfhi $at
1214 + mflo $h2
1215 +
1216 + mul $t0,$r0,$h4 # h4*r0
1217 +
1218 + multu $r3,$d0 # d0*r3
1219 + maddu $r2,$d1 # d1*r2
1220 + maddu $r1,$d2 # d2*r1
1221 + maddu $r0,$d3 # d3*r0
1222 + maddu $rs3,$h4 # h4*s3
1223 + maddu $at,$one # hi*1
1224 + mfhi $at
1225 + mflo $h3
1226 +
1227 + addiu $inp,$inp,16
1228 +
1229 + addu $h4,$t0,$at
1230 +#else
1231 + multu ($r0,$d0) # d0*r0
1232 + mflo ($h0,$r0,$d0)
1233 + mfhi ($h1,$r0,$d0)
1234 +
1235 + sltu $h2,$d3,$h2
1236 + addu $h3,$h3,$h2 # carry
1237 +
1238 + multu ($rs3,$d1) # d1*s3
1239 + mflo ($at,$rs3,$d1)
1240 + mfhi ($t0,$rs3,$d1)
1241 +
1242 + addu $h4,$h4,$padbit
1243 + addiu $inp,$inp,16
1244 + addu $h4,$h4,$h3
1245 +
1246 + multu ($rs2,$d2) # d2*s2
1247 + mflo ($a3,$rs2,$d2)
1248 + mfhi ($t1,$rs2,$d2)
1249 + addu $h0,$h0,$at
1250 + addu $h1,$h1,$t0
1251 + multu ($rs1,$d3) # d3*s1
1252 + sltu $at,$h0,$at
1253 + addu $h1,$h1,$at
1254 +
1255 + mflo ($at,$rs1,$d3)
1256 + mfhi ($t0,$rs1,$d3)
1257 + addu $h0,$h0,$a3
1258 + addu $h1,$h1,$t1
1259 + multu ($r1,$d0) # d0*r1
1260 + sltu $a3,$h0,$a3
1261 + addu $h1,$h1,$a3
1262 +
1263 +
1264 + mflo ($a3,$r1,$d0)
1265 + mfhi ($h2,$r1,$d0)
1266 + addu $h0,$h0,$at
1267 + addu $h1,$h1,$t0
1268 + multu ($r0,$d1) # d1*r0
1269 + sltu $at,$h0,$at
1270 + addu $h1,$h1,$at
1271 +
1272 + mflo ($at,$r0,$d1)
1273 + mfhi ($t0,$r0,$d1)
1274 + addu $h1,$h1,$a3
1275 + sltu $a3,$h1,$a3
1276 + multu ($rs3,$d2) # d2*s3
1277 + addu $h2,$h2,$a3
1278 +
1279 + mflo ($a3,$rs3,$d2)
1280 + mfhi ($t1,$rs3,$d2)
1281 + addu $h1,$h1,$at
1282 + addu $h2,$h2,$t0
1283 + multu ($rs2,$d3) # d3*s2
1284 + sltu $at,$h1,$at
1285 + addu $h2,$h2,$at
1286 +
1287 + mflo ($at,$rs2,$d3)
1288 + mfhi ($t0,$rs2,$d3)
1289 + addu $h1,$h1,$a3
1290 + addu $h2,$h2,$t1
1291 + multu ($rs1,$h4) # h4*s1
1292 + sltu $a3,$h1,$a3
1293 + addu $h2,$h2,$a3
1294 +
1295 + mflo ($a3,$rs1,$h4)
1296 + addu $h1,$h1,$at
1297 + addu $h2,$h2,$t0
1298 + multu ($r2,$d0) # d0*r2
1299 + sltu $at,$h1,$at
1300 + addu $h2,$h2,$at
1301 +
1302 +
1303 + mflo ($at,$r2,$d0)
1304 + mfhi ($h3,$r2,$d0)
1305 + addu $h1,$h1,$a3
1306 + sltu $a3,$h1,$a3
1307 + multu ($r1,$d1) # d1*r1
1308 + addu $h2,$h2,$a3
1309 +
1310 + mflo ($a3,$r1,$d1)
1311 + mfhi ($t1,$r1,$d1)
1312 + addu $h2,$h2,$at
1313 + sltu $at,$h2,$at
1314 + multu ($r0,$d2) # d2*r0
1315 + addu $h3,$h3,$at
1316 +
1317 + mflo ($at,$r0,$d2)
1318 + mfhi ($t0,$r0,$d2)
1319 + addu $h2,$h2,$a3
1320 + addu $h3,$h3,$t1
1321 + multu ($rs3,$d3) # d3*s3
1322 + sltu $a3,$h2,$a3
1323 + addu $h3,$h3,$a3
1324 +
1325 + mflo ($a3,$rs3,$d3)
1326 + mfhi ($t1,$rs3,$d3)
1327 + addu $h2,$h2,$at
1328 + addu $h3,$h3,$t0
1329 + multu ($rs2,$h4) # h4*s2
1330 + sltu $at,$h2,$at
1331 + addu $h3,$h3,$at
1332 +
1333 + mflo ($at,$rs2,$h4)
1334 + addu $h2,$h2,$a3
1335 + addu $h3,$h3,$t1
1336 + multu ($r3,$d0) # d0*r3
1337 + sltu $a3,$h2,$a3
1338 + addu $h3,$h3,$a3
1339 +
1340 +
1341 + mflo ($a3,$r3,$d0)
1342 + mfhi ($t1,$r3,$d0)
1343 + addu $h2,$h2,$at
1344 + sltu $at,$h2,$at
1345 + multu ($r2,$d1) # d1*r2
1346 + addu $h3,$h3,$at
1347 +
1348 + mflo ($at,$r2,$d1)
1349 + mfhi ($t0,$r2,$d1)
1350 + addu $h3,$h3,$a3
1351 + sltu $a3,$h3,$a3
1352 + multu ($r0,$d3) # d3*r0
1353 + addu $t1,$t1,$a3
1354 +
1355 + mflo ($a3,$r0,$d3)
1356 + mfhi ($d3,$r0,$d3)
1357 + addu $h3,$h3,$at
1358 + addu $t1,$t1,$t0
1359 + multu ($r1,$d2) # d2*r1
1360 + sltu $at,$h3,$at
1361 + addu $t1,$t1,$at
1362 +
1363 + mflo ($at,$r1,$d2)
1364 + mfhi ($t0,$r1,$d2)
1365 + addu $h3,$h3,$a3
1366 + addu $t1,$t1,$d3
1367 + multu ($rs3,$h4) # h4*s3
1368 + sltu $a3,$h3,$a3
1369 + addu $t1,$t1,$a3
1370 +
1371 + mflo ($a3,$rs3,$h4)
1372 + addu $h3,$h3,$at
1373 + addu $t1,$t1,$t0
1374 + multu ($r0,$h4) # h4*r0
1375 + sltu $at,$h3,$at
1376 + addu $t1,$t1,$at
1377 +
1378 +
1379 + mflo ($h4,$r0,$h4)
1380 + addu $h3,$h3,$a3
1381 + sltu $a3,$h3,$a3
1382 + addu $t1,$t1,$a3
1383 + addu $h4,$h4,$t1
1384 +
1385 + li $padbit,1 # if we loop, padbit is 1
1386 +#endif
1387 + bne $inp,$len,.Loop
1388 +
1389 + sw $h0,0($ctx) # store hash value
1390 + sw $h1,4($ctx)
1391 + sw $h2,8($ctx)
1392 + sw $h3,12($ctx)
1393 + sw $h4,16($ctx)
1394 +
1395 + .set noreorder
1396 +.Labort:
1397 + lw $s11,4*11($sp)
1398 + lw $s10,4*10($sp)
1399 + lw $s9, 4*9($sp)
1400 + lw $s8, 4*8($sp)
1401 + lw $s7, 4*7($sp)
1402 + lw $s6, 4*6($sp)
1403 + lw $s5, 4*5($sp)
1404 + lw $s4, 4*4($sp)
1405 +___
1406 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
1407 + lw $s3, 4*3($sp)
1408 + lw $s2, 4*2($sp)
1409 + lw $s1, 4*1($sp)
1410 + lw $s0, 4*0($sp)
1411 +___
1412 +$code.=<<___;
1413 + jr $ra
1414 + addu $sp,$sp,4*12
1415 +.end poly1305_blocks
1416 +___
1417 +}
1418 +{
1419 +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1420 +
1421 +$code.=<<___;
1422 +.align 5
1423 +.globl poly1305_emit
1424 +.ent poly1305_emit
1425 +poly1305_emit:
1426 + .frame $sp,0,$ra
1427 + .set reorder
1428 +
1429 + lw $tmp4,16($ctx)
1430 + lw $tmp0,0($ctx)
1431 + lw $tmp1,4($ctx)
1432 + lw $tmp2,8($ctx)
1433 + lw $tmp3,12($ctx)
1434 +
1435 + li $in0,-4 # final reduction
1436 + srl $ctx,$tmp4,2
1437 + and $in0,$in0,$tmp4
1438 + andi $tmp4,$tmp4,3
1439 + addu $ctx,$ctx,$in0
1440 +
1441 + addu $tmp0,$tmp0,$ctx
1442 + sltu $ctx,$tmp0,$ctx
1443 + addiu $in0,$tmp0,5 # compare to modulus
1444 + addu $tmp1,$tmp1,$ctx
1445 + sltiu $in1,$in0,5
1446 + sltu $ctx,$tmp1,$ctx
1447 + addu $in1,$in1,$tmp1
1448 + addu $tmp2,$tmp2,$ctx
1449 + sltu $in2,$in1,$tmp1
1450 + sltu $ctx,$tmp2,$ctx
1451 + addu $in2,$in2,$tmp2
1452 + addu $tmp3,$tmp3,$ctx
1453 + sltu $in3,$in2,$tmp2
1454 + sltu $ctx,$tmp3,$ctx
1455 + addu $in3,$in3,$tmp3
1456 + addu $tmp4,$tmp4,$ctx
1457 + sltu $ctx,$in3,$tmp3
1458 + addu $ctx,$tmp4
1459 +
1460 + srl $ctx,2 # see if it carried/borrowed
1461 + subu $ctx,$zero,$ctx
1462 +
1463 + xor $in0,$tmp0
1464 + xor $in1,$tmp1
1465 + xor $in2,$tmp2
1466 + xor $in3,$tmp3
1467 + and $in0,$ctx
1468 + and $in1,$ctx
1469 + and $in2,$ctx
1470 + and $in3,$ctx
1471 + xor $in0,$tmp0
1472 + xor $in1,$tmp1
1473 + xor $in2,$tmp2
1474 + xor $in3,$tmp3
1475 +
1476 + lw $tmp0,0($nonce) # load nonce
1477 + lw $tmp1,4($nonce)
1478 + lw $tmp2,8($nonce)
1479 + lw $tmp3,12($nonce)
1480 +
1481 + addu $in0,$tmp0 # accumulate nonce
1482 + sltu $ctx,$in0,$tmp0
1483 +
1484 + addu $in1,$tmp1
1485 + sltu $tmp1,$in1,$tmp1
1486 + addu $in1,$ctx
1487 + sltu $ctx,$in1,$ctx
1488 + addu $ctx,$tmp1
1489 +
1490 + addu $in2,$tmp2
1491 + sltu $tmp2,$in2,$tmp2
1492 + addu $in2,$ctx
1493 + sltu $ctx,$in2,$ctx
1494 + addu $ctx,$tmp2
1495 +
1496 + addu $in3,$tmp3
1497 + addu $in3,$ctx
1498 +
1499 + srl $tmp0,$in0,8 # write mac value
1500 + srl $tmp1,$in0,16
1501 + srl $tmp2,$in0,24
1502 + sb $in0, 0($mac)
1503 + sb $tmp0,1($mac)
1504 + srl $tmp0,$in1,8
1505 + sb $tmp1,2($mac)
1506 + srl $tmp1,$in1,16
1507 + sb $tmp2,3($mac)
1508 + srl $tmp2,$in1,24
1509 + sb $in1, 4($mac)
1510 + sb $tmp0,5($mac)
1511 + srl $tmp0,$in2,8
1512 + sb $tmp1,6($mac)
1513 + srl $tmp1,$in2,16
1514 + sb $tmp2,7($mac)
1515 + srl $tmp2,$in2,24
1516 + sb $in2, 8($mac)
1517 + sb $tmp0,9($mac)
1518 + srl $tmp0,$in3,8
1519 + sb $tmp1,10($mac)
1520 + srl $tmp1,$in3,16
1521 + sb $tmp2,11($mac)
1522 + srl $tmp2,$in3,24
1523 + sb $in3, 12($mac)
1524 + sb $tmp0,13($mac)
1525 + sb $tmp1,14($mac)
1526 + sb $tmp2,15($mac)
1527 +
1528 + jr $ra
1529 +.end poly1305_emit
1530 +.rdata
1531 +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1532 +.align 2
1533 +___
1534 +}
1535 +}}}
1536 +
1537 +$output=pop and open STDOUT,">$output";
1538 +print $code;
1539 +close STDOUT;
1540 --- a/crypto/Kconfig
1541 +++ b/crypto/Kconfig
1542 @@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
1543 in IETF protocols. This is the x86_64 assembler implementation using SIMD
1544 instructions.
1545
1546 +config CRYPTO_POLY1305_MIPS
1547 + tristate "Poly1305 authenticator algorithm (MIPS optimized)"
1548 + depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
1549 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
1550 +
1551 config CRYPTO_MD4
1552 tristate "MD4 digest algorithm"
1553 select CRYPTO_HASH
1554 --- a/lib/crypto/Kconfig
1555 +++ b/lib/crypto/Kconfig
1556 @@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
1557
1558 config CRYPTO_LIB_POLY1305_RSIZE
1559 int
1560 + default 2 if MIPS
1561 default 4 if X86_64
1562 default 9 if ARM || ARM64
1563 default 1