kernel: 5.4: import wireguard backport
target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:26 +0100
4 Subject: [PATCH] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS
5 optimized implementation
6 MIME-Version: 1.0
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
9
10 commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
11
12 This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
13 MIPS authored by Andy Polyakov, a prior 64-bit-only version of which he had
14 contributed to the OpenSSL project. The file 'poly1305-mips.pl' is taken
15 verbatim from the upstream GitHub repository [0] at commit
16 d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
17 required to build it as part of a Linux kernel module.
18
19 [0] https://github.com/dot-asm/cryptogams
20
21 Co-developed-by: Andy Polyakov <appro@cryptogams.org>
22 Signed-off-by: Andy Polyakov <appro@cryptogams.org>
23 Co-developed-by: René van Dorst <opensource@vdorst.com>
24 Signed-off-by: René van Dorst <opensource@vdorst.com>
25 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
26 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
27 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
28 ---
29 arch/mips/crypto/Makefile | 14 +
30 arch/mips/crypto/poly1305-glue.c | 203 +++++
31 arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
32 crypto/Kconfig | 5 +
33 lib/crypto/Kconfig | 1 +
34 5 files changed, 1496 insertions(+)
35 create mode 100644 arch/mips/crypto/poly1305-glue.c
36 create mode 100644 arch/mips/crypto/poly1305-mips.pl
37
38 --- a/arch/mips/crypto/Makefile
39 +++ b/arch/mips/crypto/Makefile
40 @@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
41 obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
42 chacha-mips-y := chacha-core.o chacha-glue.o
43 AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
44 +
45 +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
46 +poly1305-mips-y := poly1305-core.o poly1305-glue.o
47 +
48 +perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
49 +perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
50 +
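+# Generate poly1305-core.S from the CRYPTOGAMS perlasm source at build time;
+# the flavour argument selects the o32 or 64-bit ABI code path.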
51 +quiet_cmd_perlasm = PERLASM $@
52 + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
53 +
54 +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
55 + $(call if_changed,perlasm)
56 +
57 +targets += poly1305-core.S
58 --- /dev/null
59 +++ b/arch/mips/crypto/poly1305-glue.c
60 @@ -0,0 +1,203 @@
61 +// SPDX-License-Identifier: GPL-2.0
62 +/*
63 + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
64 + *
65 + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
66 + */
67 +
68 +#include <asm/unaligned.h>
69 +#include <crypto/algapi.h>
70 +#include <crypto/internal/hash.h>
71 +#include <crypto/internal/poly1305.h>
72 +#include <linux/cpufeature.h>
73 +#include <linux/crypto.h>
74 +#include <linux/module.h>
75 +
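+/*
+ * Entry points into the CRYPTOGAMS-generated assembly. 'state' points at
+ * the 130-bit accumulator; poly1305_init_mips() also stores the clamped
+ * r key behind it. 'hibit' is the 2^128 pad bit: 1 for full blocks,
+ * 0 for the zero-padded final block.
+ */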
76 +asmlinkage void poly1305_init_mips(void *state, const u8 *key);
77 +asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
78 +asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
79 +
80 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
81 +{
82 + poly1305_init_mips(&dctx->h, key);
83 + dctx->s[0] = get_unaligned_le32(key + 16);
84 + dctx->s[1] = get_unaligned_le32(key + 20);
85 + dctx->s[2] = get_unaligned_le32(key + 24);
86 + dctx->s[3] = get_unaligned_le32(key + 28);
87 + dctx->buflen = 0;
88 +}
89 +EXPORT_SYMBOL(poly1305_init_arch);
90 +
91 +static int mips_poly1305_init(struct shash_desc *desc)
92 +{
93 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
94 +
95 + dctx->buflen = 0;
96 + dctx->rset = 0;
97 + dctx->sset = false;
98 +
99 + return 0;
100 +}
101 +
102 +static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
103 + u32 len, u32 hibit)
104 +{
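+	/*
+	 * For the shash interface the 32-byte one-time key arrives in-band
+	 * as the first two "blocks" of data: the clamped r half first,
+	 * then the s half.
+	 */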
105 + if (unlikely(!dctx->sset)) {
106 + if (!dctx->rset) {
107 + poly1305_init_mips(&dctx->h, src);
108 + src += POLY1305_BLOCK_SIZE;
109 + len -= POLY1305_BLOCK_SIZE;
110 + dctx->rset = 1;
111 + }
112 + if (len >= POLY1305_BLOCK_SIZE) {
113 + dctx->s[0] = get_unaligned_le32(src + 0);
114 + dctx->s[1] = get_unaligned_le32(src + 4);
115 + dctx->s[2] = get_unaligned_le32(src + 8);
116 + dctx->s[3] = get_unaligned_le32(src + 12);
117 + src += POLY1305_BLOCK_SIZE;
118 + len -= POLY1305_BLOCK_SIZE;
119 + dctx->sset = true;
120 + }
121 + if (len < POLY1305_BLOCK_SIZE)
122 + return;
123 + }
124 +
125 + len &= ~(POLY1305_BLOCK_SIZE - 1);
126 +
127 + poly1305_blocks_mips(&dctx->h, src, len, hibit);
128 +}
129 +
130 +static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
131 + unsigned int len)
132 +{
133 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
134 +
135 + if (unlikely(dctx->buflen)) {
136 + u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
137 +
138 + memcpy(dctx->buf + dctx->buflen, src, bytes);
139 + src += bytes;
140 + len -= bytes;
141 + dctx->buflen += bytes;
142 +
143 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
144 + mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
145 + dctx->buflen = 0;
146 + }
147 + }
148 +
149 + if (likely(len >= POLY1305_BLOCK_SIZE)) {
150 + mips_poly1305_blocks(dctx, src, len, 1);
151 + src += round_down(len, POLY1305_BLOCK_SIZE);
152 + len %= POLY1305_BLOCK_SIZE;
153 + }
154 +
155 + if (unlikely(len)) {
156 + dctx->buflen = len;
157 + memcpy(dctx->buf, src, len);
158 + }
159 + return 0;
160 +}
161 +
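+/*
+ * Library interface (poly1305_*_arch), used by e.g. the ChaCha20-Poly1305
+ * construction; here the key is set explicitly via poly1305_init_arch()
+ * instead of being streamed in as data like in the shash path above.
+ */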
162 +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
163 + unsigned int nbytes)
164 +{
165 + if (unlikely(dctx->buflen)) {
166 + u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
167 +
168 + memcpy(dctx->buf + dctx->buflen, src, bytes);
169 + src += bytes;
170 + nbytes -= bytes;
171 + dctx->buflen += bytes;
172 +
173 + if (dctx->buflen == POLY1305_BLOCK_SIZE) {
174 + poly1305_blocks_mips(&dctx->h, dctx->buf,
175 + POLY1305_BLOCK_SIZE, 1);
176 + dctx->buflen = 0;
177 + }
178 + }
179 +
180 + if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
181 + unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
182 +
183 + poly1305_blocks_mips(&dctx->h, src, len, 1);
184 + src += len;
185 + nbytes %= POLY1305_BLOCK_SIZE;
186 + }
187 +
188 + if (unlikely(nbytes)) {
189 + dctx->buflen = nbytes;
190 + memcpy(dctx->buf, src, nbytes);
191 + }
192 +}
193 +EXPORT_SYMBOL(poly1305_update_arch);
194 +
195 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
196 +{
197 + __le32 digest[4];
198 + u64 f = 0;
199 +
200 + if (unlikely(dctx->buflen)) {
201 + dctx->buf[dctx->buflen++] = 1;
202 + memset(dctx->buf + dctx->buflen, 0,
203 + POLY1305_BLOCK_SIZE - dctx->buflen);
204 + poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
205 + }
206 +
207 + poly1305_emit_mips(&dctx->h, digest, dctx->s);
208 +
209 + /* mac = (h + s) % (2^128) */
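+	/* f carries: each addition emits its low 32 bits and propagates
+	 * the high bits into the next limb. */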
210 + f = (f >> 32) + le32_to_cpu(digest[0]);
211 + put_unaligned_le32(f, dst);
212 + f = (f >> 32) + le32_to_cpu(digest[1]);
213 + put_unaligned_le32(f, dst + 4);
214 + f = (f >> 32) + le32_to_cpu(digest[2]);
215 + put_unaligned_le32(f, dst + 8);
216 + f = (f >> 32) + le32_to_cpu(digest[3]);
217 + put_unaligned_le32(f, dst + 12);
218 +
219 + *dctx = (struct poly1305_desc_ctx){};
220 +}
221 +EXPORT_SYMBOL(poly1305_final_arch);
222 +
223 +static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
224 +{
225 + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
226 +
227 + if (unlikely(!dctx->sset))
228 + return -ENOKEY;
229 +
230 + poly1305_final_arch(dctx, dst);
231 + return 0;
232 +}
233 +
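+/* cra_priority 200 outranks the generic C implementation (priority 100). */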
234 +static struct shash_alg mips_poly1305_alg = {
235 + .init = mips_poly1305_init,
236 + .update = mips_poly1305_update,
237 + .final = mips_poly1305_final,
238 + .digestsize = POLY1305_DIGEST_SIZE,
239 + .descsize = sizeof(struct poly1305_desc_ctx),
240 +
241 + .base.cra_name = "poly1305",
242 + .base.cra_driver_name = "poly1305-mips",
243 + .base.cra_priority = 200,
244 + .base.cra_blocksize = POLY1305_BLOCK_SIZE,
245 + .base.cra_module = THIS_MODULE,
246 +};
247 +
248 +static int __init mips_poly1305_mod_init(void)
249 +{
250 + return crypto_register_shash(&mips_poly1305_alg);
251 +}
252 +
253 +static void __exit mips_poly1305_mod_exit(void)
254 +{
255 + crypto_unregister_shash(&mips_poly1305_alg);
256 +}
257 +
258 +module_init(mips_poly1305_mod_init);
259 +module_exit(mips_poly1305_mod_exit);
260 +
261 +MODULE_LICENSE("GPL v2");
262 +MODULE_ALIAS_CRYPTO("poly1305");
263 +MODULE_ALIAS_CRYPTO("poly1305-mips");
264 --- /dev/null
265 +++ b/arch/mips/crypto/poly1305-mips.pl
266 @@ -0,0 +1,1273 @@
267 +#!/usr/bin/env perl
268 +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
269 +#
270 +# ====================================================================
271 +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
272 +# project.
273 +# ====================================================================
274 +
275 +# Poly1305 hash for MIPS.
276 +#
277 +# May 2016
278 +#
279 +# Numbers are cycles per processed byte with poly1305_blocks alone.
280 +#
281 +# IALU/gcc
282 +# R1x000 ~5.5/+130% (big-endian)
283 +# Octeon II 2.50/+70% (little-endian)
284 +#
285 +# March 2019
286 +#
287 +# Add 32-bit code path.
288 +#
289 +# October 2019
290 +#
291 +# Modulo-scheduled reduction makes it possible to omit the dependency chain
292 +# at the end of the inner loop, improving performance. Also optimize the
293 +# MIPS32R2 code path for the MIPS 1004K core, per René van Dorst's suggestions.
294 +#
295 +# IALU/gcc
296 +# R1x000 ~9.8/? (big-endian)
297 +# Octeon II 3.65/+140% (little-endian)
298 +# MT7621/1004K 4.75/? (little-endian)
299 +#
300 +######################################################################
301 +# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
302 +# widely used. Then there is a new contender: NUBI. It appears that if
303 +# one picks the latter, it's possible to arrange code in an ABI-neutral
304 +# manner. Therefore let's stick to the NUBI register layout:
305 +#
306 +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
307 +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
308 +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
309 +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
310 +#
311 +# The return value is placed in $a0. The following coding rules facilitate
312 +# interoperability:
313 +#
314 +# - never ever touch $tp, "thread pointer", former $gp [o32 can be
315 +# excluded from the rule, because it's specified volatile];
316 +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
317 +# old code];
318 +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
319 +#
320 +# For reference here is register layout for N32/64 MIPS ABIs:
321 +#
322 +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
323 +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
324 +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
325 +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
326 +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
327 +#
328 +# <appro@openssl.org>
329 +#
330 +######################################################################
331 +
332 +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
333 +
334 +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
335 +
336 +if ($flavour =~ /64|n32/i) {{{
337 +######################################################################
338 +# 64-bit code path
339 +#
340 +
341 +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
342 +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
343 +
344 +$code.=<<___;
345 +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
346 + defined(_MIPS_ARCH_MIPS64R6)) \\
347 + && !defined(_MIPS_ARCH_MIPS64R2)
348 +# define _MIPS_ARCH_MIPS64R2
349 +#endif
350 +
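+# MIPS64 R6 removed the hi/lo register pair, so the macros below map the
+# classic dmultu/mflo/mfhi sequence onto the three-operand dmulu/dmuhu
+# forms; on pre-R6 they expand to the traditional instructions.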
351 +#if defined(_MIPS_ARCH_MIPS64R6)
352 +# define dmultu(rs,rt)
353 +# define mflo(rd,rs,rt) dmulu rd,rs,rt
354 +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
355 +#else
356 +# define dmultu(rs,rt) dmultu rs,rt
357 +# define mflo(rd,rs,rt) mflo rd
358 +# define mfhi(rd,rs,rt) mfhi rd
359 +#endif
360 +
361 +#ifdef __KERNEL__
362 +# define poly1305_init poly1305_init_mips
363 +# define poly1305_blocks poly1305_blocks_mips
364 +# define poly1305_emit poly1305_emit_mips
365 +#endif
366 +
367 +#if defined(__MIPSEB__) && !defined(MIPSEB)
368 +# define MIPSEB
369 +#endif
370 +
371 +#ifdef MIPSEB
372 +# define MSB 0
373 +# define LSB 7
374 +#else
375 +# define MSB 7
376 +# define LSB 0
377 +#endif
378 +
379 +.text
380 +.set noat
381 +.set noreorder
382 +
383 +.align 5
384 +.globl poly1305_init
385 +.ent poly1305_init
386 +poly1305_init:
387 + .frame $sp,0,$ra
388 + .set reorder
389 +
390 + sd $zero,0($ctx)
391 + sd $zero,8($ctx)
392 + sd $zero,16($ctx)
393 +
394 + beqz $inp,.Lno_key
395 +
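+# R6 also dropped the ldl/ldr unaligned-load pairs, so the key is read
+# with aligned loads and re-assembled with variable shifts instead.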
396 +#if defined(_MIPS_ARCH_MIPS64R6)
397 + andi $tmp0,$inp,7 # $inp % 8
398 + dsubu $inp,$inp,$tmp0 # align $inp
399 + sll $tmp0,$tmp0,3 # byte to bit offset
400 + ld $in0,0($inp)
401 + ld $in1,8($inp)
402 + beqz $tmp0,.Laligned_key
403 + ld $tmp2,16($inp)
404 +
405 + subu $tmp1,$zero,$tmp0
406 +# ifdef MIPSEB
407 + dsllv $in0,$in0,$tmp0
408 + dsrlv $tmp3,$in1,$tmp1
409 + dsllv $in1,$in1,$tmp0
410 + dsrlv $tmp2,$tmp2,$tmp1
411 +# else
412 + dsrlv $in0,$in0,$tmp0
413 + dsllv $tmp3,$in1,$tmp1
414 + dsrlv $in1,$in1,$tmp0
415 + dsllv $tmp2,$tmp2,$tmp1
416 +# endif
417 + or $in0,$in0,$tmp3
418 + or $in1,$in1,$tmp2
419 +.Laligned_key:
420 +#else
421 + ldl $in0,0+MSB($inp)
422 + ldl $in1,8+MSB($inp)
423 + ldr $in0,0+LSB($inp)
424 + ldr $in1,8+LSB($inp)
425 +#endif
426 +#ifdef MIPSEB
427 +# if defined(_MIPS_ARCH_MIPS64R2)
428 + dsbh $in0,$in0 # byte swap
429 + dsbh $in1,$in1
430 + dshd $in0,$in0
431 + dshd $in1,$in1
432 +# else
433 + ori $tmp0,$zero,0xFF
434 + dsll $tmp2,$tmp0,32
435 + or $tmp0,$tmp2 # 0x000000FF000000FF
436 +
437 + and $tmp1,$in0,$tmp0 # byte swap
438 + and $tmp3,$in1,$tmp0
439 + dsrl $tmp2,$in0,24
440 + dsrl $tmp4,$in1,24
441 + dsll $tmp1,24
442 + dsll $tmp3,24
443 + and $tmp2,$tmp0
444 + and $tmp4,$tmp0
445 + dsll $tmp0,8 # 0x0000FF000000FF00
446 + or $tmp1,$tmp2
447 + or $tmp3,$tmp4
448 + and $tmp2,$in0,$tmp0
449 + and $tmp4,$in1,$tmp0
450 + dsrl $in0,8
451 + dsrl $in1,8
452 + dsll $tmp2,8
453 + dsll $tmp4,8
454 + and $in0,$tmp0
455 + and $in1,$tmp0
456 + or $tmp1,$tmp2
457 + or $tmp3,$tmp4
458 + or $in0,$tmp1
459 + or $in1,$tmp3
460 + dsrl $tmp1,$in0,32
461 + dsrl $tmp3,$in1,32
462 + dsll $in0,32
463 + dsll $in1,32
464 + or $in0,$tmp1
465 + or $in1,$tmp3
466 +# endif
467 +#endif
468 + li $tmp0,1
469 + dsll $tmp0,32 # 0x0000000100000000
470 + daddiu $tmp0,-63 # 0x00000000ffffffc1
471 + dsll $tmp0,28 # 0x0ffffffc10000000
472 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff
473 +
474 + and $in0,$tmp0
475 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
476 + and $in1,$tmp0
477 +
478 + sd $in0,24($ctx)
479 + dsrl $tmp0,$in1,2
480 + sd $in1,32($ctx)
481 + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
482 + sd $tmp0,40($ctx)
483 +
484 +.Lno_key:
485 + li $v0,0 # return 0
486 + jr $ra
487 +.end poly1305_init
488 +___
489 +{
490 +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
491 +
492 +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
493 + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
494 +my ($shr,$shl) = ($s6,$s7); # used on R6
495 +
496 +$code.=<<___;
497 +.align 5
498 +.globl poly1305_blocks
499 +.ent poly1305_blocks
500 +poly1305_blocks:
501 + .set noreorder
502 + dsrl $len,4 # number of complete blocks
503 + bnez $len,poly1305_blocks_internal
504 + nop
505 + jr $ra
506 + nop
507 +.end poly1305_blocks
508 +
509 +.align 5
510 +.ent poly1305_blocks_internal
511 +poly1305_blocks_internal:
512 + .set noreorder
513 +#if defined(_MIPS_ARCH_MIPS64R6)
514 + .frame $sp,8*8,$ra
515 + .mask $SAVED_REGS_MASK|0x000c0000,-8
516 + dsubu $sp,8*8
517 + sd $s7,56($sp)
518 + sd $s6,48($sp)
519 +#else
520 + .frame $sp,6*8,$ra
521 + .mask $SAVED_REGS_MASK,-8
522 + dsubu $sp,6*8
523 +#endif
524 + sd $s5,40($sp)
525 + sd $s4,32($sp)
526 +___
527 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
528 + sd $s3,24($sp)
529 + sd $s2,16($sp)
530 + sd $s1,8($sp)
531 + sd $s0,0($sp)
532 +___
533 +$code.=<<___;
534 + .set reorder
535 +
536 +#if defined(_MIPS_ARCH_MIPS64R6)
537 + andi $shr,$inp,7
538 + dsubu $inp,$inp,$shr # align $inp
539 + sll $shr,$shr,3 # byte to bit offset
540 + subu $shl,$zero,$shr
541 +#endif
542 +
543 + ld $h0,0($ctx) # load hash value
544 + ld $h1,8($ctx)
545 + ld $h2,16($ctx)
546 +
547 + ld $r0,24($ctx) # load key
548 + ld $r1,32($ctx)
549 + ld $rs1,40($ctx)
550 +
551 + dsll $len,4
552 + daddu $len,$inp # end of buffer
553 + b .Loop
554 +
555 +.align 4
556 +.Loop:
557 +#if defined(_MIPS_ARCH_MIPS64R6)
558 + ld $in0,0($inp) # load input
559 + ld $in1,8($inp)
560 + beqz $shr,.Laligned_inp
561 +
562 + ld $tmp2,16($inp)
563 +# ifdef MIPSEB
564 + dsllv $in0,$in0,$shr
565 + dsrlv $tmp3,$in1,$shl
566 + dsllv $in1,$in1,$shr
567 + dsrlv $tmp2,$tmp2,$shl
568 +# else
569 + dsrlv $in0,$in0,$shr
570 + dsllv $tmp3,$in1,$shl
571 + dsrlv $in1,$in1,$shr
572 + dsllv $tmp2,$tmp2,$shl
573 +# endif
574 + or $in0,$in0,$tmp3
575 + or $in1,$in1,$tmp2
576 +.Laligned_inp:
577 +#else
578 + ldl $in0,0+MSB($inp) # load input
579 + ldl $in1,8+MSB($inp)
580 + ldr $in0,0+LSB($inp)
581 + ldr $in1,8+LSB($inp)
582 +#endif
583 + daddiu $inp,16
584 +#ifdef MIPSEB
585 +# if defined(_MIPS_ARCH_MIPS64R2)
586 + dsbh $in0,$in0 # byte swap
587 + dsbh $in1,$in1
588 + dshd $in0,$in0
589 + dshd $in1,$in1
590 +# else
591 + ori $tmp0,$zero,0xFF
592 + dsll $tmp2,$tmp0,32
593 + or $tmp0,$tmp2 # 0x000000FF000000FF
594 +
595 + and $tmp1,$in0,$tmp0 # byte swap
596 + and $tmp3,$in1,$tmp0
597 + dsrl $tmp2,$in0,24
598 + dsrl $tmp4,$in1,24
599 + dsll $tmp1,24
600 + dsll $tmp3,24
601 + and $tmp2,$tmp0
602 + and $tmp4,$tmp0
603 + dsll $tmp0,8 # 0x0000FF000000FF00
604 + or $tmp1,$tmp2
605 + or $tmp3,$tmp4
606 + and $tmp2,$in0,$tmp0
607 + and $tmp4,$in1,$tmp0
608 + dsrl $in0,8
609 + dsrl $in1,8
610 + dsll $tmp2,8
611 + dsll $tmp4,8
612 + and $in0,$tmp0
613 + and $in1,$tmp0
614 + or $tmp1,$tmp2
615 + or $tmp3,$tmp4
616 + or $in0,$tmp1
617 + or $in1,$tmp3
618 + dsrl $tmp1,$in0,32
619 + dsrl $tmp3,$in1,32
620 + dsll $in0,32
621 + dsll $in1,32
622 + or $in0,$tmp1
623 + or $in1,$tmp3
624 +# endif
625 +#endif
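+	# 2^130 == 5 mod (2^130 - 5): the bits of $h2 above bit 1 are folded
+	# back into the low limbs multiplied by 5, computed below as
+	# (h2>>2) + ((h2>>2)<<2).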
626 + dsrl $tmp1,$h2,2 # modulo-scheduled reduction
627 + andi $h2,$h2,3
628 + dsll $tmp0,$tmp1,2
629 +
630 + daddu $d0,$h0,$in0 # accumulate input
631 + daddu $tmp1,$tmp0
632 + sltu $tmp0,$d0,$h0
633 + daddu $d0,$d0,$tmp1 # ... and residue
634 + sltu $tmp1,$d0,$tmp1
635 + daddu $d1,$h1,$in1
636 + daddu $tmp0,$tmp1
637 + sltu $tmp1,$d1,$h1
638 + daddu $d1,$tmp0
639 +
640 + dmultu ($r0,$d0) # h0*r0
641 + daddu $d2,$h2,$padbit
642 + sltu $tmp0,$d1,$tmp0
643 + mflo ($h0,$r0,$d0)
644 + mfhi ($h1,$r0,$d0)
645 +
646 + dmultu ($rs1,$d1) # h1*5*r1
647 + daddu $d2,$tmp1
648 + daddu $d2,$tmp0
649 + mflo ($tmp0,$rs1,$d1)
650 + mfhi ($tmp1,$rs1,$d1)
651 +
652 + dmultu ($r1,$d0) # h0*r1
653 + mflo ($tmp2,$r1,$d0)
654 + mfhi ($h2,$r1,$d0)
655 + daddu $h0,$tmp0
656 + daddu $h1,$tmp1
657 + sltu $tmp0,$h0,$tmp0
658 +
659 + dmultu ($r0,$d1) # h1*r0
660 + daddu $h1,$tmp0
661 + daddu $h1,$tmp2
662 + mflo ($tmp0,$r0,$d1)
663 + mfhi ($tmp1,$r0,$d1)
664 +
665 + dmultu ($rs1,$d2) # h2*5*r1
666 + sltu $tmp2,$h1,$tmp2
667 + daddu $h2,$tmp2
668 + mflo ($tmp2,$rs1,$d2)
669 +
670 + dmultu ($r0,$d2) # h2*r0
671 + daddu $h1,$tmp0
672 + daddu $h2,$tmp1
673 + mflo ($tmp3,$r0,$d2)
674 + sltu $tmp0,$h1,$tmp0
675 + daddu $h2,$tmp0
676 +
677 + daddu $h1,$tmp2
678 + sltu $tmp2,$h1,$tmp2
679 + daddu $h2,$tmp2
680 + daddu $h2,$tmp3
681 +
682 + bne $inp,$len,.Loop
683 +
684 + sd $h0,0($ctx) # store hash value
685 + sd $h1,8($ctx)
686 + sd $h2,16($ctx)
687 +
688 + .set noreorder
689 +#if defined(_MIPS_ARCH_MIPS64R6)
690 + ld $s7,56($sp)
691 + ld $s6,48($sp)
692 +#endif
693 + ld $s5,40($sp) # epilogue
694 + ld $s4,32($sp)
695 +___
696 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
697 + ld $s3,24($sp)
698 + ld $s2,16($sp)
699 + ld $s1,8($sp)
700 + ld $s0,0($sp)
701 +___
702 +$code.=<<___;
703 + jr $ra
704 +#if defined(_MIPS_ARCH_MIPS64R6)
705 + daddu $sp,8*8
706 +#else
707 + daddu $sp,6*8
708 +#endif
709 +.end poly1305_blocks_internal
710 +___
711 +}
712 +{
713 +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
714 +
715 +$code.=<<___;
716 +.align 5
717 +.globl poly1305_emit
718 +.ent poly1305_emit
719 +poly1305_emit:
720 + .frame $sp,0,$ra
721 + .set reorder
722 +
723 + ld $tmp2,16($ctx)
724 + ld $tmp0,0($ctx)
725 + ld $tmp1,8($ctx)
726 +
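+	# Fully reduce h mod 2^130-5: fold the top bits back in, add 5 to
+	# probe for a carry out of bit 130, then select h or h + 5 - 2^130
+	# with a constant-time xor/and/xor mask instead of a branch.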
727 + li $in0,-4 # final reduction
728 + dsrl $in1,$tmp2,2
729 + and $in0,$tmp2
730 + andi $tmp2,$tmp2,3
731 + daddu $in0,$in1
732 +
733 + daddu $tmp0,$tmp0,$in0
734 + sltu $in1,$tmp0,$in0
735 + daddiu $in0,$tmp0,5 # compare to modulus
736 + daddu $tmp1,$tmp1,$in1
737 + sltiu $tmp3,$in0,5
738 + sltu $tmp4,$tmp1,$in1
739 + daddu $in1,$tmp1,$tmp3
740 + daddu $tmp2,$tmp2,$tmp4
741 + sltu $tmp3,$in1,$tmp3
742 + daddu $tmp2,$tmp2,$tmp3
743 +
744 + dsrl $tmp2,2 # see if it carried/borrowed
745 + dsubu $tmp2,$zero,$tmp2
746 +
747 + xor $in0,$tmp0
748 + xor $in1,$tmp1
749 + and $in0,$tmp2
750 + and $in1,$tmp2
751 + xor $in0,$tmp0
752 + xor $in1,$tmp1
753 +
754 + lwu $tmp0,0($nonce) # load nonce
755 + lwu $tmp1,4($nonce)
756 + lwu $tmp2,8($nonce)
757 + lwu $tmp3,12($nonce)
758 + dsll $tmp1,32
759 + dsll $tmp3,32
760 + or $tmp0,$tmp1
761 + or $tmp2,$tmp3
762 +
763 + daddu $in0,$tmp0 # accumulate nonce
764 + daddu $in1,$tmp2
765 + sltu $tmp0,$in0,$tmp0
766 + daddu $in1,$tmp0
767 +
768 + dsrl $tmp0,$in0,8 # write mac value
769 + dsrl $tmp1,$in0,16
770 + dsrl $tmp2,$in0,24
771 + sb $in0,0($mac)
772 + dsrl $tmp3,$in0,32
773 + sb $tmp0,1($mac)
774 + dsrl $tmp0,$in0,40
775 + sb $tmp1,2($mac)
776 + dsrl $tmp1,$in0,48
777 + sb $tmp2,3($mac)
778 + dsrl $tmp2,$in0,56
779 + sb $tmp3,4($mac)
780 + dsrl $tmp3,$in1,8
781 + sb $tmp0,5($mac)
782 + dsrl $tmp0,$in1,16
783 + sb $tmp1,6($mac)
784 + dsrl $tmp1,$in1,24
785 + sb $tmp2,7($mac)
786 +
787 + sb $in1,8($mac)
788 + dsrl $tmp2,$in1,32
789 + sb $tmp3,9($mac)
790 + dsrl $tmp3,$in1,40
791 + sb $tmp0,10($mac)
792 + dsrl $tmp0,$in1,48
793 + sb $tmp1,11($mac)
794 + dsrl $tmp1,$in1,56
795 + sb $tmp2,12($mac)
796 + sb $tmp3,13($mac)
797 + sb $tmp0,14($mac)
798 + sb $tmp1,15($mac)
799 +
800 + jr $ra
801 +.end poly1305_emit
802 +.rdata
803 +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
804 +.align 2
805 +___
806 +}
807 +}}} else {{{
808 +######################################################################
809 +# 32-bit code path
810 +#
811 +
812 +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
813 +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
814 + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
815 +
816 +$code.=<<___;
817 +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
818 + defined(_MIPS_ARCH_MIPS32R6)) \\
819 + && !defined(_MIPS_ARCH_MIPS32R2)
820 +# define _MIPS_ARCH_MIPS32R2
821 +#endif
822 +
823 +#if defined(_MIPS_ARCH_MIPS32R6)
824 +# define multu(rs,rt)
825 +# define mflo(rd,rs,rt) mulu rd,rs,rt
826 +# define mfhi(rd,rs,rt) muhu rd,rs,rt
827 +#else
828 +# define multu(rs,rt) multu rs,rt
829 +# define mflo(rd,rs,rt) mflo rd
830 +# define mfhi(rd,rs,rt) mfhi rd
831 +#endif
832 +
833 +#ifdef __KERNEL__
834 +# define poly1305_init poly1305_init_mips
835 +# define poly1305_blocks poly1305_blocks_mips
836 +# define poly1305_emit poly1305_emit_mips
837 +#endif
838 +
839 +#if defined(__MIPSEB__) && !defined(MIPSEB)
840 +# define MIPSEB
841 +#endif
842 +
843 +#ifdef MIPSEB
844 +# define MSB 0
845 +# define LSB 3
846 +#else
847 +# define MSB 3
848 +# define LSB 0
849 +#endif
850 +
851 +.text
852 +.set noat
853 +.set noreorder
854 +
855 +.align 5
856 +.globl poly1305_init
857 +.ent poly1305_init
858 +poly1305_init:
859 + .frame $sp,0,$ra
860 + .set reorder
861 +
862 + sw $zero,0($ctx)
863 + sw $zero,4($ctx)
864 + sw $zero,8($ctx)
865 + sw $zero,12($ctx)
866 + sw $zero,16($ctx)
867 +
868 + beqz $inp,.Lno_key
869 +
870 +#if defined(_MIPS_ARCH_MIPS32R6)
871 + andi $tmp0,$inp,3 # $inp % 4
872 + subu $inp,$inp,$tmp0 # align $inp
873 + sll $tmp0,$tmp0,3 # byte to bit offset
874 + lw $in0,0($inp)
875 + lw $in1,4($inp)
876 + lw $in2,8($inp)
877 + lw $in3,12($inp)
878 + beqz $tmp0,.Laligned_key
879 +
880 + lw $tmp2,16($inp)
881 + subu $tmp1,$zero,$tmp0
882 +# ifdef MIPSEB
883 + sllv $in0,$in0,$tmp0
884 + srlv $tmp3,$in1,$tmp1
885 + sllv $in1,$in1,$tmp0
886 + or $in0,$in0,$tmp3
887 + srlv $tmp3,$in2,$tmp1
888 + sllv $in2,$in2,$tmp0
889 + or $in1,$in1,$tmp3
890 + srlv $tmp3,$in3,$tmp1
891 + sllv $in3,$in3,$tmp0
892 + or $in2,$in2,$tmp3
893 + srlv $tmp2,$tmp2,$tmp1
894 + or $in3,$in3,$tmp2
895 +# else
896 + srlv $in0,$in0,$tmp0
897 + sllv $tmp3,$in1,$tmp1
898 + srlv $in1,$in1,$tmp0
899 + or $in0,$in0,$tmp3
900 + sllv $tmp3,$in2,$tmp1
901 + srlv $in2,$in2,$tmp0
902 + or $in1,$in1,$tmp3
903 + sllv $tmp3,$in3,$tmp1
904 + srlv $in3,$in3,$tmp0
905 + or $in2,$in2,$tmp3
906 + sllv $tmp2,$tmp2,$tmp1
907 + or $in3,$in3,$tmp2
908 +# endif
909 +.Laligned_key:
910 +#else
911 + lwl $in0,0+MSB($inp)
912 + lwl $in1,4+MSB($inp)
913 + lwl $in2,8+MSB($inp)
914 + lwl $in3,12+MSB($inp)
915 + lwr $in0,0+LSB($inp)
916 + lwr $in1,4+LSB($inp)
917 + lwr $in2,8+LSB($inp)
918 + lwr $in3,12+LSB($inp)
919 +#endif
920 +#ifdef MIPSEB
921 +# if defined(_MIPS_ARCH_MIPS32R2)
922 + wsbh $in0,$in0 # byte swap
923 + wsbh $in1,$in1
924 + wsbh $in2,$in2
925 + wsbh $in3,$in3
926 + rotr $in0,$in0,16
927 + rotr $in1,$in1,16
928 + rotr $in2,$in2,16
929 + rotr $in3,$in3,16
930 +# else
931 + srl $tmp0,$in0,24 # byte swap
932 + srl $tmp1,$in0,8
933 + andi $tmp2,$in0,0xFF00
934 + sll $in0,$in0,24
935 + andi $tmp1,0xFF00
936 + sll $tmp2,$tmp2,8
937 + or $in0,$tmp0
938 + srl $tmp0,$in1,24
939 + or $tmp1,$tmp2
940 + srl $tmp2,$in1,8
941 + or $in0,$tmp1
942 + andi $tmp1,$in1,0xFF00
943 + sll $in1,$in1,24
944 + andi $tmp2,0xFF00
945 + sll $tmp1,$tmp1,8
946 + or $in1,$tmp0
947 + srl $tmp0,$in2,24
948 + or $tmp2,$tmp1
949 + srl $tmp1,$in2,8
950 + or $in1,$tmp2
951 + andi $tmp2,$in2,0xFF00
952 + sll $in2,$in2,24
953 + andi $tmp1,0xFF00
954 + sll $tmp2,$tmp2,8
955 + or $in2,$tmp0
956 + srl $tmp0,$in3,24
957 + or $tmp1,$tmp2
958 + srl $tmp2,$in3,8
959 + or $in2,$tmp1
960 + andi $tmp1,$in3,0xFF00
961 + sll $in3,$in3,24
962 + andi $tmp2,0xFF00
963 + sll $tmp1,$tmp1,8
964 + or $in3,$tmp0
965 + or $tmp2,$tmp1
966 + or $in3,$tmp2
967 +# endif
968 +#endif
969 + lui $tmp0,0x0fff
970 + ori $tmp0,0xffff # 0x0fffffff
971 + and $in0,$in0,$tmp0
972 + subu $tmp0,3 # 0x0ffffffc
973 + and $in1,$in1,$tmp0
974 + and $in2,$in2,$tmp0
975 + and $in3,$in3,$tmp0
976 +
977 + sw $in0,20($ctx)
978 + sw $in1,24($ctx)
979 + sw $in2,28($ctx)
980 + sw $in3,32($ctx)
981 +
982 + srl $tmp1,$in1,2
983 + srl $tmp2,$in2,2
984 + srl $tmp3,$in3,2
985 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
986 + addu $in2,$in2,$tmp2
987 + addu $in3,$in3,$tmp3
988 + sw $in1,36($ctx)
989 + sw $in2,40($ctx)
990 + sw $in3,44($ctx)
991 +.Lno_key:
992 + li $v0,0
993 + jr $ra
994 +.end poly1305_init
995 +___
996 +{
997 +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
998 +
999 +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
1000 + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
1001 +my ($d0,$d1,$d2,$d3) =
1002 + ($a4,$a5,$a6,$a7);
1003 +my $shr = $t2; # used on R6
1004 +my $one = $t2; # used on R2
1005 +
1006 +$code.=<<___;
1007 +.globl poly1305_blocks
1008 +.align 5
1009 +.ent poly1305_blocks
1010 +poly1305_blocks:
1011 + .frame $sp,16*4,$ra
1012 + .mask $SAVED_REGS_MASK,-4
1013 + .set noreorder
1014 + subu $sp, $sp,4*12
1015 + sw $s11,4*11($sp)
1016 + sw $s10,4*10($sp)
1017 + sw $s9, 4*9($sp)
1018 + sw $s8, 4*8($sp)
1019 + sw $s7, 4*7($sp)
1020 + sw $s6, 4*6($sp)
1021 + sw $s5, 4*5($sp)
1022 + sw $s4, 4*4($sp)
1023 +___
1024 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1025 + sw $s3, 4*3($sp)
1026 + sw $s2, 4*2($sp)
1027 + sw $s1, 4*1($sp)
1028 + sw $s0, 4*0($sp)
1029 +___
1030 +$code.=<<___;
1031 + .set reorder
1032 +
1033 + srl $len,4 # number of complete blocks
1034 + li $one,1
1035 + beqz $len,.Labort
1036 +
1037 +#if defined(_MIPS_ARCH_MIPS32R6)
1038 + andi $shr,$inp,3
1039 + subu $inp,$inp,$shr # align $inp
1040 + sll $shr,$shr,3 # byte to bit offset
1041 +#endif
1042 +
1043 + lw $h0,0($ctx) # load hash value
1044 + lw $h1,4($ctx)
1045 + lw $h2,8($ctx)
1046 + lw $h3,12($ctx)
1047 + lw $h4,16($ctx)
1048 +
1049 + lw $r0,20($ctx) # load key
1050 + lw $r1,24($ctx)
1051 + lw $r2,28($ctx)
1052 + lw $r3,32($ctx)
1053 + lw $rs1,36($ctx)
1054 + lw $rs2,40($ctx)
1055 + lw $rs3,44($ctx)
1056 +
1057 + sll $len,4
1058 + addu $len,$len,$inp # end of buffer
1059 + b .Loop
1060 +
1061 +.align 4
1062 +.Loop:
1063 +#if defined(_MIPS_ARCH_MIPS32R6)
1064 + lw $d0,0($inp) # load input
1065 + lw $d1,4($inp)
1066 + lw $d2,8($inp)
1067 + lw $d3,12($inp)
1068 + beqz $shr,.Laligned_inp
1069 +
1070 + lw $t0,16($inp)
1071 + subu $t1,$zero,$shr
1072 +# ifdef MIPSEB
1073 + sllv $d0,$d0,$shr
1074 + srlv $at,$d1,$t1
1075 + sllv $d1,$d1,$shr
1076 + or $d0,$d0,$at
1077 + srlv $at,$d2,$t1
1078 + sllv $d2,$d2,$shr
1079 + or $d1,$d1,$at
1080 + srlv $at,$d3,$t1
1081 + sllv $d3,$d3,$shr
1082 + or $d2,$d2,$at
1083 + srlv $t0,$t0,$t1
1084 + or $d3,$d3,$t0
1085 +# else
1086 + srlv $d0,$d0,$shr
1087 + sllv $at,$d1,$t1
1088 + srlv $d1,$d1,$shr
1089 + or $d0,$d0,$at
1090 + sllv $at,$d2,$t1
1091 + srlv $d2,$d2,$shr
1092 + or $d1,$d1,$at
1093 + sllv $at,$d3,$t1
1094 + srlv $d3,$d3,$shr
1095 + or $d2,$d2,$at
1096 + sllv $t0,$t0,$t1
1097 + or $d3,$d3,$t0
1098 +# endif
1099 +.Laligned_inp:
1100 +#else
1101 + lwl $d0,0+MSB($inp) # load input
1102 + lwl $d1,4+MSB($inp)
1103 + lwl $d2,8+MSB($inp)
1104 + lwl $d3,12+MSB($inp)
1105 + lwr $d0,0+LSB($inp)
1106 + lwr $d1,4+LSB($inp)
1107 + lwr $d2,8+LSB($inp)
1108 + lwr $d3,12+LSB($inp)
1109 +#endif
1110 +#ifdef MIPSEB
1111 +# if defined(_MIPS_ARCH_MIPS32R2)
1112 + wsbh $d0,$d0 # byte swap
1113 + wsbh $d1,$d1
1114 + wsbh $d2,$d2
1115 + wsbh $d3,$d3
1116 + rotr $d0,$d0,16
1117 + rotr $d1,$d1,16
1118 + rotr $d2,$d2,16
1119 + rotr $d3,$d3,16
1120 +# else
1121 + srl $at,$d0,24 # byte swap
1122 + srl $t0,$d0,8
1123 + andi $t1,$d0,0xFF00
1124 + sll $d0,$d0,24
1125 + andi $t0,0xFF00
1126 + sll $t1,$t1,8
1127 + or $d0,$at
1128 + srl $at,$d1,24
1129 + or $t0,$t1
1130 + srl $t1,$d1,8
1131 + or $d0,$t0
1132 + andi $t0,$d1,0xFF00
1133 + sll $d1,$d1,24
1134 + andi $t1,0xFF00
1135 + sll $t0,$t0,8
1136 + or $d1,$at
1137 + srl $at,$d2,24
1138 + or $t1,$t0
1139 + srl $t0,$d2,8
1140 + or $d1,$t1
1141 + andi $t1,$d2,0xFF00
1142 + sll $d2,$d2,24
1143 + andi $t0,0xFF00
1144 + sll $t1,$t1,8
1145 + or $d2,$at
1146 + srl $at,$d3,24
1147 + or $t0,$t1
1148 + srl $t1,$d3,8
1149 + or $d2,$t0
1150 + andi $t0,$d3,0xFF00
1151 + sll $d3,$d3,24
1152 + andi $t1,0xFF00
1153 + sll $t0,$t0,8
1154 + or $d3,$at
1155 + or $t1,$t0
1156 + or $d3,$t1
1157 +# endif
1158 +#endif
1159 + srl $t0,$h4,2 # modulo-scheduled reduction
1160 + andi $h4,$h4,3
1161 + sll $at,$t0,2
1162 +
1163 + addu $d0,$d0,$h0 # accumulate input
1164 + addu $t0,$t0,$at
1165 + sltu $h0,$d0,$h0
1166 + addu $d0,$d0,$t0 # ... and residue
1167 + sltu $at,$d0,$t0
1168 +
1169 + addu $d1,$d1,$h1
1170 + addu $h0,$h0,$at # carry
1171 + sltu $h1,$d1,$h1
1172 + addu $d1,$d1,$h0
1173 + sltu $h0,$d1,$h0
1174 +
1175 + addu $d2,$d2,$h2
1176 + addu $h1,$h1,$h0 # carry
1177 + sltu $h2,$d2,$h2
1178 + addu $d2,$d2,$h1
1179 + sltu $h1,$d2,$h1
1180 +
1181 + addu $d3,$d3,$h3
1182 + addu $h2,$h2,$h1 # carry
1183 + sltu $h3,$d3,$h3
1184 + addu $d3,$d3,$h2
1185 +
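+	# On MIPS32R2 (pre-R6) the hi/lo accumulator and maddu let the four
+	# column products be summed with multiply-accumulate chains; the
+	# generic path in the #else branch uses discrete multu/mflo/mfhi
+	# with manual carry propagation.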
1186 +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
1187 + multu $r0,$d0 # d0*r0
1188 + sltu $h2,$d3,$h2
1189 + maddu $rs3,$d1 # d1*s3
1190 + addu $h3,$h3,$h2 # carry
1191 + maddu $rs2,$d2 # d2*s2
1192 + addu $h4,$h4,$padbit
1193 + maddu $rs1,$d3 # d3*s1
1194 + addu $h4,$h4,$h3
1195 + mfhi $at
1196 + mflo $h0
1197 +
1198 + multu $r1,$d0 # d0*r1
1199 + maddu $r0,$d1 # d1*r0
1200 + maddu $rs3,$d2 # d2*s3
1201 + maddu $rs2,$d3 # d3*s2
1202 + maddu $rs1,$h4 # h4*s1
1203 + maddu $at,$one # hi*1
1204 + mfhi $at
1205 + mflo $h1
1206 +
1207 + multu $r2,$d0 # d0*r2
1208 + maddu $r1,$d1 # d1*r1
1209 + maddu $r0,$d2 # d2*r0
1210 + maddu $rs3,$d3 # d3*s3
1211 + maddu $rs2,$h4 # h4*s2
1212 + maddu $at,$one # hi*1
1213 + mfhi $at
1214 + mflo $h2
1215 +
1216 + mul $t0,$r0,$h4 # h4*r0
1217 +
1218 + multu $r3,$d0 # d0*r3
1219 + maddu $r2,$d1 # d1*r2
1220 + maddu $r1,$d2 # d2*r1
1221 + maddu $r0,$d3 # d3*r0
1222 + maddu $rs3,$h4 # h4*s3
1223 + maddu $at,$one # hi*1
1224 + mfhi $at
1225 + mflo $h3
1226 +
1227 + addiu $inp,$inp,16
1228 +
1229 + addu $h4,$t0,$at
1230 +#else
1231 + multu ($r0,$d0) # d0*r0
1232 + mflo ($h0,$r0,$d0)
1233 + mfhi ($h1,$r0,$d0)
1234 +
1235 + sltu $h2,$d3,$h2
1236 + addu $h3,$h3,$h2 # carry
1237 +
1238 + multu ($rs3,$d1) # d1*s3
1239 + mflo ($at,$rs3,$d1)
1240 + mfhi ($t0,$rs3,$d1)
1241 +
1242 + addu $h4,$h4,$padbit
1243 + addiu $inp,$inp,16
1244 + addu $h4,$h4,$h3
1245 +
1246 + multu ($rs2,$d2) # d2*s2
1247 + mflo ($a3,$rs2,$d2)
1248 + mfhi ($t1,$rs2,$d2)
1249 + addu $h0,$h0,$at
1250 + addu $h1,$h1,$t0
1251 + multu ($rs1,$d3) # d3*s1
1252 + sltu $at,$h0,$at
1253 + addu $h1,$h1,$at
1254 +
1255 + mflo ($at,$rs1,$d3)
1256 + mfhi ($t0,$rs1,$d3)
1257 + addu $h0,$h0,$a3
1258 + addu $h1,$h1,$t1
1259 + multu ($r1,$d0) # d0*r1
1260 + sltu $a3,$h0,$a3
1261 + addu $h1,$h1,$a3
1262 +
1263 +
1264 + mflo ($a3,$r1,$d0)
1265 + mfhi ($h2,$r1,$d0)
1266 + addu $h0,$h0,$at
1267 + addu $h1,$h1,$t0
1268 + multu ($r0,$d1) # d1*r0
1269 + sltu $at,$h0,$at
1270 + addu $h1,$h1,$at
1271 +
1272 + mflo ($at,$r0,$d1)
1273 + mfhi ($t0,$r0,$d1)
1274 + addu $h1,$h1,$a3
1275 + sltu $a3,$h1,$a3
1276 + multu ($rs3,$d2) # d2*s3
1277 + addu $h2,$h2,$a3
1278 +
1279 + mflo ($a3,$rs3,$d2)
1280 + mfhi ($t1,$rs3,$d2)
1281 + addu $h1,$h1,$at
1282 + addu $h2,$h2,$t0
1283 + multu ($rs2,$d3) # d3*s2
1284 + sltu $at,$h1,$at
1285 + addu $h2,$h2,$at
1286 +
1287 + mflo ($at,$rs2,$d3)
1288 + mfhi ($t0,$rs2,$d3)
1289 + addu $h1,$h1,$a3
1290 + addu $h2,$h2,$t1
1291 + multu ($rs1,$h4) # h4*s1
1292 + sltu $a3,$h1,$a3
1293 + addu $h2,$h2,$a3
1294 +
1295 + mflo ($a3,$rs1,$h4)
1296 + addu $h1,$h1,$at
1297 + addu $h2,$h2,$t0
1298 + multu ($r2,$d0) # d0*r2
1299 + sltu $at,$h1,$at
1300 + addu $h2,$h2,$at
1301 +
1302 +
1303 + mflo ($at,$r2,$d0)
1304 + mfhi ($h3,$r2,$d0)
1305 + addu $h1,$h1,$a3
1306 + sltu $a3,$h1,$a3
1307 + multu ($r1,$d1) # d1*r1
1308 + addu $h2,$h2,$a3
1309 +
1310 + mflo ($a3,$r1,$d1)
1311 + mfhi ($t1,$r1,$d1)
1312 + addu $h2,$h2,$at
1313 + sltu $at,$h2,$at
1314 + multu ($r0,$d2) # d2*r0
1315 + addu $h3,$h3,$at
1316 +
1317 + mflo ($at,$r0,$d2)
1318 + mfhi ($t0,$r0,$d2)
1319 + addu $h2,$h2,$a3
1320 + addu $h3,$h3,$t1
1321 + multu ($rs3,$d3) # d3*s3
1322 + sltu $a3,$h2,$a3
1323 + addu $h3,$h3,$a3
1324 +
1325 + mflo ($a3,$rs3,$d3)
1326 + mfhi ($t1,$rs3,$d3)
1327 + addu $h2,$h2,$at
1328 + addu $h3,$h3,$t0
1329 + multu ($rs2,$h4) # h4*s2
1330 + sltu $at,$h2,$at
1331 + addu $h3,$h3,$at
1332 +
1333 + mflo ($at,$rs2,$h4)
1334 + addu $h2,$h2,$a3
1335 + addu $h3,$h3,$t1
1336 + multu ($r3,$d0) # d0*r3
1337 + sltu $a3,$h2,$a3
1338 + addu $h3,$h3,$a3
1339 +
1340 +
1341 + mflo ($a3,$r3,$d0)
1342 + mfhi ($t1,$r3,$d0)
1343 + addu $h2,$h2,$at
1344 + sltu $at,$h2,$at
1345 + multu ($r2,$d1) # d1*r2
1346 + addu $h3,$h3,$at
1347 +
1348 + mflo ($at,$r2,$d1)
1349 + mfhi ($t0,$r2,$d1)
1350 + addu $h3,$h3,$a3
1351 + sltu $a3,$h3,$a3
1352 + multu ($r0,$d3) # d3*r0
1353 + addu $t1,$t1,$a3
1354 +
1355 + mflo ($a3,$r0,$d3)
1356 + mfhi ($d3,$r0,$d3)
1357 + addu $h3,$h3,$at
1358 + addu $t1,$t1,$t0
1359 + multu ($r1,$d2) # d2*r1
1360 + sltu $at,$h3,$at
1361 + addu $t1,$t1,$at
1362 +
1363 + mflo ($at,$r1,$d2)
1364 + mfhi ($t0,$r1,$d2)
1365 + addu $h3,$h3,$a3
1366 + addu $t1,$t1,$d3
1367 + multu ($rs3,$h4) # h4*s3
1368 + sltu $a3,$h3,$a3
1369 + addu $t1,$t1,$a3
1370 +
1371 + mflo ($a3,$rs3,$h4)
1372 + addu $h3,$h3,$at
1373 + addu $t1,$t1,$t0
1374 + multu ($r0,$h4) # h4*r0
1375 + sltu $at,$h3,$at
1376 + addu $t1,$t1,$at
1377 +
1378 +
1379 + mflo ($h4,$r0,$h4)
1380 + addu $h3,$h3,$a3
1381 + sltu $a3,$h3,$a3
1382 + addu $t1,$t1,$a3
1383 + addu $h4,$h4,$t1
1384 +
1385 + li $padbit,1 # if we loop, padbit is 1
1386 +#endif
1387 + bne $inp,$len,.Loop
1388 +
1389 + sw $h0,0($ctx) # store hash value
1390 + sw $h1,4($ctx)
1391 + sw $h2,8($ctx)
1392 + sw $h3,12($ctx)
1393 + sw $h4,16($ctx)
1394 +
1395 + .set noreorder
1396 +.Labort:
1397 + lw $s11,4*11($sp)
1398 + lw $s10,4*10($sp)
1399 + lw $s9, 4*9($sp)
1400 + lw $s8, 4*8($sp)
1401 + lw $s7, 4*7($sp)
1402 + lw $s6, 4*6($sp)
1403 + lw $s5, 4*5($sp)
1404 + lw $s4, 4*4($sp)
1405 +___
1406 +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
1407 + lw $s3, 4*3($sp)
1408 + lw $s2, 4*2($sp)
1409 + lw $s1, 4*1($sp)
1410 + lw $s0, 4*0($sp)
1411 +___
1412 +$code.=<<___;
1413 + jr $ra
1414 + addu $sp,$sp,4*12
1415 +.end poly1305_blocks
1416 +___
1417 +}
1418 +{
1419 +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1420 +
1421 +$code.=<<___;
1422 +.align 5
1423 +.globl poly1305_emit
1424 +.ent poly1305_emit
1425 +poly1305_emit:
1426 + .frame $sp,0,$ra
1427 + .set reorder
1428 +
1429 + lw $tmp4,16($ctx)
1430 + lw $tmp0,0($ctx)
1431 + lw $tmp1,4($ctx)
1432 + lw $tmp2,8($ctx)
1433 + lw $tmp3,12($ctx)
1434 +
1435 + li $in0,-4 # final reduction
1436 + srl $ctx,$tmp4,2
1437 + and $in0,$in0,$tmp4
1438 + andi $tmp4,$tmp4,3
1439 + addu $ctx,$ctx,$in0
1440 +
1441 + addu $tmp0,$tmp0,$ctx
1442 + sltu $ctx,$tmp0,$ctx
1443 + addiu $in0,$tmp0,5 # compare to modulus
1444 + addu $tmp1,$tmp1,$ctx
1445 + sltiu $in1,$in0,5
1446 + sltu $ctx,$tmp1,$ctx
1447 + addu $in1,$in1,$tmp1
1448 + addu $tmp2,$tmp2,$ctx
1449 + sltu $in2,$in1,$tmp1
1450 + sltu $ctx,$tmp2,$ctx
1451 + addu $in2,$in2,$tmp2
1452 + addu $tmp3,$tmp3,$ctx
1453 + sltu $in3,$in2,$tmp2
1454 + sltu $ctx,$tmp3,$ctx
1455 + addu $in3,$in3,$tmp3
1456 + addu $tmp4,$tmp4,$ctx
1457 + sltu $ctx,$in3,$tmp3
1458 + addu $ctx,$tmp4
1459 +
1460 + srl $ctx,2 # see if it carried/borrowed
1461 + subu $ctx,$zero,$ctx
1462 +
1463 + xor $in0,$tmp0
1464 + xor $in1,$tmp1
1465 + xor $in2,$tmp2
1466 + xor $in3,$tmp3
1467 + and $in0,$ctx
1468 + and $in1,$ctx
1469 + and $in2,$ctx
1470 + and $in3,$ctx
1471 + xor $in0,$tmp0
1472 + xor $in1,$tmp1
1473 + xor $in2,$tmp2
1474 + xor $in3,$tmp3
1475 +
1476 + lw $tmp0,0($nonce) # load nonce
1477 + lw $tmp1,4($nonce)
1478 + lw $tmp2,8($nonce)
1479 + lw $tmp3,12($nonce)
1480 +
1481 + addu $in0,$tmp0 # accumulate nonce
1482 + sltu $ctx,$in0,$tmp0
1483 +
1484 + addu $in1,$tmp1
1485 + sltu $tmp1,$in1,$tmp1
1486 + addu $in1,$ctx
1487 + sltu $ctx,$in1,$ctx
1488 + addu $ctx,$tmp1
1489 +
1490 + addu $in2,$tmp2
1491 + sltu $tmp2,$in2,$tmp2
1492 + addu $in2,$ctx
1493 + sltu $ctx,$in2,$ctx
1494 + addu $ctx,$tmp2
1495 +
1496 + addu $in3,$tmp3
1497 + addu $in3,$ctx
1498 +
1499 + srl $tmp0,$in0,8 # write mac value
1500 + srl $tmp1,$in0,16
1501 + srl $tmp2,$in0,24
1502 + sb $in0, 0($mac)
1503 + sb $tmp0,1($mac)
1504 + srl $tmp0,$in1,8
1505 + sb $tmp1,2($mac)
1506 + srl $tmp1,$in1,16
1507 + sb $tmp2,3($mac)
1508 + srl $tmp2,$in1,24
1509 + sb $in1, 4($mac)
1510 + sb $tmp0,5($mac)
1511 + srl $tmp0,$in2,8
1512 + sb $tmp1,6($mac)
1513 + srl $tmp1,$in2,16
1514 + sb $tmp2,7($mac)
1515 + srl $tmp2,$in2,24
1516 + sb $in2, 8($mac)
1517 + sb $tmp0,9($mac)
1518 + srl $tmp0,$in3,8
1519 + sb $tmp1,10($mac)
1520 + srl $tmp1,$in3,16
1521 + sb $tmp2,11($mac)
1522 + srl $tmp2,$in3,24
1523 + sb $in3, 12($mac)
1524 + sb $tmp0,13($mac)
1525 + sb $tmp1,14($mac)
1526 + sb $tmp2,15($mac)
1527 +
1528 + jr $ra
1529 +.end poly1305_emit
1530 +.rdata
1531 +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1532 +.align 2
1533 +___
1534 +}
1535 +}}}
1536 +
1537 +$output=pop and open STDOUT,">$output";
1538 +print $code;
1539 +close STDOUT;
1540 --- a/crypto/Kconfig
1541 +++ b/crypto/Kconfig
1542 @@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
1543 in IETF protocols. This is the x86_64 assembler implementation using SIMD
1544 instructions.
1545
1546 +config CRYPTO_POLY1305_MIPS
1547 + tristate "Poly1305 authenticator algorithm (MIPS optimized)"
1548 + depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
1549 + select CRYPTO_ARCH_HAVE_LIB_POLY1305
1550 +
1551 config CRYPTO_MD4
1552 tristate "MD4 digest algorithm"
1553 select CRYPTO_HASH
1554 --- a/lib/crypto/Kconfig
1555 +++ b/lib/crypto/Kconfig
1556 @@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
1557
1558 config CRYPTO_LIB_POLY1305_RSIZE
1559 int
1560 + default 2 if MIPS
1561 default 4 if X86_64
1562 default 9 if ARM || ARM64
1563 default 1