kernel: 5.4: import wireguard backport
openwrt/openwrt.git: target/linux/generic/backport-5.4/080-wireguard-0043-crypto-x86-poly1305-wire-up-faster-implementations-f.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 5 Jan 2020 22:40:48 -0500
4 Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for
5 kernel
6
7 commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.
8
9 These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
10 The AVX-512F implementation is disabled on Skylake, due to throttling,
11 but it is quite fast on >= Cannonlake.
12
  13 On the left are cycle counts on a Core i7 6700HQ using the AVX-2
14 codepath, comparing this implementation ("new") to the implementation in
15 the current crypto api ("old"). On the right are benchmarks on a Xeon
16 Gold 5120 using the AVX-512 codepath. The new implementation is faster
17 on all benchmarks.
18
19 AVX-2 AVX-512
20 --------- -----------
21
22 size old new size old new
23 ---- ---- ---- ---- ---- ----
24 0 70 68 0 74 70
25 16 92 90 16 96 92
26 32 134 104 32 136 106
27 48 172 120 48 184 124
28 64 218 136 64 218 138
29 80 254 158 80 260 160
30 96 298 174 96 300 176
31 112 342 192 112 342 194
32 128 388 212 128 384 212
33 144 428 228 144 420 226
34 160 466 246 160 464 248
35 176 510 264 176 504 264
36 192 550 282 192 544 282
37 208 594 302 208 582 300
38 224 628 316 224 624 318
39 240 676 334 240 662 338
40 256 716 354 256 708 358
41 272 764 374 272 748 372
42 288 802 352 288 788 358
43 304 420 366 304 422 370
44 320 428 360 320 432 364
45 336 484 378 336 486 380
46 352 426 384 352 434 390
47 368 478 400 368 480 408
48 384 488 394 384 490 398
49 400 542 408 400 542 412
50 416 486 416 416 492 426
51 432 534 430 432 538 436
52 448 544 422 448 546 432
53 464 600 438 464 600 448
54 480 540 448 480 548 456
55 496 594 464 496 594 476
56 512 602 456 512 606 470
57 528 656 476 528 656 480
58 544 600 480 544 606 498
59 560 650 494 560 652 512
60 576 664 490 576 662 508
61 592 714 508 592 716 522
62 608 656 514 608 664 538
63 624 708 532 624 710 552
64 640 716 524 640 720 516
65 656 770 536 656 772 526
66 672 716 548 672 722 544
67 688 770 562 688 768 556
68 704 774 552 704 778 556
69 720 826 568 720 832 568
70 736 768 574 736 780 584
71 752 822 592 752 826 600
72 768 830 584 768 836 560
73 784 884 602 784 888 572
74 800 828 610 800 838 588
75 816 884 628 816 884 604
76 832 888 618 832 894 598
77 848 942 632 848 946 612
78 864 884 644 864 896 628
79 880 936 660 880 942 644
80 896 948 652 896 952 608
81 912 1000 664 912 1004 616
82 928 942 676 928 954 634
83 944 994 690 944 1000 646
84 960 1002 680 960 1008 646
85 976 1054 694 976 1062 658
86 992 1002 706 992 1012 674
87 1008 1052 720 1008 1058 690
88
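  For a rough sense of scale, at the largest size measured (1008 bytes) the
  old AVX-2 code spends about 1052/1008 ~ 1.04 cycles per byte versus
  720/1008 ~ 0.71 for the new code, and the AVX-512 column drops from
  1058/1008 ~ 1.05 to 690/1008 ~ 0.68, i.e. roughly 32% and 35% fewer
  cycles, respectively.
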
89 This commit wires in the prior implementation from Andy, and makes the
90 following changes to be suitable for kernel land.
91
92 - Some cosmetic and structural changes, like renaming labels to
93 .Lname, constants, and other Linux conventions, as well as making
94 the code easy for us to maintain moving forward.
95
  96 - CPU feature checking is done in C by the glue code (see the sketch
     below).
97
98 - We avoid jumping into the middle of functions, to appease objtool,
99 and instead parameterize shared code.
100
101 - We maintain frame pointers so that stack traces make sense.
102
 103 - We remove the dependency on the perl xlate code, which transforms
 104   the output into formats used by assemblers we don't care about.
105
 106 Importantly, none of our changes affect the arithmetic or core code; they
 107 only adapt it to the differing environment of kernel space.
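
As a concrete illustration of the "CPU feature checking is done in C by the
glue code" point above, a minimal sketch of the dispatch pattern follows. The
*_demo names and prototypes are illustrative only and are not the symbols
added by this patch; boot_cpu_has(), the static-key API and module_init() are
the kernel interfaces the real glue code relies on.

    #include <linux/jump_label.h>
    #include <linux/linkage.h>
    #include <linux/module.h>
    #include <linux/types.h>
    #include <asm/cpufeature.h>

    /* Hypothetical assembly entry points, standing in for the routines this
     * patch generates from poly1305-x86_64-cryptogams.pl. */
    asmlinkage void poly1305_blocks_x86_64_demo(void *ctx, const u8 *inp, size_t len);
    asmlinkage void poly1305_blocks_avx2_demo(void *ctx, const u8 *inp, size_t len);

    static DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2_demo);

    static void poly1305_blocks_demo(void *ctx, const u8 *inp, size_t len)
    {
            /* The dispatch decision lives entirely in C; the assembly does
             * no CPUID probing of its own. */
            if (static_branch_likely(&poly1305_use_avx2_demo))
                    poly1305_blocks_avx2_demo(ctx, inp, len);
            else
                    poly1305_blocks_x86_64_demo(ctx, inp, len);
    }

    static int __init poly1305_demo_mod_init(void)
    {
            /* Probe CPU features once at load time and flip a static key.
             * (The real glue additionally checks XSAVE/YMM state support.) */
            if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2))
                    static_branch_enable(&poly1305_use_avx2_demo);
            return 0;
    }
    module_init(poly1305_demo_mod_init);

Keeping the probe in C means the feature framework and objtool see ordinary
kernel code, while the assembly remains a set of plain functions selected
once at init time.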
108
109 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
110 Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
111 Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
112 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
113 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
114 ---
115 arch/x86/crypto/.gitignore | 1 +
116 arch/x86/crypto/Makefile | 11 +-
117 arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ----------
118 arch/x86/crypto/poly1305-sse2-x86_64.S | 590 ---------------
119 arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
120 arch/x86/crypto/poly1305_glue.c | 473 +++++-------
121 lib/crypto/Kconfig | 2 +-
122 7 files changed, 572 insertions(+), 1577 deletions(-)
123 create mode 100644 arch/x86/crypto/.gitignore
124 delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
125 delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
126
127 --- /dev/null
128 +++ b/arch/x86/crypto/.gitignore
129 @@ -0,0 +1 @@
130 +poly1305-x86_64.S
131 --- a/arch/x86/crypto/Makefile
132 +++ b/arch/x86/crypto/Makefile
133 @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
134
135 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
136 blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
137 +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
138 +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
139 +targets += poly1305-x86_64-cryptogams.S
140 +endif
141
142 ifeq ($(avx_supported),yes)
143 camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
144 @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
145 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
146 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
147 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
148 -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
149 ifeq ($(avx2_supported),yes)
150 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
151 -poly1305-x86_64-y += poly1305-avx2-x86_64.o
152 endif
153 ifeq ($(sha1_ni_supported),yes)
154 sha1-ssse3-y += sha1_ni_asm.o
155 @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
156 endif
157 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
158 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
159 +
160 +quiet_cmd_perlasm = PERLASM $@
161 + cmd_perlasm = $(PERL) $< > $@
162 +$(obj)/%.S: $(src)/%.pl FORCE
163 + $(call if_changed,perlasm)
164 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S
165 +++ /dev/null
166 @@ -1,390 +0,0 @@
167 -/* SPDX-License-Identifier: GPL-2.0-or-later */
168 -/*
169 - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
170 - *
171 - * Copyright (C) 2015 Martin Willi
172 - */
173 -
174 -#include <linux/linkage.h>
175 -
176 -.section .rodata.cst32.ANMASK, "aM", @progbits, 32
177 -.align 32
178 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff
179 - .octa 0x0000000003ffffff0000000003ffffff
180 -
181 -.section .rodata.cst32.ORMASK, "aM", @progbits, 32
182 -.align 32
183 -ORMASK: .octa 0x00000000010000000000000001000000
184 - .octa 0x00000000010000000000000001000000
185 -
186 -.text
187 -
188 -#define h0 0x00(%rdi)
189 -#define h1 0x04(%rdi)
190 -#define h2 0x08(%rdi)
191 -#define h3 0x0c(%rdi)
192 -#define h4 0x10(%rdi)
193 -#define r0 0x00(%rdx)
194 -#define r1 0x04(%rdx)
195 -#define r2 0x08(%rdx)
196 -#define r3 0x0c(%rdx)
197 -#define r4 0x10(%rdx)
198 -#define u0 0x00(%r8)
199 -#define u1 0x04(%r8)
200 -#define u2 0x08(%r8)
201 -#define u3 0x0c(%r8)
202 -#define u4 0x10(%r8)
203 -#define w0 0x18(%r8)
204 -#define w1 0x1c(%r8)
205 -#define w2 0x20(%r8)
206 -#define w3 0x24(%r8)
207 -#define w4 0x28(%r8)
208 -#define y0 0x30(%r8)
209 -#define y1 0x34(%r8)
210 -#define y2 0x38(%r8)
211 -#define y3 0x3c(%r8)
212 -#define y4 0x40(%r8)
213 -#define m %rsi
214 -#define hc0 %ymm0
215 -#define hc1 %ymm1
216 -#define hc2 %ymm2
217 -#define hc3 %ymm3
218 -#define hc4 %ymm4
219 -#define hc0x %xmm0
220 -#define hc1x %xmm1
221 -#define hc2x %xmm2
222 -#define hc3x %xmm3
223 -#define hc4x %xmm4
224 -#define t1 %ymm5
225 -#define t2 %ymm6
226 -#define t1x %xmm5
227 -#define t2x %xmm6
228 -#define ruwy0 %ymm7
229 -#define ruwy1 %ymm8
230 -#define ruwy2 %ymm9
231 -#define ruwy3 %ymm10
232 -#define ruwy4 %ymm11
233 -#define ruwy0x %xmm7
234 -#define ruwy1x %xmm8
235 -#define ruwy2x %xmm9
236 -#define ruwy3x %xmm10
237 -#define ruwy4x %xmm11
238 -#define svxz1 %ymm12
239 -#define svxz2 %ymm13
240 -#define svxz3 %ymm14
241 -#define svxz4 %ymm15
242 -#define d0 %r9
243 -#define d1 %r10
244 -#define d2 %r11
245 -#define d3 %r12
246 -#define d4 %r13
247 -
248 -ENTRY(poly1305_4block_avx2)
249 - # %rdi: Accumulator h[5]
250 - # %rsi: 64 byte input block m
251 - # %rdx: Poly1305 key r[5]
252 - # %rcx: Quadblock count
253 - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
254 -
255 - # This four-block variant uses loop unrolled block processing. It
256 - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
257 - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
258 -
259 - vzeroupper
260 - push %rbx
261 - push %r12
262 - push %r13
263 -
264 - # combine r0,u0,w0,y0
265 - vmovd y0,ruwy0x
266 - vmovd w0,t1x
267 - vpunpcklqdq t1,ruwy0,ruwy0
268 - vmovd u0,t1x
269 - vmovd r0,t2x
270 - vpunpcklqdq t2,t1,t1
271 - vperm2i128 $0x20,t1,ruwy0,ruwy0
272 -
273 - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
274 - vmovd y1,ruwy1x
275 - vmovd w1,t1x
276 - vpunpcklqdq t1,ruwy1,ruwy1
277 - vmovd u1,t1x
278 - vmovd r1,t2x
279 - vpunpcklqdq t2,t1,t1
280 - vperm2i128 $0x20,t1,ruwy1,ruwy1
281 - vpslld $2,ruwy1,svxz1
282 - vpaddd ruwy1,svxz1,svxz1
283 -
284 - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
285 - vmovd y2,ruwy2x
286 - vmovd w2,t1x
287 - vpunpcklqdq t1,ruwy2,ruwy2
288 - vmovd u2,t1x
289 - vmovd r2,t2x
290 - vpunpcklqdq t2,t1,t1
291 - vperm2i128 $0x20,t1,ruwy2,ruwy2
292 - vpslld $2,ruwy2,svxz2
293 - vpaddd ruwy2,svxz2,svxz2
294 -
295 - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
296 - vmovd y3,ruwy3x
297 - vmovd w3,t1x
298 - vpunpcklqdq t1,ruwy3,ruwy3
299 - vmovd u3,t1x
300 - vmovd r3,t2x
301 - vpunpcklqdq t2,t1,t1
302 - vperm2i128 $0x20,t1,ruwy3,ruwy3
303 - vpslld $2,ruwy3,svxz3
304 - vpaddd ruwy3,svxz3,svxz3
305 -
306 - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
307 - vmovd y4,ruwy4x
308 - vmovd w4,t1x
309 - vpunpcklqdq t1,ruwy4,ruwy4
310 - vmovd u4,t1x
311 - vmovd r4,t2x
312 - vpunpcklqdq t2,t1,t1
313 - vperm2i128 $0x20,t1,ruwy4,ruwy4
314 - vpslld $2,ruwy4,svxz4
315 - vpaddd ruwy4,svxz4,svxz4
316 -
317 -.Ldoblock4:
318 - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
319 - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
320 - vmovd 0x00(m),hc0x
321 - vmovd 0x10(m),t1x
322 - vpunpcklqdq t1,hc0,hc0
323 - vmovd 0x20(m),t1x
324 - vmovd 0x30(m),t2x
325 - vpunpcklqdq t2,t1,t1
326 - vperm2i128 $0x20,t1,hc0,hc0
327 - vpand ANMASK(%rip),hc0,hc0
328 - vmovd h0,t1x
329 - vpaddd t1,hc0,hc0
330 - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
331 - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
332 - vmovd 0x03(m),hc1x
333 - vmovd 0x13(m),t1x
334 - vpunpcklqdq t1,hc1,hc1
335 - vmovd 0x23(m),t1x
336 - vmovd 0x33(m),t2x
337 - vpunpcklqdq t2,t1,t1
338 - vperm2i128 $0x20,t1,hc1,hc1
339 - vpsrld $2,hc1,hc1
340 - vpand ANMASK(%rip),hc1,hc1
341 - vmovd h1,t1x
342 - vpaddd t1,hc1,hc1
343 - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
344 - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
345 - vmovd 0x06(m),hc2x
346 - vmovd 0x16(m),t1x
347 - vpunpcklqdq t1,hc2,hc2
348 - vmovd 0x26(m),t1x
349 - vmovd 0x36(m),t2x
350 - vpunpcklqdq t2,t1,t1
351 - vperm2i128 $0x20,t1,hc2,hc2
352 - vpsrld $4,hc2,hc2
353 - vpand ANMASK(%rip),hc2,hc2
354 - vmovd h2,t1x
355 - vpaddd t1,hc2,hc2
356 - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
357 - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
358 - vmovd 0x09(m),hc3x
359 - vmovd 0x19(m),t1x
360 - vpunpcklqdq t1,hc3,hc3
361 - vmovd 0x29(m),t1x
362 - vmovd 0x39(m),t2x
363 - vpunpcklqdq t2,t1,t1
364 - vperm2i128 $0x20,t1,hc3,hc3
365 - vpsrld $6,hc3,hc3
366 - vpand ANMASK(%rip),hc3,hc3
367 - vmovd h3,t1x
368 - vpaddd t1,hc3,hc3
369 - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
370 - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
371 - vmovd 0x0c(m),hc4x
372 - vmovd 0x1c(m),t1x
373 - vpunpcklqdq t1,hc4,hc4
374 - vmovd 0x2c(m),t1x
375 - vmovd 0x3c(m),t2x
376 - vpunpcklqdq t2,t1,t1
377 - vperm2i128 $0x20,t1,hc4,hc4
378 - vpsrld $8,hc4,hc4
379 - vpor ORMASK(%rip),hc4,hc4
380 - vmovd h4,t1x
381 - vpaddd t1,hc4,hc4
382 -
383 - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
384 - vpmuludq hc0,ruwy0,t1
385 - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
386 - vpmuludq hc1,svxz4,t2
387 - vpaddq t2,t1,t1
388 - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
389 - vpmuludq hc2,svxz3,t2
390 - vpaddq t2,t1,t1
391 - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
392 - vpmuludq hc3,svxz2,t2
393 - vpaddq t2,t1,t1
394 - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
395 - vpmuludq hc4,svxz1,t2
396 - vpaddq t2,t1,t1
397 - # d0 = t1[0] + t1[1] + t[2] + t[3]
398 - vpermq $0xee,t1,t2
399 - vpaddq t2,t1,t1
400 - vpsrldq $8,t1,t2
401 - vpaddq t2,t1,t1
402 - vmovq t1x,d0
403 -
404 - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
405 - vpmuludq hc0,ruwy1,t1
406 - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
407 - vpmuludq hc1,ruwy0,t2
408 - vpaddq t2,t1,t1
409 - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
410 - vpmuludq hc2,svxz4,t2
411 - vpaddq t2,t1,t1
412 - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
413 - vpmuludq hc3,svxz3,t2
414 - vpaddq t2,t1,t1
415 - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
416 - vpmuludq hc4,svxz2,t2
417 - vpaddq t2,t1,t1
418 - # d1 = t1[0] + t1[1] + t1[3] + t1[4]
419 - vpermq $0xee,t1,t2
420 - vpaddq t2,t1,t1
421 - vpsrldq $8,t1,t2
422 - vpaddq t2,t1,t1
423 - vmovq t1x,d1
424 -
425 - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
426 - vpmuludq hc0,ruwy2,t1
427 - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
428 - vpmuludq hc1,ruwy1,t2
429 - vpaddq t2,t1,t1
430 - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
431 - vpmuludq hc2,ruwy0,t2
432 - vpaddq t2,t1,t1
433 - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
434 - vpmuludq hc3,svxz4,t2
435 - vpaddq t2,t1,t1
436 - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
437 - vpmuludq hc4,svxz3,t2
438 - vpaddq t2,t1,t1
439 - # d2 = t1[0] + t1[1] + t1[2] + t1[3]
440 - vpermq $0xee,t1,t2
441 - vpaddq t2,t1,t1
442 - vpsrldq $8,t1,t2
443 - vpaddq t2,t1,t1
444 - vmovq t1x,d2
445 -
446 - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
447 - vpmuludq hc0,ruwy3,t1
448 - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
449 - vpmuludq hc1,ruwy2,t2
450 - vpaddq t2,t1,t1
451 - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
452 - vpmuludq hc2,ruwy1,t2
453 - vpaddq t2,t1,t1
454 - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
455 - vpmuludq hc3,ruwy0,t2
456 - vpaddq t2,t1,t1
457 - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
458 - vpmuludq hc4,svxz4,t2
459 - vpaddq t2,t1,t1
460 - # d3 = t1[0] + t1[1] + t1[2] + t1[3]
461 - vpermq $0xee,t1,t2
462 - vpaddq t2,t1,t1
463 - vpsrldq $8,t1,t2
464 - vpaddq t2,t1,t1
465 - vmovq t1x,d3
466 -
467 - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
468 - vpmuludq hc0,ruwy4,t1
469 - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
470 - vpmuludq hc1,ruwy3,t2
471 - vpaddq t2,t1,t1
472 - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
473 - vpmuludq hc2,ruwy2,t2
474 - vpaddq t2,t1,t1
475 - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
476 - vpmuludq hc3,ruwy1,t2
477 - vpaddq t2,t1,t1
478 - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
479 - vpmuludq hc4,ruwy0,t2
480 - vpaddq t2,t1,t1
481 - # d4 = t1[0] + t1[1] + t1[2] + t1[3]
482 - vpermq $0xee,t1,t2
483 - vpaddq t2,t1,t1
484 - vpsrldq $8,t1,t2
485 - vpaddq t2,t1,t1
486 - vmovq t1x,d4
487 -
488 - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
489 - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
490 - # amount. Careful: we must not assume the carry bits 'd0 >> 26',
491 - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
492 - # integers. It's true in a single-block implementation, but not here.
493 -
494 - # d1 += d0 >> 26
495 - mov d0,%rax
496 - shr $26,%rax
497 - add %rax,d1
498 - # h0 = d0 & 0x3ffffff
499 - mov d0,%rbx
500 - and $0x3ffffff,%ebx
501 -
502 - # d2 += d1 >> 26
503 - mov d1,%rax
504 - shr $26,%rax
505 - add %rax,d2
506 - # h1 = d1 & 0x3ffffff
507 - mov d1,%rax
508 - and $0x3ffffff,%eax
509 - mov %eax,h1
510 -
511 - # d3 += d2 >> 26
512 - mov d2,%rax
513 - shr $26,%rax
514 - add %rax,d3
515 - # h2 = d2 & 0x3ffffff
516 - mov d2,%rax
517 - and $0x3ffffff,%eax
518 - mov %eax,h2
519 -
520 - # d4 += d3 >> 26
521 - mov d3,%rax
522 - shr $26,%rax
523 - add %rax,d4
524 - # h3 = d3 & 0x3ffffff
525 - mov d3,%rax
526 - and $0x3ffffff,%eax
527 - mov %eax,h3
528 -
529 - # h0 += (d4 >> 26) * 5
530 - mov d4,%rax
531 - shr $26,%rax
532 - lea (%rax,%rax,4),%rax
533 - add %rax,%rbx
534 - # h4 = d4 & 0x3ffffff
535 - mov d4,%rax
536 - and $0x3ffffff,%eax
537 - mov %eax,h4
538 -
539 - # h1 += h0 >> 26
540 - mov %rbx,%rax
541 - shr $26,%rax
542 - add %eax,h1
543 - # h0 = h0 & 0x3ffffff
544 - andl $0x3ffffff,%ebx
545 - mov %ebx,h0
546 -
547 - add $0x40,m
548 - dec %rcx
549 - jnz .Ldoblock4
550 -
551 - vzeroupper
552 - pop %r13
553 - pop %r12
554 - pop %rbx
555 - ret
556 -ENDPROC(poly1305_4block_avx2)
557 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S
558 +++ /dev/null
559 @@ -1,590 +0,0 @@
560 -/* SPDX-License-Identifier: GPL-2.0-or-later */
561 -/*
562 - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
563 - *
564 - * Copyright (C) 2015 Martin Willi
565 - */
566 -
567 -#include <linux/linkage.h>
568 -
569 -.section .rodata.cst16.ANMASK, "aM", @progbits, 16
570 -.align 16
571 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff
572 -
573 -.section .rodata.cst16.ORMASK, "aM", @progbits, 16
574 -.align 16
575 -ORMASK: .octa 0x00000000010000000000000001000000
576 -
577 -.text
578 -
579 -#define h0 0x00(%rdi)
580 -#define h1 0x04(%rdi)
581 -#define h2 0x08(%rdi)
582 -#define h3 0x0c(%rdi)
583 -#define h4 0x10(%rdi)
584 -#define r0 0x00(%rdx)
585 -#define r1 0x04(%rdx)
586 -#define r2 0x08(%rdx)
587 -#define r3 0x0c(%rdx)
588 -#define r4 0x10(%rdx)
589 -#define s1 0x00(%rsp)
590 -#define s2 0x04(%rsp)
591 -#define s3 0x08(%rsp)
592 -#define s4 0x0c(%rsp)
593 -#define m %rsi
594 -#define h01 %xmm0
595 -#define h23 %xmm1
596 -#define h44 %xmm2
597 -#define t1 %xmm3
598 -#define t2 %xmm4
599 -#define t3 %xmm5
600 -#define t4 %xmm6
601 -#define mask %xmm7
602 -#define d0 %r8
603 -#define d1 %r9
604 -#define d2 %r10
605 -#define d3 %r11
606 -#define d4 %r12
607 -
608 -ENTRY(poly1305_block_sse2)
609 - # %rdi: Accumulator h[5]
610 - # %rsi: 16 byte input block m
611 - # %rdx: Poly1305 key r[5]
612 - # %rcx: Block count
613 -
614 - # This single block variant tries to improve performance by doing two
615 - # multiplications in parallel using SSE instructions. There is quite
616 - # some quardword packing involved, hence the speedup is marginal.
617 -
618 - push %rbx
619 - push %r12
620 - sub $0x10,%rsp
621 -
622 - # s1..s4 = r1..r4 * 5
623 - mov r1,%eax
624 - lea (%eax,%eax,4),%eax
625 - mov %eax,s1
626 - mov r2,%eax
627 - lea (%eax,%eax,4),%eax
628 - mov %eax,s2
629 - mov r3,%eax
630 - lea (%eax,%eax,4),%eax
631 - mov %eax,s3
632 - mov r4,%eax
633 - lea (%eax,%eax,4),%eax
634 - mov %eax,s4
635 -
636 - movdqa ANMASK(%rip),mask
637 -
638 -.Ldoblock:
639 - # h01 = [0, h1, 0, h0]
640 - # h23 = [0, h3, 0, h2]
641 - # h44 = [0, h4, 0, h4]
642 - movd h0,h01
643 - movd h1,t1
644 - movd h2,h23
645 - movd h3,t2
646 - movd h4,h44
647 - punpcklqdq t1,h01
648 - punpcklqdq t2,h23
649 - punpcklqdq h44,h44
650 -
651 - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
652 - movd 0x00(m),t1
653 - movd 0x03(m),t2
654 - psrld $2,t2
655 - punpcklqdq t2,t1
656 - pand mask,t1
657 - paddd t1,h01
658 - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
659 - movd 0x06(m),t1
660 - movd 0x09(m),t2
661 - psrld $4,t1
662 - psrld $6,t2
663 - punpcklqdq t2,t1
664 - pand mask,t1
665 - paddd t1,h23
666 - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
667 - mov 0x0c(m),%eax
668 - shr $8,%eax
669 - or $0x01000000,%eax
670 - movd %eax,t1
671 - pshufd $0xc4,t1,t1
672 - paddd t1,h44
673 -
674 - # t1[0] = h0 * r0 + h2 * s3
675 - # t1[1] = h1 * s4 + h3 * s2
676 - movd r0,t1
677 - movd s4,t2
678 - punpcklqdq t2,t1
679 - pmuludq h01,t1
680 - movd s3,t2
681 - movd s2,t3
682 - punpcklqdq t3,t2
683 - pmuludq h23,t2
684 - paddq t2,t1
685 - # t2[0] = h0 * r1 + h2 * s4
686 - # t2[1] = h1 * r0 + h3 * s3
687 - movd r1,t2
688 - movd r0,t3
689 - punpcklqdq t3,t2
690 - pmuludq h01,t2
691 - movd s4,t3
692 - movd s3,t4
693 - punpcklqdq t4,t3
694 - pmuludq h23,t3
695 - paddq t3,t2
696 - # t3[0] = h4 * s1
697 - # t3[1] = h4 * s2
698 - movd s1,t3
699 - movd s2,t4
700 - punpcklqdq t4,t3
701 - pmuludq h44,t3
702 - # d0 = t1[0] + t1[1] + t3[0]
703 - # d1 = t2[0] + t2[1] + t3[1]
704 - movdqa t1,t4
705 - punpcklqdq t2,t4
706 - punpckhqdq t2,t1
707 - paddq t4,t1
708 - paddq t3,t1
709 - movq t1,d0
710 - psrldq $8,t1
711 - movq t1,d1
712 -
713 - # t1[0] = h0 * r2 + h2 * r0
714 - # t1[1] = h1 * r1 + h3 * s4
715 - movd r2,t1
716 - movd r1,t2
717 - punpcklqdq t2,t1
718 - pmuludq h01,t1
719 - movd r0,t2
720 - movd s4,t3
721 - punpcklqdq t3,t2
722 - pmuludq h23,t2
723 - paddq t2,t1
724 - # t2[0] = h0 * r3 + h2 * r1
725 - # t2[1] = h1 * r2 + h3 * r0
726 - movd r3,t2
727 - movd r2,t3
728 - punpcklqdq t3,t2
729 - pmuludq h01,t2
730 - movd r1,t3
731 - movd r0,t4
732 - punpcklqdq t4,t3
733 - pmuludq h23,t3
734 - paddq t3,t2
735 - # t3[0] = h4 * s3
736 - # t3[1] = h4 * s4
737 - movd s3,t3
738 - movd s4,t4
739 - punpcklqdq t4,t3
740 - pmuludq h44,t3
741 - # d2 = t1[0] + t1[1] + t3[0]
742 - # d3 = t2[0] + t2[1] + t3[1]
743 - movdqa t1,t4
744 - punpcklqdq t2,t4
745 - punpckhqdq t2,t1
746 - paddq t4,t1
747 - paddq t3,t1
748 - movq t1,d2
749 - psrldq $8,t1
750 - movq t1,d3
751 -
752 - # t1[0] = h0 * r4 + h2 * r2
753 - # t1[1] = h1 * r3 + h3 * r1
754 - movd r4,t1
755 - movd r3,t2
756 - punpcklqdq t2,t1
757 - pmuludq h01,t1
758 - movd r2,t2
759 - movd r1,t3
760 - punpcklqdq t3,t2
761 - pmuludq h23,t2
762 - paddq t2,t1
763 - # t3[0] = h4 * r0
764 - movd r0,t3
765 - pmuludq h44,t3
766 - # d4 = t1[0] + t1[1] + t3[0]
767 - movdqa t1,t4
768 - psrldq $8,t4
769 - paddq t4,t1
770 - paddq t3,t1
771 - movq t1,d4
772 -
773 - # d1 += d0 >> 26
774 - mov d0,%rax
775 - shr $26,%rax
776 - add %rax,d1
777 - # h0 = d0 & 0x3ffffff
778 - mov d0,%rbx
779 - and $0x3ffffff,%ebx
780 -
781 - # d2 += d1 >> 26
782 - mov d1,%rax
783 - shr $26,%rax
784 - add %rax,d2
785 - # h1 = d1 & 0x3ffffff
786 - mov d1,%rax
787 - and $0x3ffffff,%eax
788 - mov %eax,h1
789 -
790 - # d3 += d2 >> 26
791 - mov d2,%rax
792 - shr $26,%rax
793 - add %rax,d3
794 - # h2 = d2 & 0x3ffffff
795 - mov d2,%rax
796 - and $0x3ffffff,%eax
797 - mov %eax,h2
798 -
799 - # d4 += d3 >> 26
800 - mov d3,%rax
801 - shr $26,%rax
802 - add %rax,d4
803 - # h3 = d3 & 0x3ffffff
804 - mov d3,%rax
805 - and $0x3ffffff,%eax
806 - mov %eax,h3
807 -
808 - # h0 += (d4 >> 26) * 5
809 - mov d4,%rax
810 - shr $26,%rax
811 - lea (%rax,%rax,4),%rax
812 - add %rax,%rbx
813 - # h4 = d4 & 0x3ffffff
814 - mov d4,%rax
815 - and $0x3ffffff,%eax
816 - mov %eax,h4
817 -
818 - # h1 += h0 >> 26
819 - mov %rbx,%rax
820 - shr $26,%rax
821 - add %eax,h1
822 - # h0 = h0 & 0x3ffffff
823 - andl $0x3ffffff,%ebx
824 - mov %ebx,h0
825 -
826 - add $0x10,m
827 - dec %rcx
828 - jnz .Ldoblock
829 -
830 - # Zeroing of key material
831 - mov %rcx,0x00(%rsp)
832 - mov %rcx,0x08(%rsp)
833 -
834 - add $0x10,%rsp
835 - pop %r12
836 - pop %rbx
837 - ret
838 -ENDPROC(poly1305_block_sse2)
839 -
840 -
841 -#define u0 0x00(%r8)
842 -#define u1 0x04(%r8)
843 -#define u2 0x08(%r8)
844 -#define u3 0x0c(%r8)
845 -#define u4 0x10(%r8)
846 -#define hc0 %xmm0
847 -#define hc1 %xmm1
848 -#define hc2 %xmm2
849 -#define hc3 %xmm5
850 -#define hc4 %xmm6
851 -#define ru0 %xmm7
852 -#define ru1 %xmm8
853 -#define ru2 %xmm9
854 -#define ru3 %xmm10
855 -#define ru4 %xmm11
856 -#define sv1 %xmm12
857 -#define sv2 %xmm13
858 -#define sv3 %xmm14
859 -#define sv4 %xmm15
860 -#undef d0
861 -#define d0 %r13
862 -
863 -ENTRY(poly1305_2block_sse2)
864 - # %rdi: Accumulator h[5]
865 - # %rsi: 16 byte input block m
866 - # %rdx: Poly1305 key r[5]
867 - # %rcx: Doubleblock count
868 - # %r8: Poly1305 derived key r^2 u[5]
869 -
870 - # This two-block variant further improves performance by using loop
871 - # unrolled block processing. This is more straight forward and does
872 - # less byte shuffling, but requires a second Poly1305 key r^2:
873 - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
874 -
875 - push %rbx
876 - push %r12
877 - push %r13
878 -
879 - # combine r0,u0
880 - movd u0,ru0
881 - movd r0,t1
882 - punpcklqdq t1,ru0
883 -
884 - # combine r1,u1 and s1=r1*5,v1=u1*5
885 - movd u1,ru1
886 - movd r1,t1
887 - punpcklqdq t1,ru1
888 - movdqa ru1,sv1
889 - pslld $2,sv1
890 - paddd ru1,sv1
891 -
892 - # combine r2,u2 and s2=r2*5,v2=u2*5
893 - movd u2,ru2
894 - movd r2,t1
895 - punpcklqdq t1,ru2
896 - movdqa ru2,sv2
897 - pslld $2,sv2
898 - paddd ru2,sv2
899 -
900 - # combine r3,u3 and s3=r3*5,v3=u3*5
901 - movd u3,ru3
902 - movd r3,t1
903 - punpcklqdq t1,ru3
904 - movdqa ru3,sv3
905 - pslld $2,sv3
906 - paddd ru3,sv3
907 -
908 - # combine r4,u4 and s4=r4*5,v4=u4*5
909 - movd u4,ru4
910 - movd r4,t1
911 - punpcklqdq t1,ru4
912 - movdqa ru4,sv4
913 - pslld $2,sv4
914 - paddd ru4,sv4
915 -
916 -.Ldoblock2:
917 - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
918 - movd 0x00(m),hc0
919 - movd 0x10(m),t1
920 - punpcklqdq t1,hc0
921 - pand ANMASK(%rip),hc0
922 - movd h0,t1
923 - paddd t1,hc0
924 - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
925 - movd 0x03(m),hc1
926 - movd 0x13(m),t1
927 - punpcklqdq t1,hc1
928 - psrld $2,hc1
929 - pand ANMASK(%rip),hc1
930 - movd h1,t1
931 - paddd t1,hc1
932 - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
933 - movd 0x06(m),hc2
934 - movd 0x16(m),t1
935 - punpcklqdq t1,hc2
936 - psrld $4,hc2
937 - pand ANMASK(%rip),hc2
938 - movd h2,t1
939 - paddd t1,hc2
940 - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
941 - movd 0x09(m),hc3
942 - movd 0x19(m),t1
943 - punpcklqdq t1,hc3
944 - psrld $6,hc3
945 - pand ANMASK(%rip),hc3
946 - movd h3,t1
947 - paddd t1,hc3
948 - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
949 - movd 0x0c(m),hc4
950 - movd 0x1c(m),t1
951 - punpcklqdq t1,hc4
952 - psrld $8,hc4
953 - por ORMASK(%rip),hc4
954 - movd h4,t1
955 - paddd t1,hc4
956 -
957 - # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
958 - movdqa ru0,t1
959 - pmuludq hc0,t1
960 - # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
961 - movdqa sv4,t2
962 - pmuludq hc1,t2
963 - paddq t2,t1
964 - # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
965 - movdqa sv3,t2
966 - pmuludq hc2,t2
967 - paddq t2,t1
968 - # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
969 - movdqa sv2,t2
970 - pmuludq hc3,t2
971 - paddq t2,t1
972 - # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
973 - movdqa sv1,t2
974 - pmuludq hc4,t2
975 - paddq t2,t1
976 - # d0 = t1[0] + t1[1]
977 - movdqa t1,t2
978 - psrldq $8,t2
979 - paddq t2,t1
980 - movq t1,d0
981 -
982 - # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
983 - movdqa ru1,t1
984 - pmuludq hc0,t1
985 - # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
986 - movdqa ru0,t2
987 - pmuludq hc1,t2
988 - paddq t2,t1
989 - # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
990 - movdqa sv4,t2
991 - pmuludq hc2,t2
992 - paddq t2,t1
993 - # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
994 - movdqa sv3,t2
995 - pmuludq hc3,t2
996 - paddq t2,t1
997 - # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
998 - movdqa sv2,t2
999 - pmuludq hc4,t2
1000 - paddq t2,t1
1001 - # d1 = t1[0] + t1[1]
1002 - movdqa t1,t2
1003 - psrldq $8,t2
1004 - paddq t2,t1
1005 - movq t1,d1
1006 -
1007 - # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
1008 - movdqa ru2,t1
1009 - pmuludq hc0,t1
1010 - # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
1011 - movdqa ru1,t2
1012 - pmuludq hc1,t2
1013 - paddq t2,t1
1014 - # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
1015 - movdqa ru0,t2
1016 - pmuludq hc2,t2
1017 - paddq t2,t1
1018 - # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
1019 - movdqa sv4,t2
1020 - pmuludq hc3,t2
1021 - paddq t2,t1
1022 - # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
1023 - movdqa sv3,t2
1024 - pmuludq hc4,t2
1025 - paddq t2,t1
1026 - # d2 = t1[0] + t1[1]
1027 - movdqa t1,t2
1028 - psrldq $8,t2
1029 - paddq t2,t1
1030 - movq t1,d2
1031 -
1032 - # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
1033 - movdqa ru3,t1
1034 - pmuludq hc0,t1
1035 - # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
1036 - movdqa ru2,t2
1037 - pmuludq hc1,t2
1038 - paddq t2,t1
1039 - # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
1040 - movdqa ru1,t2
1041 - pmuludq hc2,t2
1042 - paddq t2,t1
1043 - # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
1044 - movdqa ru0,t2
1045 - pmuludq hc3,t2
1046 - paddq t2,t1
1047 - # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
1048 - movdqa sv4,t2
1049 - pmuludq hc4,t2
1050 - paddq t2,t1
1051 - # d3 = t1[0] + t1[1]
1052 - movdqa t1,t2
1053 - psrldq $8,t2
1054 - paddq t2,t1
1055 - movq t1,d3
1056 -
1057 - # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
1058 - movdqa ru4,t1
1059 - pmuludq hc0,t1
1060 - # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
1061 - movdqa ru3,t2
1062 - pmuludq hc1,t2
1063 - paddq t2,t1
1064 - # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
1065 - movdqa ru2,t2
1066 - pmuludq hc2,t2
1067 - paddq t2,t1
1068 - # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
1069 - movdqa ru1,t2
1070 - pmuludq hc3,t2
1071 - paddq t2,t1
1072 - # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
1073 - movdqa ru0,t2
1074 - pmuludq hc4,t2
1075 - paddq t2,t1
1076 - # d4 = t1[0] + t1[1]
1077 - movdqa t1,t2
1078 - psrldq $8,t2
1079 - paddq t2,t1
1080 - movq t1,d4
1081 -
1082 - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
1083 - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
1084 - # amount. Careful: we must not assume the carry bits 'd0 >> 26',
1085 - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
1086 - # integers. It's true in a single-block implementation, but not here.
1087 -
1088 - # d1 += d0 >> 26
1089 - mov d0,%rax
1090 - shr $26,%rax
1091 - add %rax,d1
1092 - # h0 = d0 & 0x3ffffff
1093 - mov d0,%rbx
1094 - and $0x3ffffff,%ebx
1095 -
1096 - # d2 += d1 >> 26
1097 - mov d1,%rax
1098 - shr $26,%rax
1099 - add %rax,d2
1100 - # h1 = d1 & 0x3ffffff
1101 - mov d1,%rax
1102 - and $0x3ffffff,%eax
1103 - mov %eax,h1
1104 -
1105 - # d3 += d2 >> 26
1106 - mov d2,%rax
1107 - shr $26,%rax
1108 - add %rax,d3
1109 - # h2 = d2 & 0x3ffffff
1110 - mov d2,%rax
1111 - and $0x3ffffff,%eax
1112 - mov %eax,h2
1113 -
1114 - # d4 += d3 >> 26
1115 - mov d3,%rax
1116 - shr $26,%rax
1117 - add %rax,d4
1118 - # h3 = d3 & 0x3ffffff
1119 - mov d3,%rax
1120 - and $0x3ffffff,%eax
1121 - mov %eax,h3
1122 -
1123 - # h0 += (d4 >> 26) * 5
1124 - mov d4,%rax
1125 - shr $26,%rax
1126 - lea (%rax,%rax,4),%rax
1127 - add %rax,%rbx
1128 - # h4 = d4 & 0x3ffffff
1129 - mov d4,%rax
1130 - and $0x3ffffff,%eax
1131 - mov %eax,h4
1132 -
1133 - # h1 += h0 >> 26
1134 - mov %rbx,%rax
1135 - shr $26,%rax
1136 - add %eax,h1
1137 - # h0 = h0 & 0x3ffffff
1138 - andl $0x3ffffff,%ebx
1139 - mov %ebx,h0
1140 -
1141 - add $0x20,m
1142 - dec %rcx
1143 - jnz .Ldoblock2
1144 -
1145 - pop %r13
1146 - pop %r12
1147 - pop %rbx
1148 - ret
1149 -ENDPROC(poly1305_2block_sse2)
1150 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
1151 +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
1152 @@ -1,11 +1,14 @@
1153 -#! /usr/bin/env perl
1154 -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
1155 +#!/usr/bin/env perl
1156 +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
1157 #
1158 -# Licensed under the OpenSSL license (the "License"). You may not use
1159 -# this file except in compliance with the License. You can obtain a copy
1160 -# in the file LICENSE in the source distribution or at
1161 -# https://www.openssl.org/source/license.html
1162 -
1163 +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
1164 +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
1165 +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
1166 +#
1167 +# This code is taken from the OpenSSL project but the author, Andy Polyakov,
1168 +# has relicensed it under the licenses specified in the SPDX header above.
1169 +# The original headers, including the original license headers, are
1170 +# included below for completeness.
1171 #
1172 # ====================================================================
1173 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
1174 @@ -32,7 +35,7 @@
1175 # Skylake-X system performance. Since we are likely to suppress
1176 # AVX512F capability flag [at least on Skylake-X], conversion serves
1177 # as kind of "investment protection". Note that next *lake processor,
1178 -# Cannolake, has AVX512IFMA code path to execute...
1179 +# Cannonlake, has AVX512IFMA code path to execute...
1180 #
1181 # Numbers are cycles per processed byte with poly1305_blocks alone,
1182 # measured with rdtsc at fixed clock frequency.
1183 @@ -68,39 +71,114 @@ $output = shift;
1184 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
1185
1186 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
1187 +$kernel=0; $kernel=1 if (!$flavour && !$output);
1188
1189 -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1190 -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1191 -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1192 -die "can't locate x86_64-xlate.pl";
1193 -
1194 -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
1195 - =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
1196 - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
1197 +if (!$kernel) {
1198 + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1199 + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1200 + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1201 + die "can't locate x86_64-xlate.pl";
1202 +
1203 + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
1204 + *STDOUT=*OUT;
1205 +
1206 + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
1207 + =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
1208 + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
1209 + }
1210 +
1211 + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1212 + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
1213 + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
1214 + $avx += 1 if ($1==2.11 && $2>=8);
1215 + }
1216 +
1217 + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
1218 + `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1219 + $avx = ($1>=10) + ($1>=11);
1220 + }
1221 +
1222 + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
1223 + $avx = ($2>=3.0) + ($2>3.0);
1224 + }
1225 +} else {
1226 + $avx = 4; # The kernel uses ifdefs for this.
1227 }
1228
1229 -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
1230 - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
1231 - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
1232 - $avx += 2 if ($1==2.11 && $2>=8);
1233 +sub declare_function() {
1234 + my ($name, $align, $nargs) = @_;
1235 + if($kernel) {
1236 + $code .= ".align $align\n";
1237 + $code .= "ENTRY($name)\n";
1238 + $code .= ".L$name:\n";
1239 + } else {
1240 + $code .= ".globl $name\n";
1241 + $code .= ".type $name,\@function,$nargs\n";
1242 + $code .= ".align $align\n";
1243 + $code .= "$name:\n";
1244 + }
1245 }
1246
1247 -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
1248 - `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1249 - $avx = ($1>=10) + ($1>=12);
1250 +sub end_function() {
1251 + my ($name) = @_;
1252 + if($kernel) {
1253 + $code .= "ENDPROC($name)\n";
1254 + } else {
1255 + $code .= ".size $name,.-$name\n";
1256 + }
1257 }
1258
1259 -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
1260 - $avx = ($2>=3.0) + ($2>3.0);
1261 -}
1262 +$code.=<<___ if $kernel;
1263 +#include <linux/linkage.h>
1264 +___
1265
1266 -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
1267 -*STDOUT=*OUT;
1268 +if ($avx) {
1269 +$code.=<<___ if $kernel;
1270 +.section .rodata
1271 +___
1272 +$code.=<<___;
1273 +.align 64
1274 +.Lconst:
1275 +.Lmask24:
1276 +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1277 +.L129:
1278 +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
1279 +.Lmask26:
1280 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1281 +.Lpermd_avx2:
1282 +.long 2,2,2,3,2,0,2,1
1283 +.Lpermd_avx512:
1284 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1285 +
1286 +.L2_44_inp_permd:
1287 +.long 0,1,1,2,2,3,7,7
1288 +.L2_44_inp_shift:
1289 +.quad 0,12,24,64
1290 +.L2_44_mask:
1291 +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1292 +.L2_44_shift_rgt:
1293 +.quad 44,44,42,64
1294 +.L2_44_shift_lft:
1295 +.quad 8,8,10,64
1296 +
1297 +.align 64
1298 +.Lx_mask44:
1299 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1300 +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1301 +.Lx_mask42:
1302 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1303 +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1304 +___
1305 +}
1306 +$code.=<<___ if (!$kernel);
1307 +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1308 +.align 16
1309 +___
1310
1311 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
1312 my ($mac,$nonce)=($inp,$len); # *_emit arguments
1313 -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
1314 -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
1315 +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
1316 +my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
1317
1318 sub poly1305_iteration {
1319 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
1320 @@ -155,19 +233,19 @@ ___
1321
1322 $code.=<<___;
1323 .text
1324 -
1325 +___
1326 +$code.=<<___ if (!$kernel);
1327 .extern OPENSSL_ia32cap_P
1328
1329 -.globl poly1305_init
1330 -.hidden poly1305_init
1331 -.globl poly1305_blocks
1332 -.hidden poly1305_blocks
1333 -.globl poly1305_emit
1334 -.hidden poly1305_emit
1335 -
1336 -.type poly1305_init,\@function,3
1337 -.align 32
1338 -poly1305_init:
1339 +.globl poly1305_init_x86_64
1340 +.hidden poly1305_init_x86_64
1341 +.globl poly1305_blocks_x86_64
1342 +.hidden poly1305_blocks_x86_64
1343 +.globl poly1305_emit_x86_64
1344 +.hidden poly1305_emit_x86_64
1345 +___
1346 +&declare_function("poly1305_init_x86_64", 32, 3);
1347 +$code.=<<___;
1348 xor %rax,%rax
1349 mov %rax,0($ctx) # initialize hash value
1350 mov %rax,8($ctx)
1351 @@ -175,11 +253,12 @@ poly1305_init:
1352
1353 cmp \$0,$inp
1354 je .Lno_key
1355 -
1356 - lea poly1305_blocks(%rip),%r10
1357 - lea poly1305_emit(%rip),%r11
1358 ___
1359 -$code.=<<___ if ($avx);
1360 +$code.=<<___ if (!$kernel);
1361 + lea poly1305_blocks_x86_64(%rip),%r10
1362 + lea poly1305_emit_x86_64(%rip),%r11
1363 +___
1364 +$code.=<<___ if (!$kernel && $avx);
1365 mov OPENSSL_ia32cap_P+4(%rip),%r9
1366 lea poly1305_blocks_avx(%rip),%rax
1367 lea poly1305_emit_avx(%rip),%rcx
1368 @@ -187,12 +266,12 @@ $code.=<<___ if ($avx);
1369 cmovc %rax,%r10
1370 cmovc %rcx,%r11
1371 ___
1372 -$code.=<<___ if ($avx>1);
1373 +$code.=<<___ if (!$kernel && $avx>1);
1374 lea poly1305_blocks_avx2(%rip),%rax
1375 bt \$`5+32`,%r9 # AVX2?
1376 cmovc %rax,%r10
1377 ___
1378 -$code.=<<___ if ($avx>3);
1379 +$code.=<<___ if (!$kernel && $avx>3);
1380 mov \$`(1<<31|1<<21|1<<16)`,%rax
1381 shr \$32,%r9
1382 and %rax,%r9
1383 @@ -207,11 +286,11 @@ $code.=<<___;
1384 mov %rax,24($ctx)
1385 mov %rcx,32($ctx)
1386 ___
1387 -$code.=<<___ if ($flavour !~ /elf32/);
1388 +$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
1389 mov %r10,0(%rdx)
1390 mov %r11,8(%rdx)
1391 ___
1392 -$code.=<<___ if ($flavour =~ /elf32/);
1393 +$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
1394 mov %r10d,0(%rdx)
1395 mov %r11d,4(%rdx)
1396 ___
1397 @@ -219,11 +298,11 @@ $code.=<<___;
1398 mov \$1,%eax
1399 .Lno_key:
1400 ret
1401 -.size poly1305_init,.-poly1305_init
1402 +___
1403 +&end_function("poly1305_init_x86_64");
1404
1405 -.type poly1305_blocks,\@function,4
1406 -.align 32
1407 -poly1305_blocks:
1408 +&declare_function("poly1305_blocks_x86_64", 32, 4);
1409 +$code.=<<___;
1410 .cfi_startproc
1411 .Lblocks:
1412 shr \$4,$len
1413 @@ -231,8 +310,6 @@ poly1305_blocks:
1414
1415 push %rbx
1416 .cfi_push %rbx
1417 - push %rbp
1418 -.cfi_push %rbp
1419 push %r12
1420 .cfi_push %r12
1421 push %r13
1422 @@ -241,6 +318,8 @@ poly1305_blocks:
1423 .cfi_push %r14
1424 push %r15
1425 .cfi_push %r15
1426 + push $ctx
1427 +.cfi_push $ctx
1428 .Lblocks_body:
1429
1430 mov $len,%r15 # reassign $len
1431 @@ -265,26 +344,29 @@ poly1305_blocks:
1432 lea 16($inp),$inp
1433 adc $padbit,$h2
1434 ___
1435 +
1436 &poly1305_iteration();
1437 +
1438 $code.=<<___;
1439 mov $r1,%rax
1440 dec %r15 # len-=16
1441 jnz .Loop
1442
1443 + mov 0(%rsp),$ctx
1444 +.cfi_restore $ctx
1445 +
1446 mov $h0,0($ctx) # store hash value
1447 mov $h1,8($ctx)
1448 mov $h2,16($ctx)
1449
1450 - mov 0(%rsp),%r15
1451 + mov 8(%rsp),%r15
1452 .cfi_restore %r15
1453 - mov 8(%rsp),%r14
1454 + mov 16(%rsp),%r14
1455 .cfi_restore %r14
1456 - mov 16(%rsp),%r13
1457 + mov 24(%rsp),%r13
1458 .cfi_restore %r13
1459 - mov 24(%rsp),%r12
1460 + mov 32(%rsp),%r12
1461 .cfi_restore %r12
1462 - mov 32(%rsp),%rbp
1463 -.cfi_restore %rbp
1464 mov 40(%rsp),%rbx
1465 .cfi_restore %rbx
1466 lea 48(%rsp),%rsp
1467 @@ -293,11 +375,11 @@ $code.=<<___;
1468 .Lblocks_epilogue:
1469 ret
1470 .cfi_endproc
1471 -.size poly1305_blocks,.-poly1305_blocks
1472 +___
1473 +&end_function("poly1305_blocks_x86_64");
1474
1475 -.type poly1305_emit,\@function,3
1476 -.align 32
1477 -poly1305_emit:
1478 +&declare_function("poly1305_emit_x86_64", 32, 3);
1479 +$code.=<<___;
1480 .Lemit:
1481 mov 0($ctx),%r8 # load hash value
1482 mov 8($ctx),%r9
1483 @@ -318,10 +400,14 @@ poly1305_emit:
1484 mov %rcx,8($mac)
1485
1486 ret
1487 -.size poly1305_emit,.-poly1305_emit
1488 ___
1489 +&end_function("poly1305_emit_x86_64");
1490 if ($avx) {
1491
1492 +if($kernel) {
1493 + $code .= "#ifdef CONFIG_AS_AVX\n";
1494 +}
1495 +
1496 ########################################################################
1497 # Layout of opaque area is following.
1498 #
1499 @@ -342,15 +428,19 @@ $code.=<<___;
1500 .type __poly1305_block,\@abi-omnipotent
1501 .align 32
1502 __poly1305_block:
1503 + push $ctx
1504 ___
1505 &poly1305_iteration();
1506 $code.=<<___;
1507 + pop $ctx
1508 ret
1509 .size __poly1305_block,.-__poly1305_block
1510
1511 .type __poly1305_init_avx,\@abi-omnipotent
1512 .align 32
1513 __poly1305_init_avx:
1514 + push %rbp
1515 + mov %rsp,%rbp
1516 mov $r0,$h0
1517 mov $r1,$h1
1518 xor $h2,$h2
1519 @@ -507,12 +597,13 @@ __poly1305_init_avx:
1520 mov $d1#d,`16*8+8-64`($ctx)
1521
1522 lea -48-64($ctx),$ctx # size [de-]optimization
1523 + pop %rbp
1524 ret
1525 .size __poly1305_init_avx,.-__poly1305_init_avx
1526 +___
1527
1528 -.type poly1305_blocks_avx,\@function,4
1529 -.align 32
1530 -poly1305_blocks_avx:
1531 +&declare_function("poly1305_blocks_avx", 32, 4);
1532 +$code.=<<___;
1533 .cfi_startproc
1534 mov 20($ctx),%r8d # is_base2_26
1535 cmp \$128,$len
1536 @@ -532,10 +623,11 @@ poly1305_blocks_avx:
1537 test \$31,$len
1538 jz .Leven_avx
1539
1540 - push %rbx
1541 -.cfi_push %rbx
1542 push %rbp
1543 .cfi_push %rbp
1544 + mov %rsp,%rbp
1545 + push %rbx
1546 +.cfi_push %rbx
1547 push %r12
1548 .cfi_push %r12
1549 push %r13
1550 @@ -645,20 +737,18 @@ poly1305_blocks_avx:
1551 mov $h2#d,16($ctx)
1552 .align 16
1553 .Ldone_avx:
1554 - mov 0(%rsp),%r15
1555 + pop %r15
1556 .cfi_restore %r15
1557 - mov 8(%rsp),%r14
1558 + pop %r14
1559 .cfi_restore %r14
1560 - mov 16(%rsp),%r13
1561 + pop %r13
1562 .cfi_restore %r13
1563 - mov 24(%rsp),%r12
1564 + pop %r12
1565 .cfi_restore %r12
1566 - mov 32(%rsp),%rbp
1567 -.cfi_restore %rbp
1568 - mov 40(%rsp),%rbx
1569 + pop %rbx
1570 .cfi_restore %rbx
1571 - lea 48(%rsp),%rsp
1572 -.cfi_adjust_cfa_offset -48
1573 + pop %rbp
1574 +.cfi_restore %rbp
1575 .Lno_data_avx:
1576 .Lblocks_avx_epilogue:
1577 ret
1578 @@ -667,10 +757,11 @@ poly1305_blocks_avx:
1579 .align 32
1580 .Lbase2_64_avx:
1581 .cfi_startproc
1582 - push %rbx
1583 -.cfi_push %rbx
1584 push %rbp
1585 .cfi_push %rbp
1586 + mov %rsp,%rbp
1587 + push %rbx
1588 +.cfi_push %rbx
1589 push %r12
1590 .cfi_push %r12
1591 push %r13
1592 @@ -736,22 +827,18 @@ poly1305_blocks_avx:
1593
1594 .Lproceed_avx:
1595 mov %r15,$len
1596 -
1597 - mov 0(%rsp),%r15
1598 + pop %r15
1599 .cfi_restore %r15
1600 - mov 8(%rsp),%r14
1601 + pop %r14
1602 .cfi_restore %r14
1603 - mov 16(%rsp),%r13
1604 + pop %r13
1605 .cfi_restore %r13
1606 - mov 24(%rsp),%r12
1607 + pop %r12
1608 .cfi_restore %r12
1609 - mov 32(%rsp),%rbp
1610 -.cfi_restore %rbp
1611 - mov 40(%rsp),%rbx
1612 + pop %rbx
1613 .cfi_restore %rbx
1614 - lea 48(%rsp),%rax
1615 - lea 48(%rsp),%rsp
1616 -.cfi_adjust_cfa_offset -48
1617 + pop %rbp
1618 +.cfi_restore %rbp
1619 .Lbase2_64_avx_epilogue:
1620 jmp .Ldo_avx
1621 .cfi_endproc
1622 @@ -768,8 +855,11 @@ poly1305_blocks_avx:
1623 .Ldo_avx:
1624 ___
1625 $code.=<<___ if (!$win64);
1626 + lea 8(%rsp),%r10
1627 +.cfi_def_cfa_register %r10
1628 + and \$-32,%rsp
1629 + sub \$-8,%rsp
1630 lea -0x58(%rsp),%r11
1631 -.cfi_def_cfa %r11,0x60
1632 sub \$0x178,%rsp
1633 ___
1634 $code.=<<___ if ($win64);
1635 @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64);
1636 .Ldo_avx_epilogue:
1637 ___
1638 $code.=<<___ if (!$win64);
1639 - lea 0x58(%r11),%rsp
1640 -.cfi_def_cfa %rsp,8
1641 + lea -8(%r10),%rsp
1642 +.cfi_def_cfa_register %rsp
1643 ___
1644 $code.=<<___;
1645 vzeroupper
1646 ret
1647 .cfi_endproc
1648 -.size poly1305_blocks_avx,.-poly1305_blocks_avx
1649 +___
1650 +&end_function("poly1305_blocks_avx");
1651
1652 -.type poly1305_emit_avx,\@function,3
1653 -.align 32
1654 -poly1305_emit_avx:
1655 +&declare_function("poly1305_emit_avx", 32, 3);
1656 +$code.=<<___;
1657 cmpl \$0,20($ctx) # is_base2_26?
1658 je .Lemit
1659
1660 @@ -1423,41 +1513,51 @@ poly1305_emit_avx:
1661 mov %rcx,8($mac)
1662
1663 ret
1664 -.size poly1305_emit_avx,.-poly1305_emit_avx
1665 ___
1666 +&end_function("poly1305_emit_avx");
1667 +
1668 +if ($kernel) {
1669 + $code .= "#endif\n";
1670 +}
1671
1672 if ($avx>1) {
1673 +
1674 +if ($kernel) {
1675 + $code .= "#ifdef CONFIG_AS_AVX2\n";
1676 +}
1677 +
1678 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1679 map("%ymm$_",(0..15));
1680 my $S4=$MASK;
1681
1682 +sub poly1305_blocks_avxN {
1683 + my ($avx512) = @_;
1684 + my $suffix = $avx512 ? "_avx512" : "";
1685 $code.=<<___;
1686 -.type poly1305_blocks_avx2,\@function,4
1687 -.align 32
1688 -poly1305_blocks_avx2:
1689 .cfi_startproc
1690 mov 20($ctx),%r8d # is_base2_26
1691 cmp \$128,$len
1692 - jae .Lblocks_avx2
1693 + jae .Lblocks_avx2$suffix
1694 test %r8d,%r8d
1695 jz .Lblocks
1696
1697 -.Lblocks_avx2:
1698 +.Lblocks_avx2$suffix:
1699 and \$-16,$len
1700 - jz .Lno_data_avx2
1701 + jz .Lno_data_avx2$suffix
1702
1703 vzeroupper
1704
1705 test %r8d,%r8d
1706 - jz .Lbase2_64_avx2
1707 + jz .Lbase2_64_avx2$suffix
1708
1709 test \$63,$len
1710 - jz .Leven_avx2
1711 + jz .Leven_avx2$suffix
1712
1713 - push %rbx
1714 -.cfi_push %rbx
1715 push %rbp
1716 .cfi_push %rbp
1717 + mov %rsp,%rbp
1718 + push %rbx
1719 +.cfi_push %rbx
1720 push %r12
1721 .cfi_push %r12
1722 push %r13
1723 @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
1724 .cfi_push %r14
1725 push %r15
1726 .cfi_push %r15
1727 -.Lblocks_avx2_body:
1728 +.Lblocks_avx2_body$suffix:
1729
1730 mov $len,%r15 # reassign $len
1731
1732 @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
1733 shr \$2,$s1
1734 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1735
1736 -.Lbase2_26_pre_avx2:
1737 +.Lbase2_26_pre_avx2$suffix:
1738 add 0($inp),$h0 # accumulate input
1739 adc 8($inp),$h1
1740 lea 16($inp),$inp
1741 @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
1742 mov $r1,%rax
1743
1744 test \$63,%r15
1745 - jnz .Lbase2_26_pre_avx2
1746 + jnz .Lbase2_26_pre_avx2$suffix
1747
1748 test $padbit,$padbit # if $padbit is zero,
1749 - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1750 + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1751
1752 ################################# base 2^64 -> base 2^26
1753 mov $h0,%rax
1754 @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
1755 or $r1,$h2 # h[4]
1756
1757 test %r15,%r15
1758 - jz .Lstore_base2_26_avx2
1759 + jz .Lstore_base2_26_avx2$suffix
1760
1761 vmovd %rax#d,%x#$H0
1762 vmovd %rdx#d,%x#$H1
1763 vmovd $h0#d,%x#$H2
1764 vmovd $h1#d,%x#$H3
1765 vmovd $h2#d,%x#$H4
1766 - jmp .Lproceed_avx2
1767 + jmp .Lproceed_avx2$suffix
1768
1769 .align 32
1770 -.Lstore_base2_64_avx2:
1771 +.Lstore_base2_64_avx2$suffix:
1772 mov $h0,0($ctx)
1773 mov $h1,8($ctx)
1774 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1775 - jmp .Ldone_avx2
1776 + jmp .Ldone_avx2$suffix
1777
1778 .align 16
1779 -.Lstore_base2_26_avx2:
1780 +.Lstore_base2_26_avx2$suffix:
1781 mov %rax#d,0($ctx) # store hash value base 2^26
1782 mov %rdx#d,4($ctx)
1783 mov $h0#d,8($ctx)
1784 mov $h1#d,12($ctx)
1785 mov $h2#d,16($ctx)
1786 .align 16
1787 -.Ldone_avx2:
1788 - mov 0(%rsp),%r15
1789 +.Ldone_avx2$suffix:
1790 + pop %r15
1791 .cfi_restore %r15
1792 - mov 8(%rsp),%r14
1793 + pop %r14
1794 .cfi_restore %r14
1795 - mov 16(%rsp),%r13
1796 + pop %r13
1797 .cfi_restore %r13
1798 - mov 24(%rsp),%r12
1799 + pop %r12
1800 .cfi_restore %r12
1801 - mov 32(%rsp),%rbp
1802 -.cfi_restore %rbp
1803 - mov 40(%rsp),%rbx
1804 + pop %rbx
1805 .cfi_restore %rbx
1806 - lea 48(%rsp),%rsp
1807 -.cfi_adjust_cfa_offset -48
1808 -.Lno_data_avx2:
1809 -.Lblocks_avx2_epilogue:
1810 + pop %rbp
1811 +.cfi_restore %rbp
1812 +.Lno_data_avx2$suffix:
1813 +.Lblocks_avx2_epilogue$suffix:
1814 ret
1815 .cfi_endproc
1816
1817 .align 32
1818 -.Lbase2_64_avx2:
1819 +.Lbase2_64_avx2$suffix:
1820 .cfi_startproc
1821 - push %rbx
1822 -.cfi_push %rbx
1823 push %rbp
1824 .cfi_push %rbp
1825 + mov %rsp,%rbp
1826 + push %rbx
1827 +.cfi_push %rbx
1828 push %r12
1829 .cfi_push %r12
1830 push %r13
1831 @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
1832 .cfi_push %r14
1833 push %r15
1834 .cfi_push %r15
1835 -.Lbase2_64_avx2_body:
1836 +.Lbase2_64_avx2_body$suffix:
1837
1838 mov $len,%r15 # reassign $len
1839
1840 @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
1841 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1842
1843 test \$63,$len
1844 - jz .Linit_avx2
1845 + jz .Linit_avx2$suffix
1846
1847 -.Lbase2_64_pre_avx2:
1848 +.Lbase2_64_pre_avx2$suffix:
1849 add 0($inp),$h0 # accumulate input
1850 adc 8($inp),$h1
1851 lea 16($inp),$inp
1852 @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
1853 mov $r1,%rax
1854
1855 test \$63,%r15
1856 - jnz .Lbase2_64_pre_avx2
1857 + jnz .Lbase2_64_pre_avx2$suffix
1858
1859 -.Linit_avx2:
1860 +.Linit_avx2$suffix:
1861 ################################# base 2^64 -> base 2^26
1862 mov $h0,%rax
1863 mov $h0,%rdx
1864 @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
1865
1866 call __poly1305_init_avx
1867
1868 -.Lproceed_avx2:
1869 +.Lproceed_avx2$suffix:
1870 mov %r15,$len # restore $len
1871 - mov OPENSSL_ia32cap_P+8(%rip),%r10d
1872 +___
1873 +$code.=<<___ if (!$kernel);
1874 + mov OPENSSL_ia32cap_P+8(%rip),%r9d
1875 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1876 -
1877 - mov 0(%rsp),%r15
1878 +___
1879 +$code.=<<___;
1880 + pop %r15
1881 .cfi_restore %r15
1882 - mov 8(%rsp),%r14
1883 + pop %r14
1884 .cfi_restore %r14
1885 - mov 16(%rsp),%r13
1886 + pop %r13
1887 .cfi_restore %r13
1888 - mov 24(%rsp),%r12
1889 + pop %r12
1890 .cfi_restore %r12
1891 - mov 32(%rsp),%rbp
1892 -.cfi_restore %rbp
1893 - mov 40(%rsp),%rbx
1894 + pop %rbx
1895 .cfi_restore %rbx
1896 - lea 48(%rsp),%rax
1897 - lea 48(%rsp),%rsp
1898 -.cfi_adjust_cfa_offset -48
1899 -.Lbase2_64_avx2_epilogue:
1900 - jmp .Ldo_avx2
1901 + pop %rbp
1902 +.cfi_restore %rbp
1903 +.Lbase2_64_avx2_epilogue$suffix:
1904 + jmp .Ldo_avx2$suffix
1905 .cfi_endproc
1906
1907 .align 32
1908 -.Leven_avx2:
1909 +.Leven_avx2$suffix:
1910 .cfi_startproc
1911 - mov OPENSSL_ia32cap_P+8(%rip),%r10d
1912 +___
1913 +$code.=<<___ if (!$kernel);
1914 + mov OPENSSL_ia32cap_P+8(%rip),%r9d
1915 +___
1916 +$code.=<<___;
1917 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1918 vmovd 4*1($ctx),%x#$H1
1919 vmovd 4*2($ctx),%x#$H2
1920 vmovd 4*3($ctx),%x#$H3
1921 vmovd 4*4($ctx),%x#$H4
1922
1923 -.Ldo_avx2:
1924 +.Ldo_avx2$suffix:
1925 ___
1926 -$code.=<<___ if ($avx>2);
1927 +$code.=<<___ if (!$kernel && $avx>2);
1928 cmp \$512,$len
1929 jb .Lskip_avx512
1930 - and %r11d,%r10d
1931 - test \$`1<<16`,%r10d # check for AVX512F
1932 + and %r11d,%r9d
1933 + test \$`1<<16`,%r9d # check for AVX512F
1934 jnz .Lblocks_avx512
1935 -.Lskip_avx512:
1936 +.Lskip_avx512$suffix:
1937 +___
1938 +$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1939 + cmp \$512,$len
1940 + jae .Lblocks_avx512
1941 ___
1942 $code.=<<___ if (!$win64);
1943 - lea -8(%rsp),%r11
1944 -.cfi_def_cfa %r11,16
1945 + lea 8(%rsp),%r10
1946 +.cfi_def_cfa_register %r10
1947 sub \$0x128,%rsp
1948 ___
1949 $code.=<<___ if ($win64);
1950 - lea -0xf8(%rsp),%r11
1951 + lea 8(%rsp),%r10
1952 sub \$0x1c8,%rsp
1953 - vmovdqa %xmm6,0x50(%r11)
1954 - vmovdqa %xmm7,0x60(%r11)
1955 - vmovdqa %xmm8,0x70(%r11)
1956 - vmovdqa %xmm9,0x80(%r11)
1957 - vmovdqa %xmm10,0x90(%r11)
1958 - vmovdqa %xmm11,0xa0(%r11)
1959 - vmovdqa %xmm12,0xb0(%r11)
1960 - vmovdqa %xmm13,0xc0(%r11)
1961 - vmovdqa %xmm14,0xd0(%r11)
1962 - vmovdqa %xmm15,0xe0(%r11)
1963 -.Ldo_avx2_body:
1964 + vmovdqa %xmm6,-0xb0(%r10)
1965 + vmovdqa %xmm7,-0xa0(%r10)
1966 + vmovdqa %xmm8,-0x90(%r10)
1967 + vmovdqa %xmm9,-0x80(%r10)
1968 + vmovdqa %xmm10,-0x70(%r10)
1969 + vmovdqa %xmm11,-0x60(%r10)
1970 + vmovdqa %xmm12,-0x50(%r10)
1971 + vmovdqa %xmm13,-0x40(%r10)
1972 + vmovdqa %xmm14,-0x30(%r10)
1973 + vmovdqa %xmm15,-0x20(%r10)
1974 +.Ldo_avx2_body$suffix:
1975 ___
1976 $code.=<<___;
1977 lea .Lconst(%rip),%rcx
1978 @@ -1794,11 +1901,11 @@ $code.=<<___;
1979
1980 vpaddq $H2,$T2,$H2 # accumulate input
1981 sub \$64,$len
1982 - jz .Ltail_avx2
1983 - jmp .Loop_avx2
1984 + jz .Ltail_avx2$suffix
1985 + jmp .Loop_avx2$suffix
1986
1987 .align 32
1988 -.Loop_avx2:
1989 +.Loop_avx2$suffix:
1990 ################################################################
1991 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1992 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1993 @@ -1946,10 +2053,10 @@ $code.=<<___;
1994 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1995
1996 sub \$64,$len
1997 - jnz .Loop_avx2
1998 + jnz .Loop_avx2$suffix
1999
2000 .byte 0x66,0x90
2001 -.Ltail_avx2:
2002 +.Ltail_avx2$suffix:
2003 ################################################################
2004 # while above multiplications were by r^4 in all lanes, in last
2005 # iteration we multiply least significant lane by r^4 and most
2006 @@ -2087,37 +2194,29 @@ $code.=<<___;
2007 vmovd %x#$H4,`4*4-48-64`($ctx)
2008 ___
2009 $code.=<<___ if ($win64);
2010 - vmovdqa 0x50(%r11),%xmm6
2011 - vmovdqa 0x60(%r11),%xmm7
2012 - vmovdqa 0x70(%r11),%xmm8
2013 - vmovdqa 0x80(%r11),%xmm9
2014 - vmovdqa 0x90(%r11),%xmm10
2015 - vmovdqa 0xa0(%r11),%xmm11
2016 - vmovdqa 0xb0(%r11),%xmm12
2017 - vmovdqa 0xc0(%r11),%xmm13
2018 - vmovdqa 0xd0(%r11),%xmm14
2019 - vmovdqa 0xe0(%r11),%xmm15
2020 - lea 0xf8(%r11),%rsp
2021 -.Ldo_avx2_epilogue:
2022 + vmovdqa -0xb0(%r10),%xmm6
2023 + vmovdqa -0xa0(%r10),%xmm7
2024 + vmovdqa -0x90(%r10),%xmm8
2025 + vmovdqa -0x80(%r10),%xmm9
2026 + vmovdqa -0x70(%r10),%xmm10
2027 + vmovdqa -0x60(%r10),%xmm11
2028 + vmovdqa -0x50(%r10),%xmm12
2029 + vmovdqa -0x40(%r10),%xmm13
2030 + vmovdqa -0x30(%r10),%xmm14
2031 + vmovdqa -0x20(%r10),%xmm15
2032 + lea -8(%r10),%rsp
2033 +.Ldo_avx2_epilogue$suffix:
2034 ___
2035 $code.=<<___ if (!$win64);
2036 - lea 8(%r11),%rsp
2037 -.cfi_def_cfa %rsp,8
2038 + lea -8(%r10),%rsp
2039 +.cfi_def_cfa_register %rsp
2040 ___
2041 $code.=<<___;
2042 vzeroupper
2043 ret
2044 .cfi_endproc
2045 -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2046 ___
2047 -#######################################################################
2048 -if ($avx>2) {
2049 -# On entry we have input length divisible by 64. But since inner loop
2050 -# processes 128 bytes per iteration, cases when length is not divisible
2051 -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2052 -# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2053 -# for this tail, we wouldn't have to even allocate stack frame...
2054 -
2055 +if($avx > 2 && $avx512) {
2056 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2057 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2058 my $PADBIT="%zmm30";
2059 @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2060 map(s/%y/%z/,($MASK));
2061
2062 $code.=<<___;
2063 -.type poly1305_blocks_avx512,\@function,4
2064 -.align 32
2065 -poly1305_blocks_avx512:
2066 .cfi_startproc
2067 .Lblocks_avx512:
2068 mov \$15,%eax
2069 kmovw %eax,%k2
2070 ___
2071 $code.=<<___ if (!$win64);
2072 - lea -8(%rsp),%r11
2073 -.cfi_def_cfa %r11,16
2074 + lea 8(%rsp),%r10
2075 +.cfi_def_cfa_register %r10
2076 sub \$0x128,%rsp
2077 ___
2078 $code.=<<___ if ($win64);
2079 - lea -0xf8(%rsp),%r11
2080 + lea 8(%rsp),%r10
2081 sub \$0x1c8,%rsp
2082 - vmovdqa %xmm6,0x50(%r11)
2083 - vmovdqa %xmm7,0x60(%r11)
2084 - vmovdqa %xmm8,0x70(%r11)
2085 - vmovdqa %xmm9,0x80(%r11)
2086 - vmovdqa %xmm10,0x90(%r11)
2087 - vmovdqa %xmm11,0xa0(%r11)
2088 - vmovdqa %xmm12,0xb0(%r11)
2089 - vmovdqa %xmm13,0xc0(%r11)
2090 - vmovdqa %xmm14,0xd0(%r11)
2091 - vmovdqa %xmm15,0xe0(%r11)
2092 + vmovdqa %xmm6,-0xb0(%r10)
2093 + vmovdqa %xmm7,-0xa0(%r10)
2094 + vmovdqa %xmm8,-0x90(%r10)
2095 + vmovdqa %xmm9,-0x80(%r10)
2096 + vmovdqa %xmm10,-0x70(%r10)
2097 + vmovdqa %xmm11,-0x60(%r10)
2098 + vmovdqa %xmm12,-0x50(%r10)
2099 + vmovdqa %xmm13,-0x40(%r10)
2100 + vmovdqa %xmm14,-0x30(%r10)
2101 + vmovdqa %xmm15,-0x20(%r10)
2102 .Ldo_avx512_body:
2103 ___
2104 $code.=<<___;
2105 @@ -2679,7 +2775,7 @@ $code.=<<___;
2106
2107 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2108 add \$64,$len
2109 - jnz .Ltail_avx2
2110 + jnz .Ltail_avx2$suffix
2111
2112 vpsubq $T2,$H2,$H2 # undo input accumulation
2113 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2114 @@ -2690,29 +2786,61 @@ $code.=<<___;
2115 vzeroall
2116 ___
2117 $code.=<<___ if ($win64);
2118 - movdqa 0x50(%r11),%xmm6
2119 - movdqa 0x60(%r11),%xmm7
2120 - movdqa 0x70(%r11),%xmm8
2121 - movdqa 0x80(%r11),%xmm9
2122 - movdqa 0x90(%r11),%xmm10
2123 - movdqa 0xa0(%r11),%xmm11
2124 - movdqa 0xb0(%r11),%xmm12
2125 - movdqa 0xc0(%r11),%xmm13
2126 - movdqa 0xd0(%r11),%xmm14
2127 - movdqa 0xe0(%r11),%xmm15
2128 - lea 0xf8(%r11),%rsp
2129 + movdqa -0xb0(%r10),%xmm6
2130 + movdqa -0xa0(%r10),%xmm7
2131 + movdqa -0x90(%r10),%xmm8
2132 + movdqa -0x80(%r10),%xmm9
2133 + movdqa -0x70(%r10),%xmm10
2134 + movdqa -0x60(%r10),%xmm11
2135 + movdqa -0x50(%r10),%xmm12
2136 + movdqa -0x40(%r10),%xmm13
2137 + movdqa -0x30(%r10),%xmm14
2138 + movdqa -0x20(%r10),%xmm15
2139 + lea -8(%r10),%rsp
2140 .Ldo_avx512_epilogue:
2141 ___
2142 $code.=<<___ if (!$win64);
2143 - lea 8(%r11),%rsp
2144 -.cfi_def_cfa %rsp,8
2145 + lea -8(%r10),%rsp
2146 +.cfi_def_cfa_register %rsp
2147 ___
2148 $code.=<<___;
2149 ret
2150 .cfi_endproc
2151 -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2152 ___
2153 -if ($avx>3) {
2154 +
2155 +}
2156 +
2157 +}
2158 +
2159 +&declare_function("poly1305_blocks_avx2", 32, 4);
2160 +poly1305_blocks_avxN(0);
2161 +&end_function("poly1305_blocks_avx2");
2162 +
2163 +if($kernel) {
2164 + $code .= "#endif\n";
2165 +}
2166 +
2167 +#######################################################################
2168 +if ($avx>2) {
2169 +# On entry we have input length divisible by 64. But since inner loop
2170 +# processes 128 bytes per iteration, cases when length is not divisible
2171 +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2172 +# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2173 +# for this tail, we wouldn't have to even allocate stack frame...
2174 +
2175 +if($kernel) {
2176 + $code .= "#ifdef CONFIG_AS_AVX512\n";
2177 +}
2178 +
2179 +&declare_function("poly1305_blocks_avx512", 32, 4);
2180 +poly1305_blocks_avxN(1);
2181 +&end_function("poly1305_blocks_avx512");
2182 +
2183 +if ($kernel) {
2184 + $code .= "#endif\n";
2185 +}
2186 +
2187 +if (!$kernel && $avx>3) {
2188 ########################################################################
2189 # VPMADD52 version using 2^44 radix.
2190 #
2191 @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
2192 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
2193 ___
2194 } } }
2195 -$code.=<<___;
2196 -.align 64
2197 -.Lconst:
2198 -.Lmask24:
2199 -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2200 -.L129:
2201 -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2202 -.Lmask26:
2203 -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2204 -.Lpermd_avx2:
2205 -.long 2,2,2,3,2,0,2,1
2206 -.Lpermd_avx512:
2207 -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
2208 -
2209 -.L2_44_inp_permd:
2210 -.long 0,1,1,2,2,3,7,7
2211 -.L2_44_inp_shift:
2212 -.quad 0,12,24,64
2213 -.L2_44_mask:
2214 -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
2215 -.L2_44_shift_rgt:
2216 -.quad 44,44,42,64
2217 -.L2_44_shift_lft:
2218 -.quad 8,8,10,64
2219 -
2220 -.align 64
2221 -.Lx_mask44:
2222 -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
2223 -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
2224 -.Lx_mask42:
2225 -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
2226 -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
2227 -___
2228 }
2229 -$code.=<<___;
2230 -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2231 -.align 16
2232 -___
2233
2234 +if (!$kernel)
2235 { # chacha20-poly1305 helpers
2236 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
2237 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
2238 @@ -4038,17 +4130,17 @@ avx_handler:
2239
2240 .section .pdata
2241 .align 4
2242 - .rva .LSEH_begin_poly1305_init
2243 - .rva .LSEH_end_poly1305_init
2244 - .rva .LSEH_info_poly1305_init
2245 -
2246 - .rva .LSEH_begin_poly1305_blocks
2247 - .rva .LSEH_end_poly1305_blocks
2248 - .rva .LSEH_info_poly1305_blocks
2249 -
2250 - .rva .LSEH_begin_poly1305_emit
2251 - .rva .LSEH_end_poly1305_emit
2252 - .rva .LSEH_info_poly1305_emit
2253 + .rva .LSEH_begin_poly1305_init_x86_64
2254 + .rva .LSEH_end_poly1305_init_x86_64
2255 + .rva .LSEH_info_poly1305_init_x86_64
2256 +
2257 + .rva .LSEH_begin_poly1305_blocks_x86_64
2258 + .rva .LSEH_end_poly1305_blocks_x86_64
2259 + .rva .LSEH_info_poly1305_blocks_x86_64
2260 +
2261 + .rva .LSEH_begin_poly1305_emit_x86_64
2262 + .rva .LSEH_end_poly1305_emit_x86_64
2263 + .rva .LSEH_info_poly1305_emit_x86_64
2264 ___
2265 $code.=<<___ if ($avx);
2266 .rva .LSEH_begin_poly1305_blocks_avx
2267 @@ -4088,20 +4180,20 @@ ___
2268 $code.=<<___;
2269 .section .xdata
2270 .align 8
2271 -.LSEH_info_poly1305_init:
2272 +.LSEH_info_poly1305_init_x86_64:
2273 .byte 9,0,0,0
2274 .rva se_handler
2275 - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
2276 + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
2277
2278 -.LSEH_info_poly1305_blocks:
2279 +.LSEH_info_poly1305_blocks_x86_64:
2280 .byte 9,0,0,0
2281 .rva se_handler
2282 .rva .Lblocks_body,.Lblocks_epilogue
2283
2284 -.LSEH_info_poly1305_emit:
2285 +.LSEH_info_poly1305_emit_x86_64:
2286 .byte 9,0,0,0
2287 .rva se_handler
2288 - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
2289 + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
2290 ___
2291 $code.=<<___ if ($avx);
2292 .LSEH_info_poly1305_blocks_avx_1:
2293 @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
2294 ___
2295 }
2296
2297 +open SELF,$0;
2298 +while(<SELF>) {
2299 + next if (/^#!/);
2300 + last if (!s/^#/\/\// and !/^$/);
2301 + print;
2302 +}
2303 +close SELF;
2304 +
2305 foreach (split('\n',$code)) {
2306 s/\`([^\`]*)\`/eval($1)/ge;
2307 s/%r([a-z]+)#d/%e$1/g;
2308 s/%r([0-9]+)#d/%r$1d/g;
2309 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
2310
2311 + if ($kernel) {
2312 + s/(^\.type.*),[0-9]+$/\1/;
2313 + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
2314 + next if /^\.cfi.*/;
2315 + }
2316 +
2317 print $_,"\n";
2318 }
2319 close STDOUT;
2320 --- a/arch/x86/crypto/poly1305_glue.c
2321 +++ b/arch/x86/crypto/poly1305_glue.c
2322 @@ -1,8 +1,6 @@
2323 -// SPDX-License-Identifier: GPL-2.0-or-later
2324 +// SPDX-License-Identifier: GPL-2.0 OR MIT
2325 /*
2326 - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
2327 - *
2328 - * Copyright (C) 2015 Martin Willi
2329 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
2330 */
2331
2332 #include <crypto/algapi.h>
2333 @@ -13,279 +11,170 @@
2334 #include <linux/jump_label.h>
2335 #include <linux/kernel.h>
2336 #include <linux/module.h>
2337 +#include <asm/intel-family.h>
2338 #include <asm/simd.h>
2339
2340 -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
2341 - const u32 *r, unsigned int blocks);
2342 -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
2343 - unsigned int blocks, const u32 *u);
2344 -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
2345 - unsigned int blocks, const u32 *u);
2346 +asmlinkage void poly1305_init_x86_64(void *ctx,
2347 + const u8 key[POLY1305_KEY_SIZE]);
2348 +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
2349 + const size_t len, const u32 padbit);
2350 +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2351 + const u32 nonce[4]);
2352 +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2353 + const u32 nonce[4]);
2354 +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
2355 + const u32 padbit);
2356 +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
2357 + const u32 padbit);
2358 +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
2359 + const size_t len, const u32 padbit);
2360
2361 -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
2362 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
2363 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
2364 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
2365
2366 -static inline u64 mlt(u64 a, u64 b)
2367 -{
2368 - return a * b;
2369 -}
2370 -
2371 -static inline u32 sr(u64 v, u_char n)
2372 -{
2373 - return v >> n;
2374 -}
2375 -
2376 -static inline u32 and(u32 v, u32 mask)
2377 -{
2378 - return v & mask;
2379 -}
2380 -
2381 -static void poly1305_simd_mult(u32 *a, const u32 *b)
2382 -{
2383 - u8 m[POLY1305_BLOCK_SIZE];
2384 -
2385 - memset(m, 0, sizeof(m));
2386 - /* The poly1305 block function adds a hi-bit to the accumulator which
2387 - * we don't need for key multiplication; compensate for it. */
2388 - a[4] -= 1 << 24;
2389 - poly1305_block_sse2(a, m, b, 1);
2390 -}
2391 -
2392 -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
2393 -{
2394 - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
2395 - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
2396 - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
2397 - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
2398 - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
2399 - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
2400 -}
2401 +struct poly1305_arch_internal {
2402 + union {
2403 + struct {
2404 + u32 h[5];
2405 + u32 is_base2_26;
2406 + };
2407 + u64 hs[3];
2408 + };
2409 + u64 r[2];
2410 + u64 pad;
2411 + struct { u32 r2, r1, r4, r3; } rn[9];
2412 +};
2413
2414 -static void poly1305_integer_blocks(struct poly1305_state *state,
2415 - const struct poly1305_key *key,
2416 - const void *src,
2417 - unsigned int nblocks, u32 hibit)
2418 +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
2419 + * the unfortunate situation of using AVX and then having to go back to scalar
2420 + * -- because the user is silly and has called the update function from two
2421 + * separate contexts -- then we need to convert back to the original base before
2422 + * proceeding. It is possible to reason that the initial reduction below is
2423 + * sufficient given the implementation invariants. However, for an avoidance of
2424 + * doubt and because this is not performance critical, we do the full reduction
2425 + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
2426 + */
2427 +static void convert_to_base2_64(void *ctx)
2428 {
2429 - u32 r0, r1, r2, r3, r4;
2430 - u32 s1, s2, s3, s4;
2431 - u32 h0, h1, h2, h3, h4;
2432 - u64 d0, d1, d2, d3, d4;
2433 + struct poly1305_arch_internal *state = ctx;
2434 + u32 cy;
2435
2436 - if (!nblocks)
2437 + if (!state->is_base2_26)
2438 return;
2439
2440 - r0 = key->r[0];
2441 - r1 = key->r[1];
2442 - r2 = key->r[2];
2443 - r3 = key->r[3];
2444 - r4 = key->r[4];
2445 -
2446 - s1 = r1 * 5;
2447 - s2 = r2 * 5;
2448 - s3 = r3 * 5;
2449 - s4 = r4 * 5;
2450 -
2451 - h0 = state->h[0];
2452 - h1 = state->h[1];
2453 - h2 = state->h[2];
2454 - h3 = state->h[3];
2455 - h4 = state->h[4];
2456 -
2457 - do {
2458 - /* h += m[i] */
2459 - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
2460 - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
2461 - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
2462 - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
2463 - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
2464 -
2465 - /* h *= r */
2466 - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
2467 - mlt(h3, s2) + mlt(h4, s1);
2468 - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
2469 - mlt(h3, s3) + mlt(h4, s2);
2470 - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
2471 - mlt(h3, s4) + mlt(h4, s3);
2472 - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
2473 - mlt(h3, r0) + mlt(h4, s4);
2474 - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
2475 - mlt(h3, r1) + mlt(h4, r0);
2476 -
2477 - /* (partial) h %= p */
2478 - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
2479 - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
2480 - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
2481 - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
2482 - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
2483 - h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
2484 -
2485 - src += POLY1305_BLOCK_SIZE;
2486 - } while (--nblocks);
2487 -
2488 - state->h[0] = h0;
2489 - state->h[1] = h1;
2490 - state->h[2] = h2;
2491 - state->h[3] = h3;
2492 - state->h[4] = h4;
2493 -}
2494 -
2495 -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
2496 -{
2497 - u32 h0, h1, h2, h3, h4;
2498 - u32 g0, g1, g2, g3, g4;
2499 - u32 mask;
2500 -
2501 - /* fully carry h */
2502 - h0 = state->h[0];
2503 - h1 = state->h[1];
2504 - h2 = state->h[2];
2505 - h3 = state->h[3];
2506 - h4 = state->h[4];
2507 -
2508 - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
2509 - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
2510 - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
2511 - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
2512 - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
2513 -
2514 - /* compute h + -p */
2515 - g0 = h0 + 5;
2516 - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
2517 - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
2518 - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
2519 - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
2520 -
2521 - /* select h if h < p, or h + -p if h >= p */
2522 - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
2523 - g0 &= mask;
2524 - g1 &= mask;
2525 - g2 &= mask;
2526 - g3 &= mask;
2527 - g4 &= mask;
2528 - mask = ~mask;
2529 - h0 = (h0 & mask) | g0;
2530 - h1 = (h1 & mask) | g1;
2531 - h2 = (h2 & mask) | g2;
2532 - h3 = (h3 & mask) | g3;
2533 - h4 = (h4 & mask) | g4;
2534 -
2535 - /* h = h % (2^128) */
2536 - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
2537 - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
2538 - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
2539 - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
2540 -}
2541 -
2542 -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
2543 -{
2544 - poly1305_integer_setkey(desc->opaque_r, key);
2545 - desc->s[0] = get_unaligned_le32(key + 16);
2546 - desc->s[1] = get_unaligned_le32(key + 20);
2547 - desc->s[2] = get_unaligned_le32(key + 24);
2548 - desc->s[3] = get_unaligned_le32(key + 28);
2549 - poly1305_core_init(&desc->h);
2550 - desc->buflen = 0;
2551 - desc->sset = true;
2552 - desc->rset = 1;
2553 -}
2554 -EXPORT_SYMBOL_GPL(poly1305_init_arch);
2555 -
2556 -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
2557 - const u8 *src, unsigned int srclen)
2558 -{
2559 - if (!dctx->sset) {
2560 - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
2561 - poly1305_integer_setkey(dctx->r, src);
2562 - src += POLY1305_BLOCK_SIZE;
2563 - srclen -= POLY1305_BLOCK_SIZE;
2564 - dctx->rset = 1;
2565 - }
2566 - if (srclen >= POLY1305_BLOCK_SIZE) {
2567 - dctx->s[0] = get_unaligned_le32(src + 0);
2568 - dctx->s[1] = get_unaligned_le32(src + 4);
2569 - dctx->s[2] = get_unaligned_le32(src + 8);
2570 - dctx->s[3] = get_unaligned_le32(src + 12);
2571 - src += POLY1305_BLOCK_SIZE;
2572 - srclen -= POLY1305_BLOCK_SIZE;
2573 - dctx->sset = true;
2574 - }
2575 + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
2576 + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
2577 + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
2578 + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
2579 + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
2580 + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
2581 + state->hs[2] = state->h[4] >> 24;
2582 +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
2583 + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
2584 + state->hs[2] &= 3;
2585 + state->hs[0] += cy;
2586 + state->hs[1] += (cy = ULT(state->hs[0], cy));
2587 + state->hs[2] += ULT(state->hs[1], cy);
2588 +#undef ULT
2589 + state->is_base2_26 = 0;
2590 +}
2591 +
2592 +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
2593 +{
2594 + poly1305_init_x86_64(ctx, key);
2595 +}
2596 +
2597 +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
2598 + const u32 padbit)
2599 +{
2600 + struct poly1305_arch_internal *state = ctx;
2601 +
2602 + /* SIMD disables preemption, so relax after processing each page. */
2603 + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
2604 + PAGE_SIZE % POLY1305_BLOCK_SIZE);
2605 +
2606 + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
2607 + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
2608 + !crypto_simd_usable()) {
2609 + convert_to_base2_64(ctx);
2610 + poly1305_blocks_x86_64(ctx, inp, len, padbit);
2611 + return;
2612 }
2613 - return srclen;
2614 -}
2615
2616 -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
2617 - const u8 *src, unsigned int srclen)
2618 -{
2619 - unsigned int datalen;
2620 + for (;;) {
2621 + const size_t bytes = min_t(size_t, len, PAGE_SIZE);
2622
2623 - if (unlikely(!dctx->sset)) {
2624 - datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
2625 - src += srclen - datalen;
2626 - srclen = datalen;
2627 - }
2628 - if (srclen >= POLY1305_BLOCK_SIZE) {
2629 - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
2630 - srclen / POLY1305_BLOCK_SIZE, 1);
2631 - srclen %= POLY1305_BLOCK_SIZE;
2632 + kernel_fpu_begin();
2633 + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
2634 + poly1305_blocks_avx512(ctx, inp, bytes, padbit);
2635 + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
2636 + poly1305_blocks_avx2(ctx, inp, bytes, padbit);
2637 + else
2638 + poly1305_blocks_avx(ctx, inp, bytes, padbit);
2639 + kernel_fpu_end();
2640 + len -= bytes;
2641 + if (!len)
2642 + break;
2643 + inp += bytes;
2644 }
2645 - return srclen;
2646 }
2647
2648 -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
2649 - const u8 *src, unsigned int srclen)
2650 -{
2651 - unsigned int blocks, datalen;
2652 +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
2653 + const u32 nonce[4])
2654 +{
2655 + struct poly1305_arch_internal *state = ctx;
2656 +
2657 + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
2658 + !state->is_base2_26 || !crypto_simd_usable()) {
2659 + convert_to_base2_64(ctx);
2660 + poly1305_emit_x86_64(ctx, mac, nonce);
2661 + } else
2662 + poly1305_emit_avx(ctx, mac, nonce);
2663 +}
2664 +
2665 +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
2666 +{
2667 + poly1305_simd_init(&dctx->h, key);
2668 + dctx->s[0] = get_unaligned_le32(&key[16]);
2669 + dctx->s[1] = get_unaligned_le32(&key[20]);
2670 + dctx->s[2] = get_unaligned_le32(&key[24]);
2671 + dctx->s[3] = get_unaligned_le32(&key[28]);
2672 + dctx->buflen = 0;
2673 + dctx->sset = true;
2674 +}
2675 +EXPORT_SYMBOL(poly1305_init_arch);
2676
2677 +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
2678 + const u8 *inp, unsigned int len)
2679 +{
2680 + unsigned int acc = 0;
2681 if (unlikely(!dctx->sset)) {
2682 - datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
2683 - src += srclen - datalen;
2684 - srclen = datalen;
2685 - }
2686 -
2687 - if (IS_ENABLED(CONFIG_AS_AVX2) &&
2688 - static_branch_likely(&poly1305_use_avx2) &&
2689 - srclen >= POLY1305_BLOCK_SIZE * 4) {
2690 - if (unlikely(dctx->rset < 4)) {
2691 - if (dctx->rset < 2) {
2692 - dctx->r[1] = dctx->r[0];
2693 - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
2694 - }
2695 - dctx->r[2] = dctx->r[1];
2696 - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
2697 - dctx->r[3] = dctx->r[2];
2698 - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
2699 - dctx->rset = 4;
2700 + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
2701 + poly1305_simd_init(&dctx->h, inp);
2702 + inp += POLY1305_BLOCK_SIZE;
2703 + len -= POLY1305_BLOCK_SIZE;
2704 + acc += POLY1305_BLOCK_SIZE;
2705 + dctx->rset = 1;
2706 }
2707 - blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
2708 - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
2709 - dctx->r[1].r);
2710 - src += POLY1305_BLOCK_SIZE * 4 * blocks;
2711 - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
2712 - }
2713 -
2714 - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
2715 - if (unlikely(dctx->rset < 2)) {
2716 - dctx->r[1] = dctx->r[0];
2717 - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
2718 - dctx->rset = 2;
2719 + if (len >= POLY1305_BLOCK_SIZE) {
2720 + dctx->s[0] = get_unaligned_le32(&inp[0]);
2721 + dctx->s[1] = get_unaligned_le32(&inp[4]);
2722 + dctx->s[2] = get_unaligned_le32(&inp[8]);
2723 + dctx->s[3] = get_unaligned_le32(&inp[12]);
2724 + inp += POLY1305_BLOCK_SIZE;
2725 + len -= POLY1305_BLOCK_SIZE;
2726 + acc += POLY1305_BLOCK_SIZE;
2727 + dctx->sset = true;
2728 }
2729 - blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
2730 - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
2731 - blocks, dctx->r[1].r);
2732 - src += POLY1305_BLOCK_SIZE * 2 * blocks;
2733 - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
2734 - }
2735 - if (srclen >= POLY1305_BLOCK_SIZE) {
2736 - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
2737 - srclen -= POLY1305_BLOCK_SIZE;
2738 }
2739 - return srclen;
2740 + return acc;
2741 }
2742
2743 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
2744 unsigned int srclen)
2745 {
2746 - unsigned int bytes;
2747 + unsigned int bytes, used;
2748
2749 if (unlikely(dctx->buflen)) {
2750 bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
2751 @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
2752 dctx->buflen += bytes;
2753
2754 if (dctx->buflen == POLY1305_BLOCK_SIZE) {
2755 - if (static_branch_likely(&poly1305_use_simd) &&
2756 - likely(crypto_simd_usable())) {
2757 - kernel_fpu_begin();
2758 - poly1305_simd_blocks(dctx, dctx->buf,
2759 - POLY1305_BLOCK_SIZE);
2760 - kernel_fpu_end();
2761 - } else {
2762 - poly1305_scalar_blocks(dctx, dctx->buf,
2763 - POLY1305_BLOCK_SIZE);
2764 - }
2765 + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
2766 + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
2767 dctx->buflen = 0;
2768 }
2769 }
2770
2771 if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
2772 - if (static_branch_likely(&poly1305_use_simd) &&
2773 - likely(crypto_simd_usable())) {
2774 - kernel_fpu_begin();
2775 - bytes = poly1305_simd_blocks(dctx, src, srclen);
2776 - kernel_fpu_end();
2777 - } else {
2778 - bytes = poly1305_scalar_blocks(dctx, src, srclen);
2779 - }
2780 - src += srclen - bytes;
2781 - srclen = bytes;
2782 + bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
2783 + srclen -= bytes;
2784 + used = crypto_poly1305_setdctxkey(dctx, src, bytes);
2785 + if (likely(bytes - used))
2786 + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
2787 + src += bytes;
2788 }
2789
2790 if (unlikely(srclen)) {
2791 @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
2792 }
2793 EXPORT_SYMBOL(poly1305_update_arch);
2794
2795 -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
2796 +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
2797 {
2798 - __le32 digest[4];
2799 - u64 f = 0;
2800 -
2801 - if (unlikely(desc->buflen)) {
2802 - desc->buf[desc->buflen++] = 1;
2803 - memset(desc->buf + desc->buflen, 0,
2804 - POLY1305_BLOCK_SIZE - desc->buflen);
2805 - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
2806 + if (unlikely(dctx->buflen)) {
2807 + dctx->buf[dctx->buflen++] = 1;
2808 + memset(dctx->buf + dctx->buflen, 0,
2809 + POLY1305_BLOCK_SIZE - dctx->buflen);
2810 + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
2811 }
2812
2813 - poly1305_integer_emit(&desc->h, digest);
2814 -
2815 - /* mac = (h + s) % (2^128) */
2816 - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
2817 - put_unaligned_le32(f, dst + 0);
2818 - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
2819 - put_unaligned_le32(f, dst + 4);
2820 - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
2821 - put_unaligned_le32(f, dst + 8);
2822 - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
2823 - put_unaligned_le32(f, dst + 12);
2824 -
2825 - *desc = (struct poly1305_desc_ctx){};
2826 + poly1305_simd_emit(&dctx->h, dst, dctx->s);
2827 + *dctx = (struct poly1305_desc_ctx){};
2828 }
2829 EXPORT_SYMBOL(poly1305_final_arch);
2830
2831 @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
2832 {
2833 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2834
2835 - poly1305_core_init(&dctx->h);
2836 - dctx->buflen = 0;
2837 - dctx->rset = 0;
2838 - dctx->sset = false;
2839 -
2840 + *dctx = (struct poly1305_desc_ctx){};
2841 return 0;
2842 }
2843
2844 -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
2845 +static int crypto_poly1305_update(struct shash_desc *desc,
2846 + const u8 *src, unsigned int srclen)
2847 {
2848 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2849
2850 - if (unlikely(!dctx->sset))
2851 - return -ENOKEY;
2852 -
2853 - poly1305_final_arch(dctx, dst);
2854 + poly1305_update_arch(dctx, src, srclen);
2855 return 0;
2856 }
2857
2858 -static int poly1305_simd_update(struct shash_desc *desc,
2859 - const u8 *src, unsigned int srclen)
2860 +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
2861 {
2862 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
2863
2864 - poly1305_update_arch(dctx, src, srclen);
2865 + if (unlikely(!dctx->sset))
2866 + return -ENOKEY;
2867 +
2868 + poly1305_final_arch(dctx, dst);
2869 return 0;
2870 }
2871
2872 static struct shash_alg alg = {
2873 .digestsize = POLY1305_DIGEST_SIZE,
2874 .init = crypto_poly1305_init,
2875 - .update = poly1305_simd_update,
2876 + .update = crypto_poly1305_update,
2877 .final = crypto_poly1305_final,
2878 .descsize = sizeof(struct poly1305_desc_ctx),
2879 .base = {
2880 @@ -406,17 +265,19 @@ static struct shash_alg alg = {
2881
2882 static int __init poly1305_simd_mod_init(void)
2883 {
2884 - if (!boot_cpu_has(X86_FEATURE_XMM2))
2885 - return 0;
2886 -
2887 - static_branch_enable(&poly1305_use_simd);
2888 -
2889 - if (IS_ENABLED(CONFIG_AS_AVX2) &&
2890 - boot_cpu_has(X86_FEATURE_AVX) &&
2891 + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
2892 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
2893 + static_branch_enable(&poly1305_use_avx);
2894 + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
2895 boot_cpu_has(X86_FEATURE_AVX2) &&
2896 cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
2897 static_branch_enable(&poly1305_use_avx2);
2898 -
2899 + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
2900 + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
2901 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
2902 + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
2903 + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
2904 + static_branch_enable(&poly1305_use_avx512);
2905 return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
2906 }
2907
2908 @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
2909 module_exit(poly1305_simd_mod_exit);
2910
2911 MODULE_LICENSE("GPL");
2912 -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
2913 +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
2914 MODULE_DESCRIPTION("Poly1305 authenticator");
2915 MODULE_ALIAS_CRYPTO("poly1305");
2916 MODULE_ALIAS_CRYPTO("poly1305-simd");
2917 --- a/lib/crypto/Kconfig
2918 +++ b/lib/crypto/Kconfig
2919 @@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
2920 config CRYPTO_LIB_POLY1305_RSIZE
2921 int
2922 default 2 if MIPS
2923 - default 4 if X86_64
2924 + default 11 if X86_64
2925 default 9 if ARM || ARM64
2926 default 1
2927