kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Fri, 8 Nov 2019 13:22:31 +0100
4 Subject: [PATCH] crypto: blake2s - x86_64 SIMD implementation
5
6 commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
7
8 These implementations from Samuel Neves support AVX and AVX-512VL.
9 Originally this used AVX-512F, but Skylake thermal throttling made
10 AVX-512VL more attractive and possible to do with negligible difference.
11
12 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
13 Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
14 Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
15 [ardb: move to arch/x86/crypto, wire into lib/crypto framework]
16 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
17 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
18 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
19 ---
20 arch/x86/crypto/Makefile | 2 +
21 arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++
22 arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++
23 crypto/Kconfig | 6 +
24 4 files changed, 499 insertions(+)
25 create mode 100644 arch/x86/crypto/blake2s-core.S
26 create mode 100644 arch/x86/crypto/blake2s-glue.c
27
28 --- a/arch/x86/crypto/Makefile
29 +++ b/arch/x86/crypto/Makefile
30 @@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
31 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
32 obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
33 obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
34 + obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
35 endif
36
37 # These modules require assembler to support AVX2.
38 @@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
39 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
40
41 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
42 +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
43
44 ifeq ($(avx_supported),yes)
45 camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
46 --- /dev/null
47 +++ b/arch/x86/crypto/blake2s-core.S
48 @@ -0,0 +1,258 @@
49 +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
50 +/*
51 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
52 + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
53 + */
54 +
55 +#include <linux/linkage.h>
56 +
57 +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
58 +.align 32
59 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 /* first 16 bytes: loaded as working row 2 at the start of every block */
60 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F /* second 16 bytes: XORed into the row read from state+0x20 */
61 +.section .rodata.cst16.ROT16, "aM", @progbits, 16
62 +.align 16
63 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 /* pshufb mask: swaps the 16-bit halves of each dword (rotate by 16) */
64 +.section .rodata.cst16.ROR328, "aM", @progbits, 16
65 +.align 16
66 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 /* pshufb mask: rotates each dword right by 8 bits */
67 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
68 +.align 64
69 +SIGMA: /* SSSE3 path: 10 rows of 16 byte-sized message word indices, one row per round */
70 +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
71 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
72 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
73 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
74 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
75 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
76 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
77 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
78 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
79 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
80 +#ifdef CONFIG_AS_AVX512
81 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
82 +.align 64
83 +SIGMA2: /* AVX-512 path: 10 rows of 16 dword indices for vpermi2d; each row is applied to the previous round's already-permuted message */
84 +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
85 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
86 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
87 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
88 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
89 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
90 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
91 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
92 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
93 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
94 +#endif /* CONFIG_AS_AVX512 */
95 +
96 +.text
97 +#ifdef CONFIG_AS_SSSE3
98 +ENTRY(blake2s_compress_ssse3) /* args as used below: %rdi = state, %rsi = block, %rdx = nblocks, %rcx = inc */
99 + testq %rdx,%rdx
100 + je .Lendofloop /* nblocks == 0: return without touching *state */
101 + movdqu (%rdi),%xmm0 /* xmm0 = state bytes 0x00-0x0f (working row 0) */
102 + movdqu 0x10(%rdi),%xmm1 /* xmm1 = state bytes 0x10-0x1f (working row 1) */
103 + movdqa ROT16(%rip),%xmm12 /* shuffle mask for the rotate-by-16 steps */
104 + movdqa ROR328(%rip),%xmm13 /* shuffle mask for the rotate-right-by-8 steps */
105 + movdqu 0x20(%rdi),%xmm14 /* xmm14 = state bytes 0x20-0x2f; presumably the counter/flag words (struct layout not visible here) */
106 + movq %rcx,%xmm15 /* xmm15 = inc in the low qword, upper qword zero; %rcx is reused below */
107 + leaq SIGMA+0xa0(%rip),%r8 /* r8 = end of SIGMA (10 rows * 16 bytes) */
108 + jmp .Lbeginofloop /* jump over the alignment padding */
109 + .align 32
110 +.Lbeginofloop: /* one iteration per input block */
111 + movdqa %xmm0,%xmm10 /* save the incoming rows 0/1 for the final XOR */
112 + movdqa %xmm1,%xmm11
113 + paddq %xmm15,%xmm14 /* low qword of the state+0x20 row += inc */
114 + movdqa IV(%rip),%xmm2 /* working row 2 = first half of IV */
115 + movdqa %xmm14,%xmm3
116 + pxor IV+0x10(%rip),%xmm3 /* working row 3 = updated state+0x20 row ^ second half of IV */
117 + leaq SIGMA(%rip),%rcx /* rcx = current SIGMA row */
118 +.Lroundloop: /* one iteration per 16-byte SIGMA row */
119 + movzbl (%rcx),%eax /* eax = message word index (write to eax zero-extends rax) */
120 + movd (%rsi,%rax,4),%xmm4 /* load that 32-bit message word */
121 + movzbl 0x1(%rcx),%eax
122 + movd (%rsi,%rax,4),%xmm5
123 + movzbl 0x2(%rcx),%eax
124 + movd (%rsi,%rax,4),%xmm6
125 + movzbl 0x3(%rcx),%eax
126 + movd (%rsi,%rax,4),%xmm7
127 + punpckldq %xmm5,%xmm4
128 + punpckldq %xmm7,%xmm6
129 + punpcklqdq %xmm6,%xmm4 /* xmm4 = the four gathered message words */
130 + paddd %xmm4,%xmm0 /* row0 += m */
131 + paddd %xmm1,%xmm0 /* row0 += row1 */
132 + pxor %xmm0,%xmm3 /* row3 ^= row0 */
133 + pshufb %xmm12,%xmm3 /* row3: rotate each dword by 16 */
134 + paddd %xmm3,%xmm2 /* row2 += row3 */
135 + pxor %xmm2,%xmm1 /* row1 ^= row2 */
136 + movdqa %xmm1,%xmm8
137 + psrld $0xc,%xmm1
138 + pslld $0x14,%xmm8
139 + por %xmm8,%xmm1 /* row1: rotate each dword right by 12 */
140 + movzbl 0x4(%rcx),%eax /* gather message words for indices 4-7 of this row */
141 + movd (%rsi,%rax,4),%xmm5
142 + movzbl 0x5(%rcx),%eax
143 + movd (%rsi,%rax,4),%xmm6
144 + movzbl 0x6(%rcx),%eax
145 + movd (%rsi,%rax,4),%xmm7
146 + movzbl 0x7(%rcx),%eax
147 + movd (%rsi,%rax,4),%xmm4
148 + punpckldq %xmm6,%xmm5
149 + punpckldq %xmm4,%xmm7
150 + punpcklqdq %xmm7,%xmm5
151 + paddd %xmm5,%xmm0
152 + paddd %xmm1,%xmm0
153 + pxor %xmm0,%xmm3
154 + pshufb %xmm13,%xmm3 /* row3: rotate each dword right by 8 */
155 + paddd %xmm3,%xmm2
156 + pxor %xmm2,%xmm1
157 + movdqa %xmm1,%xmm8
158 + psrld $0x7,%xmm1
159 + pslld $0x19,%xmm8
160 + por %xmm8,%xmm1 /* row1: rotate each dword right by 7 */
161 + pshufd $0x93,%xmm0,%xmm0 /* rotate the dword lanes of rows 0, 3 and 2 (row 1 stays) so the next half-round mixes diagonals */
162 + pshufd $0x4e,%xmm3,%xmm3
163 + pshufd $0x39,%xmm2,%xmm2
164 + movzbl 0x8(%rcx),%eax /* gather message words for indices 8-11 of this row */
165 + movd (%rsi,%rax,4),%xmm6
166 + movzbl 0x9(%rcx),%eax
167 + movd (%rsi,%rax,4),%xmm7
168 + movzbl 0xa(%rcx),%eax
169 + movd (%rsi,%rax,4),%xmm4
170 + movzbl 0xb(%rcx),%eax
171 + movd (%rsi,%rax,4),%xmm5
172 + punpckldq %xmm7,%xmm6
173 + punpckldq %xmm5,%xmm4
174 + punpcklqdq %xmm4,%xmm6
175 + paddd %xmm6,%xmm0
176 + paddd %xmm1,%xmm0
177 + pxor %xmm0,%xmm3
178 + pshufb %xmm12,%xmm3 /* rotate by 16 */
179 + paddd %xmm3,%xmm2
180 + pxor %xmm2,%xmm1
181 + movdqa %xmm1,%xmm8
182 + psrld $0xc,%xmm1
183 + pslld $0x14,%xmm8
184 + por %xmm8,%xmm1 /* rotate right by 12 */
185 + movzbl 0xc(%rcx),%eax /* gather message words for indices 12-15 of this row */
186 + movd (%rsi,%rax,4),%xmm7
187 + movzbl 0xd(%rcx),%eax
188 + movd (%rsi,%rax,4),%xmm4
189 + movzbl 0xe(%rcx),%eax
190 + movd (%rsi,%rax,4),%xmm5
191 + movzbl 0xf(%rcx),%eax
192 + movd (%rsi,%rax,4),%xmm6
193 + punpckldq %xmm4,%xmm7
194 + punpckldq %xmm6,%xmm5
195 + punpcklqdq %xmm5,%xmm7
196 + paddd %xmm7,%xmm0
197 + paddd %xmm1,%xmm0
198 + pxor %xmm0,%xmm3
199 + pshufb %xmm13,%xmm3 /* rotate right by 8 */
200 + paddd %xmm3,%xmm2
201 + pxor %xmm2,%xmm1
202 + movdqa %xmm1,%xmm8
203 + psrld $0x7,%xmm1
204 + pslld $0x19,%xmm8
205 + por %xmm8,%xmm1 /* rotate right by 7 */
206 + pshufd $0x39,%xmm0,%xmm0 /* undo the lane rotations applied before the diagonal half-round */
207 + pshufd $0x4e,%xmm3,%xmm3
208 + pshufd $0x93,%xmm2,%xmm2
209 + addq $0x10,%rcx /* next SIGMA row */
210 + cmpq %r8,%rcx
211 + jnz .Lroundloop /* loop until all 10 rows are consumed */
212 + pxor %xmm2,%xmm0 /* row0 ^= row2 */
213 + pxor %xmm3,%xmm1 /* row1 ^= row3 */
214 + pxor %xmm10,%xmm0 /* fold in the rows saved at the top of this block */
215 + pxor %xmm11,%xmm1
216 + addq $0x40,%rsi /* advance to the next 64-byte input block */
217 + decq %rdx
218 + jnz .Lbeginofloop
219 + movdqu %xmm0,(%rdi) /* write the state back once, after the last block */
220 + movdqu %xmm1,0x10(%rdi)
221 + movdqu %xmm14,0x20(%rdi)
222 +.Lendofloop:
223 + ret
224 +ENDPROC(blake2s_compress_ssse3)
225 +#endif /* CONFIG_AS_SSSE3 */
226 +
227 +#ifdef CONFIG_AS_AVX512
228 +ENTRY(blake2s_compress_avx512) /* args as used below: %rdi = state, %rsi = block, %rdx = nblocks, %rcx = inc. NOTE(review): unlike blake2s_compress_ssse3 there is no nblocks == 0 guard; a zero count would wrap at the decq %rdx below - confirm callers never pass 0 */
229 + vmovdqu (%rdi),%xmm0 /* xmm0/xmm1 = state bytes 0x00-0x1f (working rows 0 and 1) */
230 + vmovdqu 0x10(%rdi),%xmm1
231 + vmovdqu 0x20(%rdi),%xmm4 /* xmm4 = state bytes 0x20-0x2f; presumably the counter/flag words (struct layout not visible here) */
232 + vmovq %rcx,%xmm5 /* xmm5 = inc in the low qword, upper qword zero */
233 + vmovdqa IV(%rip),%xmm14 /* both IV halves stay in registers across blocks */
234 + vmovdqa IV+16(%rip),%xmm15
235 + jmp .Lblake2s_compress_avx512_mainloop /* jump over the alignment padding */
236 +.align 32
237 +.Lblake2s_compress_avx512_mainloop: /* one iteration per input block */
238 + vmovdqa %xmm0,%xmm10 /* save the incoming rows 0/1 for the final XOR */
239 + vmovdqa %xmm1,%xmm11
240 + vpaddq %xmm5,%xmm4,%xmm4 /* low qword of the state+0x20 row += inc */
241 + vmovdqa %xmm14,%xmm2 /* working row 2 = first half of IV */
242 + vpxor %xmm15,%xmm4,%xmm3 /* working row 3 = updated state+0x20 row ^ second half of IV */
243 + vmovdqu (%rsi),%ymm6 /* ymm6:ymm7 = the whole 64-byte message block */
244 + vmovdqu 0x20(%rsi),%ymm7
245 + addq $0x40,%rsi /* advance to the next input block */
246 + leaq SIGMA2(%rip),%rax /* rax = current SIGMA2 row */
247 + movb $0xa,%cl /* 10 rounds; inc was already copied out of %rcx into xmm5 */
248 +.Lblake2s_compress_avx512_roundloop:
249 + addq $0x40,%rax /* step past this round's 64-byte row; it is read at negative offsets below */
250 + vmovdqa -0x40(%rax),%ymm8 /* index vectors for this round */
251 + vmovdqa -0x20(%rax),%ymm9
252 + vpermi2d %ymm7,%ymm6,%ymm8 /* ymm8 = message words picked from ymm6:ymm7 by the indices it held */
253 + vpermi2d %ymm7,%ymm6,%ymm9
254 + vmovdqa %ymm8,%ymm6 /* the permuted message is the input to the next round's permutation */
255 + vmovdqa %ymm9,%ymm7
256 + vpaddd %xmm8,%xmm0,%xmm0 /* row0 += low four words of ymm8 */
257 + vpaddd %xmm1,%xmm0,%xmm0 /* row0 += row1 */
258 + vpxor %xmm0,%xmm3,%xmm3 /* row3 ^= row0 */
259 + vprord $0x10,%xmm3,%xmm3 /* row3: rotate each dword right by 16 */
260 + vpaddd %xmm3,%xmm2,%xmm2 /* row2 += row3 */
261 + vpxor %xmm2,%xmm1,%xmm1 /* row1 ^= row2 */
262 + vprord $0xc,%xmm1,%xmm1 /* row1: rotate each dword right by 12 */
263 + vextracti128 $0x1,%ymm8,%xmm8 /* bring down the high four words of ymm8 */
264 + vpaddd %xmm8,%xmm0,%xmm0
265 + vpaddd %xmm1,%xmm0,%xmm0
266 + vpxor %xmm0,%xmm3,%xmm3
267 + vprord $0x8,%xmm3,%xmm3 /* rotate right by 8 */
268 + vpaddd %xmm3,%xmm2,%xmm2
269 + vpxor %xmm2,%xmm1,%xmm1
270 + vprord $0x7,%xmm1,%xmm1 /* rotate right by 7 */
271 + vpshufd $0x93,%xmm0,%xmm0 /* rotate the dword lanes of rows 0, 3 and 2 (row 1 stays) so the next half-round mixes diagonals */
272 + vpshufd $0x4e,%xmm3,%xmm3
273 + vpshufd $0x39,%xmm2,%xmm2
274 + vpaddd %xmm9,%xmm0,%xmm0 /* second half-round uses the words in ymm9 */
275 + vpaddd %xmm1,%xmm0,%xmm0
276 + vpxor %xmm0,%xmm3,%xmm3
277 + vprord $0x10,%xmm3,%xmm3
278 + vpaddd %xmm3,%xmm2,%xmm2
279 + vpxor %xmm2,%xmm1,%xmm1
280 + vprord $0xc,%xmm1,%xmm1
281 + vextracti128 $0x1,%ymm9,%xmm9 /* bring down the high four words of ymm9 */
282 + vpaddd %xmm9,%xmm0,%xmm0
283 + vpaddd %xmm1,%xmm0,%xmm0
284 + vpxor %xmm0,%xmm3,%xmm3
285 + vprord $0x8,%xmm3,%xmm3
286 + vpaddd %xmm3,%xmm2,%xmm2
287 + vpxor %xmm2,%xmm1,%xmm1
288 + vprord $0x7,%xmm1,%xmm1
289 + vpshufd $0x39,%xmm0,%xmm0 /* undo the lane rotations applied before the diagonal half-round */
290 + vpshufd $0x4e,%xmm3,%xmm3
291 + vpshufd $0x93,%xmm2,%xmm2
292 + decb %cl
293 + jne .Lblake2s_compress_avx512_roundloop
294 + vpxor %xmm10,%xmm0,%xmm0 /* fold in the rows saved at the top of this block */
295 + vpxor %xmm11,%xmm1,%xmm1
296 + vpxor %xmm2,%xmm0,%xmm0 /* row0 ^= row2 */
297 + vpxor %xmm3,%xmm1,%xmm1 /* row1 ^= row3 */
298 + decq %rdx
299 + jne .Lblake2s_compress_avx512_mainloop
300 + vmovdqu %xmm0,(%rdi) /* write the state back once, after the last block */
301 + vmovdqu %xmm1,0x10(%rdi)
302 + vmovdqu %xmm4,0x20(%rdi)
303 + vzeroupper /* clear the upper ymm halves before returning to non-AVX code */
304 + retq
305 +ENDPROC(blake2s_compress_avx512)
306 +#endif /* CONFIG_AS_AVX512 */
307 --- /dev/null
308 +++ b/arch/x86/crypto/blake2s-glue.c
309 @@ -0,0 +1,233 @@
310 +// SPDX-License-Identifier: GPL-2.0 OR MIT
311 +/*
312 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
313 + */
314 +
315 +#include <crypto/internal/blake2s.h>
316 +#include <crypto/internal/simd.h>
317 +#include <crypto/internal/hash.h>
318 +
319 +#include <linux/types.h>
320 +#include <linux/jump_label.h>
321 +#include <linux/kernel.h>
322 +#include <linux/module.h>
323 +
324 +#include <asm/cpufeature.h>
325 +#include <asm/fpu/api.h>
326 +#include <asm/processor.h>
327 +#include <asm/simd.h>
328 +
329 +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, /* implemented in blake2s-core.S */
330 + const u8 *block, const size_t nblocks,
331 + const u32 inc);
332 +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, /* implemented in blake2s-core.S, only built under CONFIG_AS_AVX512 */
333 + const u8 *block, const size_t nblocks,
334 + const u32 inc);
335 +
336 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); /* enabled in blake2s_mod_init() when the CPU has SSSE3 */
337 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); /* enabled in blake2s_mod_init() when the AVX-512 checks pass */
338 +
339 +void blake2s_compress_arch(struct blake2s_state *state,
340 + const u8 *block, size_t nblocks,
341 + const u32 inc)
342 +{ /* Compress nblocks blocks from block into state, via the SIMD routines when allowed, else via blake2s_compress_generic(). */
343 + /* SIMD disables preemption, so relax after processing each page. */
344 + BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
345 +
346 + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { /* no SSSE3, or SIMD not usable in this context */
347 + blake2s_compress_generic(state, block, nblocks, inc);
348 + return;
349 + }
350 +
351 + for (;;) { /* NOTE(review): with nblocks == 0 the first pass still calls the asm with blocks == 0; the AVX-512 routine has no zero guard - confirm callers never pass 0 */
352 + const size_t blocks = min_t(size_t, nblocks,
353 + PAGE_SIZE / BLAKE2S_BLOCK_SIZE); /* at most one page of input per FPU section */
354 +
355 + kernel_fpu_begin();
356 + if (IS_ENABLED(CONFIG_AS_AVX512) && /* the AVX-512 routine only exists when the assembler supports it */
357 + static_branch_likely(&blake2s_use_avx512))
358 + blake2s_compress_avx512(state, block, blocks, inc);
359 + else
360 + blake2s_compress_ssse3(state, block, blocks, inc);
361 + kernel_fpu_end();
362 +
363 + nblocks -= blocks;
364 + if (!nblocks)
365 + break;
366 + block += blocks * BLAKE2S_BLOCK_SIZE; /* move on to the next chunk of input */
367 + }
368 +}
369 +EXPORT_SYMBOL(blake2s_compress_arch);
370 +
371 +static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
372 + unsigned int keylen)
373 +{ /* Store the key in the transform context; returns 0, or -EINVAL for a bad length. */
374 + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
375 +
376 + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { /* reject empty and oversized keys */
377 + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
378 + return -EINVAL;
379 + }
380 +
381 + memcpy(tctx->key, key, keylen);
382 + tctx->keylen = keylen; /* non-zero keylen makes crypto_blake2s_init() take the keyed path */
383 +
384 + return 0;
385 +}
386 +
387 +static int crypto_blake2s_init(struct shash_desc *desc)
388 +{ /* Initialise the per-request state for this transform's digest size; always returns 0. */
389 + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
390 + struct blake2s_state *state = shash_desc_ctx(desc);
391 + const int outlen = crypto_shash_digestsize(desc->tfm);
392 +
393 + if (tctx->keylen) /* a key was stored by crypto_blake2s_setkey() */
394 + blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
395 + else
396 + blake2s_init(state, outlen);
397 +
398 + return 0;
399 +}
400 +
401 +static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
402 + unsigned int inlen)
403 +{ /* Absorb inlen bytes: compress whole blocks and keep the tail in state->buf; always returns 0. */
404 + struct blake2s_state *state = shash_desc_ctx(desc);
405 + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* free space left in state->buf */
406 +
407 + if (unlikely(!inlen))
408 + return 0;
409 + if (inlen > fill) { /* strictly greater: input that only just fills the buffer stays buffered */
410 + memcpy(state->buf + state->buflen, in, fill);
411 + blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
412 + state->buflen = 0;
413 + in += fill;
414 + inlen -= fill;
415 + }
416 + if (inlen > BLAKE2S_BLOCK_SIZE) {
417 + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); /* at least 2 here, so nblocks - 1 is never 0 */
418 + /* Hash one less (full) block than strictly possible */
419 + blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
420 + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
421 + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
422 + }
423 + memcpy(state->buf + state->buflen, in, inlen); /* the remaining bytes fit in state->buf */
424 + state->buflen += inlen;
425 +
426 + return 0;
427 +}
428 +
429 +static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
430 +{ /* Compress the buffered tail as the last block, write the digest to out and wipe the state; always returns 0. */
431 + struct blake2s_state *state = shash_desc_ctx(desc);
432 +
433 + blake2s_set_lastblock(state);
434 + memset(state->buf + state->buflen, 0,
435 + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
436 + blake2s_compress_arch(state, state->buf, 1, state->buflen); /* inc is the number of real bytes in the buffer, not a full block */
437 + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* convert state->h to little-endian in place */
438 + memcpy(out, state->h, state->outlen);
439 + memzero_explicit(state, sizeof(*state)); /* do not leave hash state behind */
440 +
441 + return 0;
442 +}
443 +
444 +static struct shash_alg blake2s_algs[] = {{ /* four entries that differ only in cra_name, cra_driver_name and digestsize */
445 + .base.cra_name = "blake2s-128",
446 + .base.cra_driver_name = "blake2s-128-x86",
447 + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
448 + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
449 + .base.cra_priority = 200,
450 + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
451 + .base.cra_module = THIS_MODULE,
452 +
453 + .digestsize = BLAKE2S_128_HASH_SIZE,
454 + .setkey = crypto_blake2s_setkey,
455 + .init = crypto_blake2s_init,
456 + .update = crypto_blake2s_update,
457 + .final = crypto_blake2s_final,
458 + .descsize = sizeof(struct blake2s_state),
459 +}, {
460 + .base.cra_name = "blake2s-160",
461 + .base.cra_driver_name = "blake2s-160-x86",
462 + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
463 + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
464 + .base.cra_priority = 200,
465 + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
466 + .base.cra_module = THIS_MODULE,
467 +
468 + .digestsize = BLAKE2S_160_HASH_SIZE,
469 + .setkey = crypto_blake2s_setkey,
470 + .init = crypto_blake2s_init,
471 + .update = crypto_blake2s_update,
472 + .final = crypto_blake2s_final,
473 + .descsize = sizeof(struct blake2s_state),
474 +}, {
475 + .base.cra_name = "blake2s-224",
476 + .base.cra_driver_name = "blake2s-224-x86",
477 + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
478 + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
479 + .base.cra_priority = 200,
480 + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
481 + .base.cra_module = THIS_MODULE,
482 +
483 + .digestsize = BLAKE2S_224_HASH_SIZE,
484 + .setkey = crypto_blake2s_setkey,
485 + .init = crypto_blake2s_init,
486 + .update = crypto_blake2s_update,
487 + .final = crypto_blake2s_final,
488 + .descsize = sizeof(struct blake2s_state),
489 +}, {
490 + .base.cra_name = "blake2s-256",
491 + .base.cra_driver_name = "blake2s-256-x86",
492 + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
493 + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
494 + .base.cra_priority = 200,
495 + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
496 + .base.cra_module = THIS_MODULE,
497 +
498 + .digestsize = BLAKE2S_256_HASH_SIZE,
499 + .setkey = crypto_blake2s_setkey,
500 + .init = crypto_blake2s_init,
501 + .update = crypto_blake2s_update,
502 + .final = crypto_blake2s_final,
503 + .descsize = sizeof(struct blake2s_state),
504 +}};
505 +
506 +static int __init blake2s_mod_init(void)
507 +{ /* Enable the SIMD static keys the CPU supports and register the shash algorithms. */
508 + if (!boot_cpu_has(X86_FEATURE_SSSE3))
509 + return 0; /* load successfully but register nothing; both static keys stay false */
510 +
511 + static_branch_enable(&blake2s_use_ssse3);
512 +
513 + if (IS_ENABLED(CONFIG_AS_AVX512) && /* the AVX-512 routine must have been assembled */
514 + boot_cpu_has(X86_FEATURE_AVX) &&
515 + boot_cpu_has(X86_FEATURE_AVX2) &&
516 + boot_cpu_has(X86_FEATURE_AVX512F) &&
517 + boot_cpu_has(X86_FEATURE_AVX512VL) &&
518 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | /* the matching xstate components must be enabled as well */
519 + XFEATURE_MASK_AVX512, NULL))
520 + static_branch_enable(&blake2s_use_avx512);
521 +
522 + return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
523 +}
524 +
525 +static void __exit blake2s_mod_exit(void)
526 +{ /* Unregister the algorithms, mirroring the SSSE3 check in blake2s_mod_init(). */
527 + if (boot_cpu_has(X86_FEATURE_SSSE3)) /* nothing was registered without SSSE3 */
528 + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
529 +}
530 +
531 +module_init(blake2s_mod_init);
532 +module_exit(blake2s_mod_exit);
533 +
534 +MODULE_ALIAS_CRYPTO("blake2s-128"); /* one alias per cra_name and per cra_driver_name in blake2s_algs */
535 +MODULE_ALIAS_CRYPTO("blake2s-128-x86");
536 +MODULE_ALIAS_CRYPTO("blake2s-160");
537 +MODULE_ALIAS_CRYPTO("blake2s-160-x86");
538 +MODULE_ALIAS_CRYPTO("blake2s-224");
539 +MODULE_ALIAS_CRYPTO("blake2s-224-x86");
540 +MODULE_ALIAS_CRYPTO("blake2s-256");
541 +MODULE_ALIAS_CRYPTO("blake2s-256-x86");
542 +MODULE_LICENSE("GPL v2");
543 --- a/crypto/Kconfig
544 +++ b/crypto/Kconfig
545 @@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
546
547 See https://blake2.net for further information.
548
549 +config CRYPTO_BLAKE2S_X86
550 + tristate "BLAKE2s digest algorithm (x86 accelerated version)"
551 + depends on X86 && 64BIT
552 + select CRYPTO_LIB_BLAKE2S_GENERIC
553 + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
554 +
555 config CRYPTO_CRCT10DIF
556 tristate "CRCT10DIF algorithm"
557 select CRYPTO_HASH