kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0041-crypto-poly1305-add-new-32-and-64-bit-generic-versio.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 5 Jan 2020 22:40:46 -0500
4 Subject: [PATCH] crypto: poly1305 - add new 32 and 64-bit generic versions
5
6 commit 1c08a104360f3e18f4ee6346c21cc3923efb952e upstream.
7
8 These two C implementations from Zinc -- a 32x32 one and a 64x64 one,
9 depending on the platform -- come from Andrew Moon's public domain
10 poly1305-donna portable code, modified for usage in the kernel. The
11 precomputation in the 32-bit version and the use of 64x64 multiplies in
12 the 64-bit version make these perform better than the code they replace.
13 Moon's code is also very widespread and has received many eyeballs of
14 scrutiny.
15
16 There's a bit of interference with the x86 implementation, which
17 relies on internal details of the old scalar implementation. In the next
18 commit, the x86 implementation will be replaced with a faster one that
19 doesn't rely on this, so none of this matters much. But for now, to keep
20 this passing the tests, we inline the bits of the old implementation
21 that the x86 implementation relied on. Also, since we now support a
22 slightly larger key space, via the union, some offsets had to be fixed
23 up.
24
25 Nonce calculation was folded in with the emit function, to take
26 advantage of 64x64 arithmetic. However, Adiantum appeared to rely on no
27 nonce handling in emit, so this path was conditionalized. We also
28 introduced a new struct, poly1305_core_key, to represent the precise
29 amount of space that particular implementation uses.
30
31 Testing with kbench9000, depending on the CPU, the update function for
32 the 32x32 version has been improved by 4%-7%, and for the 64x64 by
33 19%-30%. The 32x32 gains are small, but I think there's great value in
34 having a parallel implementation to the 64x64 one so that the two can be
35 compared side-by-side as nice stand-alone units.
36
37 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
38 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
39 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
40 ---
41 arch/x86/crypto/poly1305-avx2-x86_64.S | 20 +--
42 arch/x86/crypto/poly1305_glue.c | 215 +++++++++++++++++++++++--
43 crypto/adiantum.c | 4 +-
44 crypto/nhpoly1305.c | 2 +-
45 crypto/poly1305_generic.c | 25 ++-
46 include/crypto/internal/poly1305.h | 45 ++----
47 include/crypto/nhpoly1305.h | 4 +-
48 include/crypto/poly1305.h | 26 ++-
49 lib/crypto/Makefile | 4 +-
50 lib/crypto/poly1305-donna32.c | 204 +++++++++++++++++++++++
51 lib/crypto/poly1305-donna64.c | 185 +++++++++++++++++++++
52 lib/crypto/poly1305.c | 169 +------------------
53 12 files changed, 675 insertions(+), 228 deletions(-)
54 create mode 100644 lib/crypto/poly1305-donna32.c
55 create mode 100644 lib/crypto/poly1305-donna64.c
56
57 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S
58 +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
59 @@ -34,16 +34,16 @@ ORMASK: .octa 0x000000000100000000000000
60 #define u2 0x08(%r8)
61 #define u3 0x0c(%r8)
62 #define u4 0x10(%r8)
63 -#define w0 0x14(%r8)
64 -#define w1 0x18(%r8)
65 -#define w2 0x1c(%r8)
66 -#define w3 0x20(%r8)
67 -#define w4 0x24(%r8)
68 -#define y0 0x28(%r8)
69 -#define y1 0x2c(%r8)
70 -#define y2 0x30(%r8)
71 -#define y3 0x34(%r8)
72 -#define y4 0x38(%r8)
73 +#define w0 0x18(%r8)
74 +#define w1 0x1c(%r8)
75 +#define w2 0x20(%r8)
76 +#define w3 0x24(%r8)
77 +#define w4 0x28(%r8)
78 +#define y0 0x30(%r8)
79 +#define y1 0x34(%r8)
80 +#define y2 0x38(%r8)
81 +#define y3 0x3c(%r8)
82 +#define y4 0x40(%r8)
83 #define m %rsi
84 #define hc0 %ymm0
85 #define hc1 %ymm1
86 --- a/arch/x86/crypto/poly1305_glue.c
87 +++ b/arch/x86/crypto/poly1305_glue.c
88 @@ -25,6 +25,21 @@ asmlinkage void poly1305_4block_avx2(u32
89 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
90 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
91
92 +static inline u64 mlt(u64 a, u64 b)
93 +{
94 + return a * b;
95 +}
96 +
97 +static inline u32 sr(u64 v, u_char n)
98 +{
99 + return v >> n;
100 +}
101 +
102 +static inline u32 and(u32 v, u32 mask)
103 +{
104 + return v & mask;
105 +}
106 +
107 static void poly1305_simd_mult(u32 *a, const u32 *b)
108 {
109 u8 m[POLY1305_BLOCK_SIZE];
110 @@ -36,6 +51,168 @@ static void poly1305_simd_mult(u32 *a, c
111 poly1305_block_sse2(a, m, b, 1);
112 }
113
114 +static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
115 +{
116 + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
117 + key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
118 + key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
119 + key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
120 + key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
121 + key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
122 +}
123 +
124 +static void poly1305_integer_blocks(struct poly1305_state *state,
125 + const struct poly1305_key *key,
126 + const void *src,
127 + unsigned int nblocks, u32 hibit)
128 +{
129 + u32 r0, r1, r2, r3, r4;
130 + u32 s1, s2, s3, s4;
131 + u32 h0, h1, h2, h3, h4;
132 + u64 d0, d1, d2, d3, d4;
133 +
134 + if (!nblocks)
135 + return;
136 +
137 + r0 = key->r[0];
138 + r1 = key->r[1];
139 + r2 = key->r[2];
140 + r3 = key->r[3];
141 + r4 = key->r[4];
142 +
143 + s1 = r1 * 5;
144 + s2 = r2 * 5;
145 + s3 = r3 * 5;
146 + s4 = r4 * 5;
147 +
148 + h0 = state->h[0];
149 + h1 = state->h[1];
150 + h2 = state->h[2];
151 + h3 = state->h[3];
152 + h4 = state->h[4];
153 +
154 + do {
155 + /* h += m[i] */
156 + h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
157 + h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
158 + h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
159 + h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
160 + h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
161 +
162 + /* h *= r */
163 + d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
164 + mlt(h3, s2) + mlt(h4, s1);
165 + d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
166 + mlt(h3, s3) + mlt(h4, s2);
167 + d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
168 + mlt(h3, s4) + mlt(h4, s3);
169 + d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
170 + mlt(h3, r0) + mlt(h4, s4);
171 + d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
172 + mlt(h3, r1) + mlt(h4, r0);
173 +
174 + /* (partial) h %= p */
175 + d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
176 + d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
177 + d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
178 + d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
179 + h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
180 + h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
181 +
182 + src += POLY1305_BLOCK_SIZE;
183 + } while (--nblocks);
184 +
185 + state->h[0] = h0;
186 + state->h[1] = h1;
187 + state->h[2] = h2;
188 + state->h[3] = h3;
189 + state->h[4] = h4;
190 +}
191 +
192 +static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
193 +{
194 + u32 h0, h1, h2, h3, h4;
195 + u32 g0, g1, g2, g3, g4;
196 + u32 mask;
197 +
198 + /* fully carry h */
199 + h0 = state->h[0];
200 + h1 = state->h[1];
201 + h2 = state->h[2];
202 + h3 = state->h[3];
203 + h4 = state->h[4];
204 +
205 + h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
206 + h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
207 + h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
208 + h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
209 + h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
210 +
211 + /* compute h + -p */
212 + g0 = h0 + 5;
213 + g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
214 + g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
215 + g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
216 + g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
217 +
218 + /* select h if h < p, or h + -p if h >= p */
219 + mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
220 + g0 &= mask;
221 + g1 &= mask;
222 + g2 &= mask;
223 + g3 &= mask;
224 + g4 &= mask;
225 + mask = ~mask;
226 + h0 = (h0 & mask) | g0;
227 + h1 = (h1 & mask) | g1;
228 + h2 = (h2 & mask) | g2;
229 + h3 = (h3 & mask) | g3;
230 + h4 = (h4 & mask) | g4;
231 +
232 + /* h = h % (2^128) */
233 + put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
234 + put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
235 + put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
236 + put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
237 +}
238 +
239 +void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
240 +{
241 + poly1305_integer_setkey(desc->opaque_r, key);
242 + desc->s[0] = get_unaligned_le32(key + 16);
243 + desc->s[1] = get_unaligned_le32(key + 20);
244 + desc->s[2] = get_unaligned_le32(key + 24);
245 + desc->s[3] = get_unaligned_le32(key + 28);
246 + poly1305_core_init(&desc->h);
247 + desc->buflen = 0;
248 + desc->sset = true;
249 + desc->rset = 1;
250 +}
251 +EXPORT_SYMBOL_GPL(poly1305_init_arch);
252 +
253 +static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
254 + const u8 *src, unsigned int srclen)
255 +{
256 + if (!dctx->sset) {
257 + if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
258 + poly1305_integer_setkey(dctx->r, src);
259 + src += POLY1305_BLOCK_SIZE;
260 + srclen -= POLY1305_BLOCK_SIZE;
261 + dctx->rset = 1;
262 + }
263 + if (srclen >= POLY1305_BLOCK_SIZE) {
264 + dctx->s[0] = get_unaligned_le32(src + 0);
265 + dctx->s[1] = get_unaligned_le32(src + 4);
266 + dctx->s[2] = get_unaligned_le32(src + 8);
267 + dctx->s[3] = get_unaligned_le32(src + 12);
268 + src += POLY1305_BLOCK_SIZE;
269 + srclen -= POLY1305_BLOCK_SIZE;
270 + dctx->sset = true;
271 + }
272 + }
273 + return srclen;
274 +}
275 +
276 static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
277 const u8 *src, unsigned int srclen)
278 {
279 @@ -47,8 +224,8 @@ static unsigned int poly1305_scalar_bloc
280 srclen = datalen;
281 }
282 if (srclen >= POLY1305_BLOCK_SIZE) {
283 - poly1305_core_blocks(&dctx->h, dctx->r, src,
284 - srclen / POLY1305_BLOCK_SIZE, 1);
285 + poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
286 + srclen / POLY1305_BLOCK_SIZE, 1);
287 srclen %= POLY1305_BLOCK_SIZE;
288 }
289 return srclen;
290 @@ -105,12 +282,6 @@ static unsigned int poly1305_simd_blocks
291 return srclen;
292 }
293
294 -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
295 -{
296 - poly1305_init_generic(desc, key);
297 -}
298 -EXPORT_SYMBOL(poly1305_init_arch);
299 -
300 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
301 unsigned int srclen)
302 {
303 @@ -158,9 +329,31 @@ void poly1305_update_arch(struct poly130
304 }
305 EXPORT_SYMBOL(poly1305_update_arch);
306
307 -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
308 +void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
309 {
310 - poly1305_final_generic(desc, digest);
311 + __le32 digest[4];
312 + u64 f = 0;
313 +
314 + if (unlikely(desc->buflen)) {
315 + desc->buf[desc->buflen++] = 1;
316 + memset(desc->buf + desc->buflen, 0,
317 + POLY1305_BLOCK_SIZE - desc->buflen);
318 + poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
319 + }
320 +
321 + poly1305_integer_emit(&desc->h, digest);
322 +
323 + /* mac = (h + s) % (2^128) */
324 + f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
325 + put_unaligned_le32(f, dst + 0);
326 + f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
327 + put_unaligned_le32(f, dst + 4);
328 + f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
329 + put_unaligned_le32(f, dst + 8);
330 + f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
331 + put_unaligned_le32(f, dst + 12);
332 +
333 + *desc = (struct poly1305_desc_ctx){};
334 }
335 EXPORT_SYMBOL(poly1305_final_arch);
336
337 @@ -183,7 +376,7 @@ static int crypto_poly1305_final(struct
338 if (unlikely(!dctx->sset))
339 return -ENOKEY;
340
341 - poly1305_final_generic(dctx, dst);
342 + poly1305_final_arch(dctx, dst);
343 return 0;
344 }
345
346 --- a/crypto/adiantum.c
347 +++ b/crypto/adiantum.c
348 @@ -72,7 +72,7 @@ struct adiantum_tfm_ctx {
349 struct crypto_skcipher *streamcipher;
350 struct crypto_cipher *blockcipher;
351 struct crypto_shash *hash;
352 - struct poly1305_key header_hash_key;
353 + struct poly1305_core_key header_hash_key;
354 };
355
356 struct adiantum_request_ctx {
357 @@ -249,7 +249,7 @@ static void adiantum_hash_header(struct
358 poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
359 TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
360
361 - poly1305_core_emit(&state, &rctx->header_hash);
362 + poly1305_core_emit(&state, NULL, &rctx->header_hash);
363 }
364
365 /* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
366 --- a/crypto/nhpoly1305.c
367 +++ b/crypto/nhpoly1305.c
368 @@ -210,7 +210,7 @@ int crypto_nhpoly1305_final_helper(struc
369 if (state->nh_remaining)
370 process_nh_hash_value(state, key);
371
372 - poly1305_core_emit(&state->poly_state, dst);
373 + poly1305_core_emit(&state->poly_state, NULL, dst);
374 return 0;
375 }
376 EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
377 --- a/crypto/poly1305_generic.c
378 +++ b/crypto/poly1305_generic.c
379 @@ -31,6 +31,29 @@ static int crypto_poly1305_init(struct s
380 return 0;
381 }
382
383 +static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
384 + const u8 *src, unsigned int srclen)
385 +{
386 + if (!dctx->sset) {
387 + if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
388 + poly1305_core_setkey(&dctx->core_r, src);
389 + src += POLY1305_BLOCK_SIZE;
390 + srclen -= POLY1305_BLOCK_SIZE;
391 + dctx->rset = 2;
392 + }
393 + if (srclen >= POLY1305_BLOCK_SIZE) {
394 + dctx->s[0] = get_unaligned_le32(src + 0);
395 + dctx->s[1] = get_unaligned_le32(src + 4);
396 + dctx->s[2] = get_unaligned_le32(src + 8);
397 + dctx->s[3] = get_unaligned_le32(src + 12);
398 + src += POLY1305_BLOCK_SIZE;
399 + srclen -= POLY1305_BLOCK_SIZE;
400 + dctx->sset = true;
401 + }
402 + }
403 + return srclen;
404 +}
405 +
406 static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
407 unsigned int srclen)
408 {
409 @@ -42,7 +65,7 @@ static void poly1305_blocks(struct poly1
410 srclen = datalen;
411 }
412
413 - poly1305_core_blocks(&dctx->h, dctx->r, src,
414 + poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
415 srclen / POLY1305_BLOCK_SIZE, 1);
416 }
417
418 --- a/include/crypto/internal/poly1305.h
419 +++ b/include/crypto/internal/poly1305.h
420 @@ -11,48 +11,23 @@
421 #include <crypto/poly1305.h>
422
423 /*
424 - * Poly1305 core functions. These implement the ε-almost-∆-universal hash
425 - * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
426 - * ("s key") at the end. They also only support block-aligned inputs.
427 + * Poly1305 core functions. These only accept whole blocks; the caller must
428 + * handle any needed block buffering and padding. 'hibit' must be 1 for any
429 + * full blocks, or 0 for the final block if it had to be padded. If 'nonce' is
430 + * non-NULL, then it's added at the end to compute the Poly1305 MAC. Otherwise,
431 + * only the ε-almost-∆-universal hash function (not the full MAC) is computed.
432 */
433 -void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
434 +
435 +void poly1305_core_setkey(struct poly1305_core_key *key, const u8 *raw_key);
436 static inline void poly1305_core_init(struct poly1305_state *state)
437 {
438 *state = (struct poly1305_state){};
439 }
440
441 void poly1305_core_blocks(struct poly1305_state *state,
442 - const struct poly1305_key *key, const void *src,
443 + const struct poly1305_core_key *key, const void *src,
444 unsigned int nblocks, u32 hibit);
445 -void poly1305_core_emit(const struct poly1305_state *state, void *dst);
446 -
447 -/*
448 - * Poly1305 requires a unique key for each tag, which implies that we can't set
449 - * it on the tfm that gets accessed by multiple users simultaneously. Instead we
450 - * expect the key as the first 32 bytes in the update() call.
451 - */
452 -static inline
453 -unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
454 - const u8 *src, unsigned int srclen)
455 -{
456 - if (!dctx->sset) {
457 - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
458 - poly1305_core_setkey(dctx->r, src);
459 - src += POLY1305_BLOCK_SIZE;
460 - srclen -= POLY1305_BLOCK_SIZE;
461 - dctx->rset = 1;
462 - }
463 - if (srclen >= POLY1305_BLOCK_SIZE) {
464 - dctx->s[0] = get_unaligned_le32(src + 0);
465 - dctx->s[1] = get_unaligned_le32(src + 4);
466 - dctx->s[2] = get_unaligned_le32(src + 8);
467 - dctx->s[3] = get_unaligned_le32(src + 12);
468 - src += POLY1305_BLOCK_SIZE;
469 - srclen -= POLY1305_BLOCK_SIZE;
470 - dctx->sset = true;
471 - }
472 - }
473 - return srclen;
474 -}
475 +void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
476 + void *dst);
477
478 #endif
479 --- a/include/crypto/nhpoly1305.h
480 +++ b/include/crypto/nhpoly1305.h
481 @@ -7,7 +7,7 @@
482 #define _NHPOLY1305_H
483
484 #include <crypto/hash.h>
485 -#include <crypto/poly1305.h>
486 +#include <crypto/internal/poly1305.h>
487
488 /* NH parameterization: */
489
490 @@ -33,7 +33,7 @@
491 #define NHPOLY1305_KEY_SIZE (POLY1305_BLOCK_SIZE + NH_KEY_BYTES)
492
493 struct nhpoly1305_key {
494 - struct poly1305_key poly_key;
495 + struct poly1305_core_key poly_key;
496 u32 nh_key[NH_KEY_WORDS];
497 };
498
499 --- a/include/crypto/poly1305.h
500 +++ b/include/crypto/poly1305.h
501 @@ -13,12 +13,29 @@
502 #define POLY1305_KEY_SIZE 32
503 #define POLY1305_DIGEST_SIZE 16
504
505 +/* The poly1305_key and poly1305_state types are mostly opaque and
506 + * implementation-defined. Limbs might be in base 2^64 or base 2^26, or
507 + * different yet. The union type provided keeps these 64-bit aligned for the
508 + * case in which this is implemented using 64x64 multiplies.
509 + */
510 +
511 struct poly1305_key {
512 - u32 r[5]; /* key, base 2^26 */
513 + union {
514 + u32 r[5];
515 + u64 r64[3];
516 + };
517 +};
518 +
519 +struct poly1305_core_key {
520 + struct poly1305_key key;
521 + struct poly1305_key precomputed_s;
522 };
523
524 struct poly1305_state {
525 - u32 h[5]; /* accumulator, base 2^26 */
526 + union {
527 + u32 h[5];
528 + u64 h64[3];
529 + };
530 };
531
532 struct poly1305_desc_ctx {
533 @@ -35,7 +52,10 @@ struct poly1305_desc_ctx {
534 /* accumulator */
535 struct poly1305_state h;
536 /* key */
537 - struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
538 + union {
539 + struct poly1305_key opaque_r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
540 + struct poly1305_core_key core_r;
541 + };
542 };
543
544 void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key);
545 --- a/lib/crypto/Makefile
546 +++ b/lib/crypto/Makefile
547 @@ -28,7 +28,9 @@ obj-$(CONFIG_CRYPTO_LIB_DES) += libdes
548 libdes-y := des.o
549
550 obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305.o
551 -libpoly1305-y := poly1305.o
552 +libpoly1305-y := poly1305-donna32.o
553 +libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
554 +libpoly1305-y += poly1305.o
555
556 obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
557 libsha256-y := sha256.o
558 --- /dev/null
559 +++ b/lib/crypto/poly1305-donna32.c
560 @@ -0,0 +1,204 @@
561 +// SPDX-License-Identifier: GPL-2.0 OR MIT
562 +/*
563 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
564 + *
565 + * This is based in part on Andrew Moon's poly1305-donna, which is in the
566 + * public domain.
567 + */
568 +
569 +#include <linux/kernel.h>
570 +#include <asm/unaligned.h>
571 +#include <crypto/internal/poly1305.h>
572 +
573 +void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16])
574 +{
575 + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
576 + key->key.r[0] = (get_unaligned_le32(&raw_key[0])) & 0x3ffffff;
577 + key->key.r[1] = (get_unaligned_le32(&raw_key[3]) >> 2) & 0x3ffff03;
578 + key->key.r[2] = (get_unaligned_le32(&raw_key[6]) >> 4) & 0x3ffc0ff;
579 + key->key.r[3] = (get_unaligned_le32(&raw_key[9]) >> 6) & 0x3f03fff;
580 + key->key.r[4] = (get_unaligned_le32(&raw_key[12]) >> 8) & 0x00fffff;
581 +
582 + /* s = 5*r */
583 + key->precomputed_s.r[0] = key->key.r[1] * 5;
584 + key->precomputed_s.r[1] = key->key.r[2] * 5;
585 + key->precomputed_s.r[2] = key->key.r[3] * 5;
586 + key->precomputed_s.r[3] = key->key.r[4] * 5;
587 +}
588 +EXPORT_SYMBOL(poly1305_core_setkey);
589 +
590 +void poly1305_core_blocks(struct poly1305_state *state,
591 + const struct poly1305_core_key *key, const void *src,
592 + unsigned int nblocks, u32 hibit)
593 +{
594 + const u8 *input = src;
595 + u32 r0, r1, r2, r3, r4;
596 + u32 s1, s2, s3, s4;
597 + u32 h0, h1, h2, h3, h4;
598 + u64 d0, d1, d2, d3, d4;
599 + u32 c;
600 +
601 + if (!nblocks)
602 + return;
603 +
604 + hibit <<= 24;
605 +
606 + r0 = key->key.r[0];
607 + r1 = key->key.r[1];
608 + r2 = key->key.r[2];
609 + r3 = key->key.r[3];
610 + r4 = key->key.r[4];
611 +
612 + s1 = key->precomputed_s.r[0];
613 + s2 = key->precomputed_s.r[1];
614 + s3 = key->precomputed_s.r[2];
615 + s4 = key->precomputed_s.r[3];
616 +
617 + h0 = state->h[0];
618 + h1 = state->h[1];
619 + h2 = state->h[2];
620 + h3 = state->h[3];
621 + h4 = state->h[4];
622 +
623 + do {
624 + /* h += m[i] */
625 + h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
626 + h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
627 + h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
628 + h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
629 + h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
630 +
631 + /* h *= r */
632 + d0 = ((u64)h0 * r0) + ((u64)h1 * s4) +
633 + ((u64)h2 * s3) + ((u64)h3 * s2) +
634 + ((u64)h4 * s1);
635 + d1 = ((u64)h0 * r1) + ((u64)h1 * r0) +
636 + ((u64)h2 * s4) + ((u64)h3 * s3) +
637 + ((u64)h4 * s2);
638 + d2 = ((u64)h0 * r2) + ((u64)h1 * r1) +
639 + ((u64)h2 * r0) + ((u64)h3 * s4) +
640 + ((u64)h4 * s3);
641 + d3 = ((u64)h0 * r3) + ((u64)h1 * r2) +
642 + ((u64)h2 * r1) + ((u64)h3 * r0) +
643 + ((u64)h4 * s4);
644 + d4 = ((u64)h0 * r4) + ((u64)h1 * r3) +
645 + ((u64)h2 * r2) + ((u64)h3 * r1) +
646 + ((u64)h4 * r0);
647 +
648 + /* (partial) h %= p */
649 + c = (u32)(d0 >> 26);
650 + h0 = (u32)d0 & 0x3ffffff;
651 + d1 += c;
652 + c = (u32)(d1 >> 26);
653 + h1 = (u32)d1 & 0x3ffffff;
654 + d2 += c;
655 + c = (u32)(d2 >> 26);
656 + h2 = (u32)d2 & 0x3ffffff;
657 + d3 += c;
658 + c = (u32)(d3 >> 26);
659 + h3 = (u32)d3 & 0x3ffffff;
660 + d4 += c;
661 + c = (u32)(d4 >> 26);
662 + h4 = (u32)d4 & 0x3ffffff;
663 + h0 += c * 5;
664 + c = (h0 >> 26);
665 + h0 = h0 & 0x3ffffff;
666 + h1 += c;
667 +
668 + input += POLY1305_BLOCK_SIZE;
669 + } while (--nblocks);
670 +
671 + state->h[0] = h0;
672 + state->h[1] = h1;
673 + state->h[2] = h2;
674 + state->h[3] = h3;
675 + state->h[4] = h4;
676 +}
677 +EXPORT_SYMBOL(poly1305_core_blocks);
678 +
679 +void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
680 + void *dst)
681 +{
682 + u8 *mac = dst;
683 + u32 h0, h1, h2, h3, h4, c;
684 + u32 g0, g1, g2, g3, g4;
685 + u64 f;
686 + u32 mask;
687 +
688 + /* fully carry h */
689 + h0 = state->h[0];
690 + h1 = state->h[1];
691 + h2 = state->h[2];
692 + h3 = state->h[3];
693 + h4 = state->h[4];
694 +
695 + c = h1 >> 26;
696 + h1 = h1 & 0x3ffffff;
697 + h2 += c;
698 + c = h2 >> 26;
699 + h2 = h2 & 0x3ffffff;
700 + h3 += c;
701 + c = h3 >> 26;
702 + h3 = h3 & 0x3ffffff;
703 + h4 += c;
704 + c = h4 >> 26;
705 + h4 = h4 & 0x3ffffff;
706 + h0 += c * 5;
707 + c = h0 >> 26;
708 + h0 = h0 & 0x3ffffff;
709 + h1 += c;
710 +
711 + /* compute h + -p */
712 + g0 = h0 + 5;
713 + c = g0 >> 26;
714 + g0 &= 0x3ffffff;
715 + g1 = h1 + c;
716 + c = g1 >> 26;
717 + g1 &= 0x3ffffff;
718 + g2 = h2 + c;
719 + c = g2 >> 26;
720 + g2 &= 0x3ffffff;
721 + g3 = h3 + c;
722 + c = g3 >> 26;
723 + g3 &= 0x3ffffff;
724 + g4 = h4 + c - (1UL << 26);
725 +
726 + /* select h if h < p, or h + -p if h >= p */
727 + mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
728 + g0 &= mask;
729 + g1 &= mask;
730 + g2 &= mask;
731 + g3 &= mask;
732 + g4 &= mask;
733 + mask = ~mask;
734 +
735 + h0 = (h0 & mask) | g0;
736 + h1 = (h1 & mask) | g1;
737 + h2 = (h2 & mask) | g2;
738 + h3 = (h3 & mask) | g3;
739 + h4 = (h4 & mask) | g4;
740 +
741 + /* h = h % (2^128) */
742 + h0 = ((h0) | (h1 << 26)) & 0xffffffff;
743 + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
744 + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
745 + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
746 +
747 + if (likely(nonce)) {
748 + /* mac = (h + nonce) % (2^128) */
749 + f = (u64)h0 + nonce[0];
750 + h0 = (u32)f;
751 + f = (u64)h1 + nonce[1] + (f >> 32);
752 + h1 = (u32)f;
753 + f = (u64)h2 + nonce[2] + (f >> 32);
754 + h2 = (u32)f;
755 + f = (u64)h3 + nonce[3] + (f >> 32);
756 + h3 = (u32)f;
757 + }
758 +
759 + put_unaligned_le32(h0, &mac[0]);
760 + put_unaligned_le32(h1, &mac[4]);
761 + put_unaligned_le32(h2, &mac[8]);
762 + put_unaligned_le32(h3, &mac[12]);
763 +}
764 +EXPORT_SYMBOL(poly1305_core_emit);
765 --- /dev/null
766 +++ b/lib/crypto/poly1305-donna64.c
767 @@ -0,0 +1,185 @@
768 +// SPDX-License-Identifier: GPL-2.0 OR MIT
769 +/*
770 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
771 + *
772 + * This is based in part on Andrew Moon's poly1305-donna, which is in the
773 + * public domain.
774 + */
775 +
776 +#include <linux/kernel.h>
777 +#include <asm/unaligned.h>
778 +#include <crypto/internal/poly1305.h>
779 +
780 +typedef __uint128_t u128;
781 +
782 +void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[16])
783 +{
784 + u64 t0, t1;
785 +
786 + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
787 + t0 = get_unaligned_le64(&raw_key[0]);
788 + t1 = get_unaligned_le64(&raw_key[8]);
789 +
790 + key->key.r64[0] = t0 & 0xffc0fffffffULL;
791 + key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL;
792 + key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL;
793 +
794 + /* s = 20*r */
795 + key->precomputed_s.r64[0] = key->key.r64[1] * 20;
796 + key->precomputed_s.r64[1] = key->key.r64[2] * 20;
797 +}
798 +EXPORT_SYMBOL(poly1305_core_setkey);
799 +
800 +void poly1305_core_blocks(struct poly1305_state *state,
801 + const struct poly1305_core_key *key, const void *src,
802 + unsigned int nblocks, u32 hibit)
803 +{
804 + const u8 *input = src;
805 + u64 hibit64;
806 + u64 r0, r1, r2;
807 + u64 s1, s2;
808 + u64 h0, h1, h2;
809 + u64 c;
810 + u128 d0, d1, d2, d;
811 +
812 + if (!nblocks)
813 + return;
814 +
815 + hibit64 = ((u64)hibit) << 40;
816 +
817 + r0 = key->key.r64[0];
818 + r1 = key->key.r64[1];
819 + r2 = key->key.r64[2];
820 +
821 + h0 = state->h64[0];
822 + h1 = state->h64[1];
823 + h2 = state->h64[2];
824 +
825 + s1 = key->precomputed_s.r64[0];
826 + s2 = key->precomputed_s.r64[1];
827 +
828 + do {
829 + u64 t0, t1;
830 +
831 + /* h += m[i] */
832 + t0 = get_unaligned_le64(&input[0]);
833 + t1 = get_unaligned_le64(&input[8]);
834 +
835 + h0 += t0 & 0xfffffffffffULL;
836 + h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL;
837 + h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64;
838 +
839 + /* h *= r */
840 + d0 = (u128)h0 * r0;
841 + d = (u128)h1 * s2;
842 + d0 += d;
843 + d = (u128)h2 * s1;
844 + d0 += d;
845 + d1 = (u128)h0 * r1;
846 + d = (u128)h1 * r0;
847 + d1 += d;
848 + d = (u128)h2 * s2;
849 + d1 += d;
850 + d2 = (u128)h0 * r2;
851 + d = (u128)h1 * r1;
852 + d2 += d;
853 + d = (u128)h2 * r0;
854 + d2 += d;
855 +
856 + /* (partial) h %= p */
857 + c = (u64)(d0 >> 44);
858 + h0 = (u64)d0 & 0xfffffffffffULL;
859 + d1 += c;
860 + c = (u64)(d1 >> 44);
861 + h1 = (u64)d1 & 0xfffffffffffULL;
862 + d2 += c;
863 + c = (u64)(d2 >> 42);
864 + h2 = (u64)d2 & 0x3ffffffffffULL;
865 + h0 += c * 5;
866 + c = h0 >> 44;
867 + h0 = h0 & 0xfffffffffffULL;
868 + h1 += c;
869 +
870 + input += POLY1305_BLOCK_SIZE;
871 + } while (--nblocks);
872 +
873 + state->h64[0] = h0;
874 + state->h64[1] = h1;
875 + state->h64[2] = h2;
876 +}
877 +EXPORT_SYMBOL(poly1305_core_blocks);
878 +
879 +void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
880 + void *dst)
881 +{
882 + u8 *mac = dst;
883 + u64 h0, h1, h2, c;
884 + u64 g0, g1, g2;
885 + u64 t0, t1;
886 +
887 + /* fully carry h */
888 + h0 = state->h64[0];
889 + h1 = state->h64[1];
890 + h2 = state->h64[2];
891 +
892 + c = h1 >> 44;
893 + h1 &= 0xfffffffffffULL;
894 + h2 += c;
895 + c = h2 >> 42;
896 + h2 &= 0x3ffffffffffULL;
897 + h0 += c * 5;
898 + c = h0 >> 44;
899 + h0 &= 0xfffffffffffULL;
900 + h1 += c;
901 + c = h1 >> 44;
902 + h1 &= 0xfffffffffffULL;
903 + h2 += c;
904 + c = h2 >> 42;
905 + h2 &= 0x3ffffffffffULL;
906 + h0 += c * 5;
907 + c = h0 >> 44;
908 + h0 &= 0xfffffffffffULL;
909 + h1 += c;
910 +
911 + /* compute h + -p */
912 + g0 = h0 + 5;
913 + c = g0 >> 44;
914 + g0 &= 0xfffffffffffULL;
915 + g1 = h1 + c;
916 + c = g1 >> 44;
917 + g1 &= 0xfffffffffffULL;
918 + g2 = h2 + c - (1ULL << 42);
919 +
920 + /* select h if h < p, or h + -p if h >= p */
921 + c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;
922 + g0 &= c;
923 + g1 &= c;
924 + g2 &= c;
925 + c = ~c;
926 + h0 = (h0 & c) | g0;
927 + h1 = (h1 & c) | g1;
928 + h2 = (h2 & c) | g2;
929 +
930 + if (likely(nonce)) {
931 + /* h = (h + nonce) */
932 + t0 = ((u64)nonce[1] << 32) | nonce[0];
933 + t1 = ((u64)nonce[3] << 32) | nonce[2];
934 +
935 + h0 += t0 & 0xfffffffffffULL;
936 + c = h0 >> 44;
937 + h0 &= 0xfffffffffffULL;
938 + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c;
939 + c = h1 >> 44;
940 + h1 &= 0xfffffffffffULL;
941 + h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c;
942 + h2 &= 0x3ffffffffffULL;
943 + }
944 +
945 + /* mac = h % (2^128) */
946 + h0 = h0 | (h1 << 44);
947 + h1 = (h1 >> 20) | (h2 << 24);
948 +
949 + put_unaligned_le64(h0, &mac[0]);
950 + put_unaligned_le64(h1, &mac[8]);
951 +}
952 +EXPORT_SYMBOL(poly1305_core_emit);
953 --- a/lib/crypto/poly1305.c
954 +++ b/lib/crypto/poly1305.c
955 @@ -12,151 +12,9 @@
956 #include <linux/module.h>
957 #include <asm/unaligned.h>
958
959 -static inline u64 mlt(u64 a, u64 b)
960 -{
961 - return a * b;
962 -}
963 -
964 -static inline u32 sr(u64 v, u_char n)
965 -{
966 - return v >> n;
967 -}
968 -
969 -static inline u32 and(u32 v, u32 mask)
970 -{
971 - return v & mask;
972 -}
973 -
974 -void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
975 -{
976 - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
977 - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
978 - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
979 - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
980 - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
981 - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
982 -}
983 -EXPORT_SYMBOL_GPL(poly1305_core_setkey);
984 -
985 -void poly1305_core_blocks(struct poly1305_state *state,
986 - const struct poly1305_key *key, const void *src,
987 - unsigned int nblocks, u32 hibit)
988 -{
989 - u32 r0, r1, r2, r3, r4;
990 - u32 s1, s2, s3, s4;
991 - u32 h0, h1, h2, h3, h4;
992 - u64 d0, d1, d2, d3, d4;
993 -
994 - if (!nblocks)
995 - return;
996 -
997 - r0 = key->r[0];
998 - r1 = key->r[1];
999 - r2 = key->r[2];
1000 - r3 = key->r[3];
1001 - r4 = key->r[4];
1002 -
1003 - s1 = r1 * 5;
1004 - s2 = r2 * 5;
1005 - s3 = r3 * 5;
1006 - s4 = r4 * 5;
1007 -
1008 - h0 = state->h[0];
1009 - h1 = state->h[1];
1010 - h2 = state->h[2];
1011 - h3 = state->h[3];
1012 - h4 = state->h[4];
1013 -
1014 - do {
1015 - /* h += m[i] */
1016 - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
1017 - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
1018 - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
1019 - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
1020 - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
1021 -
1022 - /* h *= r */
1023 - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
1024 - mlt(h3, s2) + mlt(h4, s1);
1025 - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
1026 - mlt(h3, s3) + mlt(h4, s2);
1027 - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
1028 - mlt(h3, s4) + mlt(h4, s3);
1029 - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
1030 - mlt(h3, r0) + mlt(h4, s4);
1031 - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
1032 - mlt(h3, r1) + mlt(h4, r0);
1033 -
1034 - /* (partial) h %= p */
1035 - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
1036 - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
1037 - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
1038 - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
1039 - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
1040 - h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
1041 -
1042 - src += POLY1305_BLOCK_SIZE;
1043 - } while (--nblocks);
1044 -
1045 - state->h[0] = h0;
1046 - state->h[1] = h1;
1047 - state->h[2] = h2;
1048 - state->h[3] = h3;
1049 - state->h[4] = h4;
1050 -}
1051 -EXPORT_SYMBOL_GPL(poly1305_core_blocks);
1052 -
1053 -void poly1305_core_emit(const struct poly1305_state *state, void *dst)
1054 -{
1055 - u32 h0, h1, h2, h3, h4;
1056 - u32 g0, g1, g2, g3, g4;
1057 - u32 mask;
1058 -
1059 - /* fully carry h */
1060 - h0 = state->h[0];
1061 - h1 = state->h[1];
1062 - h2 = state->h[2];
1063 - h3 = state->h[3];
1064 - h4 = state->h[4];
1065 -
1066 - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
1067 - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
1068 - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
1069 - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
1070 - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
1071 -
1072 - /* compute h + -p */
1073 - g0 = h0 + 5;
1074 - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
1075 - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
1076 - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
1077 - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
1078 -
1079 - /* select h if h < p, or h + -p if h >= p */
1080 - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
1081 - g0 &= mask;
1082 - g1 &= mask;
1083 - g2 &= mask;
1084 - g3 &= mask;
1085 - g4 &= mask;
1086 - mask = ~mask;
1087 - h0 = (h0 & mask) | g0;
1088 - h1 = (h1 & mask) | g1;
1089 - h2 = (h2 & mask) | g2;
1090 - h3 = (h3 & mask) | g3;
1091 - h4 = (h4 & mask) | g4;
1092 -
1093 - /* h = h % (2^128) */
1094 - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
1095 - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
1096 - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
1097 - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
1098 -}
1099 -EXPORT_SYMBOL_GPL(poly1305_core_emit);
1100 -
1101 void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key)
1102 {
1103 - poly1305_core_setkey(desc->r, key);
1104 + poly1305_core_setkey(&desc->core_r, key);
1105 desc->s[0] = get_unaligned_le32(key + 16);
1106 desc->s[1] = get_unaligned_le32(key + 20);
1107 desc->s[2] = get_unaligned_le32(key + 24);
1108 @@ -164,7 +22,7 @@ void poly1305_init_generic(struct poly13
1109 poly1305_core_init(&desc->h);
1110 desc->buflen = 0;
1111 desc->sset = true;
1112 - desc->rset = 1;
1113 + desc->rset = 2;
1114 }
1115 EXPORT_SYMBOL_GPL(poly1305_init_generic);
1116
1117 @@ -181,13 +39,14 @@ void poly1305_update_generic(struct poly
1118 desc->buflen += bytes;
1119
1120 if (desc->buflen == POLY1305_BLOCK_SIZE) {
1121 - poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 1);
1122 + poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf,
1123 + 1, 1);
1124 desc->buflen = 0;
1125 }
1126 }
1127
1128 if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
1129 - poly1305_core_blocks(&desc->h, desc->r, src,
1130 + poly1305_core_blocks(&desc->h, &desc->core_r, src,
1131 nbytes / POLY1305_BLOCK_SIZE, 1);
1132 src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
1133 nbytes %= POLY1305_BLOCK_SIZE;
1134 @@ -202,28 +61,14 @@ EXPORT_SYMBOL_GPL(poly1305_update_generi
1135
1136 void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
1137 {
1138 - __le32 digest[4];
1139 - u64 f = 0;
1140 -
1141 if (unlikely(desc->buflen)) {
1142 desc->buf[desc->buflen++] = 1;
1143 memset(desc->buf + desc->buflen, 0,
1144 POLY1305_BLOCK_SIZE - desc->buflen);
1145 - poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 0);
1146 + poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0);
1147 }
1148
1149 - poly1305_core_emit(&desc->h, digest);
1150 -
1151 - /* mac = (h + s) % (2^128) */
1152 - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
1153 - put_unaligned_le32(f, dst + 0);
1154 - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
1155 - put_unaligned_le32(f, dst + 4);
1156 - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
1157 - put_unaligned_le32(f, dst + 8);
1158 - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
1159 - put_unaligned_le32(f, dst + 12);
1160 -
1161 + poly1305_core_emit(&desc->h, desc->s, dst);
1162 *desc = (struct poly1305_desc_ctx){};
1163 }
1164 EXPORT_SYMBOL_GPL(poly1305_final_generic);