kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0029-crypto-curve25519-x86_64-library-and-KPP-implementat.patch
1 From 0195e7650ebe0fdb5e1d5891274c203cb6cee0b6 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Fri, 8 Nov 2019 13:22:36 +0100
4 Subject: [PATCH 029/124] crypto: curve25519 - x86_64 library and KPP
5 implementations
6 MIME-Version: 1.0
7 Content-Type: text/plain; charset=UTF-8
8 Content-Transfer-Encoding: 8bit
9
10 commit bb611bdfd6be34d9f822c73305fcc83720499d38 upstream.
11
12 This implementation is the fastest available x86_64 implementation, and
13 unlike Sandy2x, it doesn't require use of the floating point registers at
14 all. Instead it makes use of BMI2 and ADX, available on recent
15 microarchitectures. The implementation was written by Armando
16 Faz-Hernández with contributions (upstream) from Samuel Neves and me,
17 in addition to further changes in the kernel implementation from us.
18
19 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
20 Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
21 Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
22 [ardb: - move to arch/x86/crypto
23 - wire into lib/crypto framework
24 - implement crypto API KPP hooks ]
25 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
26 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
27 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
28 ---
29 arch/x86/crypto/Makefile | 1 +
30 arch/x86/crypto/curve25519-x86_64.c | 2475 +++++++++++++++++++++++++++
31 crypto/Kconfig | 6 +
32 3 files changed, 2482 insertions(+)
33 create mode 100644 arch/x86/crypto/curve25519-x86_64.c
34
35 --- a/arch/x86/crypto/Makefile
36 +++ b/arch/x86/crypto/Makefile
37 @@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2)
38
39 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
40 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
41 +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
42
43 # These modules require assembler to support AVX.
44 ifeq ($(avx_supported),yes)
45 --- /dev/null
46 +++ b/arch/x86/crypto/curve25519-x86_64.c
47 @@ -0,0 +1,2475 @@
48 +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
49 +/*
50 + * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
51 + * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
52 + * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
53 + */
54 +
55 +#include <crypto/curve25519.h>
56 +#include <crypto/internal/kpp.h>
57 +
58 +#include <linux/types.h>
59 +#include <linux/jump_label.h>
60 +#include <linux/kernel.h>
61 +#include <linux/module.h>
62 +
63 +#include <asm/cpufeature.h>
64 +#include <asm/processor.h>
65 +
66 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
67 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
68 +
69 +enum { NUM_WORDS_ELTFP25519 = 4 };
70 +typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
71 +typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
72 +
73 +#define mul_eltfp25519_1w_adx(c, a, b) do { \
74 + mul_256x256_integer_adx(m.buffer, a, b); \
75 + red_eltfp25519_1w_adx(c, m.buffer); \
76 +} while (0)
77 +
78 +#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
79 + mul_256x256_integer_bmi2(m.buffer, a, b); \
80 + red_eltfp25519_1w_bmi2(c, m.buffer); \
81 +} while (0)
82 +
83 +#define sqr_eltfp25519_1w_adx(a) do { \
84 + sqr_256x256_integer_adx(m.buffer, a); \
85 + red_eltfp25519_1w_adx(a, m.buffer); \
86 +} while (0)
87 +
88 +#define sqr_eltfp25519_1w_bmi2(a) do { \
89 + sqr_256x256_integer_bmi2(m.buffer, a); \
90 + red_eltfp25519_1w_bmi2(a, m.buffer); \
91 +} while (0)
92 +
93 +#define mul_eltfp25519_2w_adx(c, a, b) do { \
94 + mul2_256x256_integer_adx(m.buffer, a, b); \
95 + red_eltfp25519_2w_adx(c, m.buffer); \
96 +} while (0)
97 +
98 +#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
99 + mul2_256x256_integer_bmi2(m.buffer, a, b); \
100 + red_eltfp25519_2w_bmi2(c, m.buffer); \
101 +} while (0)
102 +
103 +#define sqr_eltfp25519_2w_adx(a) do { \
104 + sqr2_256x256_integer_adx(m.buffer, a); \
105 + red_eltfp25519_2w_adx(a, m.buffer); \
106 +} while (0)
107 +
108 +#define sqr_eltfp25519_2w_bmi2(a) do { \
109 + sqr2_256x256_integer_bmi2(m.buffer, a); \
110 + red_eltfp25519_2w_bmi2(a, m.buffer); \
111 +} while (0)
112 +
113 +#define sqrn_eltfp25519_1w_adx(a, times) do { \
114 + int ____counter = (times); \
115 + while (____counter-- > 0) \
116 + sqr_eltfp25519_1w_adx(a); \
117 +} while (0)
118 +
119 +#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
120 + int ____counter = (times); \
121 + while (____counter-- > 0) \
122 + sqr_eltfp25519_1w_bmi2(a); \
123 +} while (0)
124 +
125 +#define copy_eltfp25519_1w(C, A) do { \
126 + (C)[0] = (A)[0]; \
127 + (C)[1] = (A)[1]; \
128 + (C)[2] = (A)[2]; \
129 + (C)[3] = (A)[3]; \
130 +} while (0)
131 +
132 +#define setzero_eltfp25519_1w(C) do { \
133 + (C)[0] = 0; \
134 + (C)[1] = 0; \
135 + (C)[2] = 0; \
136 + (C)[3] = 0; \
137 +} while (0)
138 +
139 +__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
140 + /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
141 + 0xffffffffffffffffUL, 0x5fffffffffffffffUL,
142 + /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
143 + 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
144 + /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
145 + 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
146 + /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
147 + 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
148 + /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
149 + 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
150 + /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
151 + 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
152 + /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
153 + 0xc1c20d06231f7614UL, 0x2938218da274f972UL,
154 + /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
155 + 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
156 + /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
157 + 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
158 + /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
159 + 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
160 + /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
161 + 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
162 + /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
163 + 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
164 + /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
165 + 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
166 + /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
167 + 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
168 + /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
169 + 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
170 + /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
171 + 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
172 + /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
173 + 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
174 + /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
175 + 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
176 + /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
177 + 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
178 + /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
179 + 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
180 + /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
181 + 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
182 + /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
183 + 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
184 + /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
185 + 0x23758739f630a257UL, 0x295a407a01a78580UL,
186 + /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
187 + 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
188 + /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
189 + 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
190 + /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
191 + 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
192 + /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
193 + 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
194 + /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
195 + 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
196 + /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
197 + 0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
198 + /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
199 + 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
200 + /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
201 + 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
202 + /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
203 + 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
204 + /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
205 + 0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
206 + /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
207 + 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
208 + /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
209 + 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
210 + /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
211 + 0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
212 + /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
213 + 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
214 + /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
215 + 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
216 + /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
217 + 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
218 + /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
219 + 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
220 + /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
221 + 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
222 + /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
223 + 0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
224 + /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
225 + 0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
226 + /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
227 + 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
228 + /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
229 + 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
230 + /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
231 + 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
232 + /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
233 + 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
234 + /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
235 + 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
236 + /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
237 + 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
238 + /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
239 + 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
240 + /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
241 + 0xc189218075e91436UL, 0x6d9284169b3b8484UL,
242 + /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
243 + 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
244 + /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
245 + 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
246 + /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
247 + 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
248 + /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
249 + 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
250 + /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
251 + 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
252 + /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
253 + 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
254 + /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
255 + 0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
256 + /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
257 + 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
258 + /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
259 + 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
260 + /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
261 + 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
262 + /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
263 + 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
264 + /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
265 + 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
266 + /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
267 + 0x25232973322dbef4UL, 0x445dc4758c17f770UL,
268 + /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
269 + 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
270 + /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
271 + 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
272 + /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
273 + 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
274 + /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
275 + 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
276 + /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
277 + 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
278 + /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
279 + 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
280 + /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
281 + 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
282 + /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
283 + 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
284 + /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
285 + 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
286 + /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
287 + 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
288 + /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
289 + 0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
290 + /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
291 + 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
292 + /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
293 + 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
294 + /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
295 + 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
296 + /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
297 + 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
298 + /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
299 + 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
300 + /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
301 + 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
302 + /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
303 + 0x894d1d855ae52359UL, 0x68e122157b743d69UL,
304 + /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
305 + 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
306 + /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
307 + 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
308 + /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
309 + 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
310 + /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
311 + 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
312 + /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
313 + 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
314 + /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
315 + 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
316 + /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
317 + 0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
318 + /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
319 + 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
320 + /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
321 + 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
322 + /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
323 + 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
324 + /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
325 + 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
326 + /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
327 + 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
328 + /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
329 + 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
330 + /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
331 + 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
332 + /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
333 + 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
334 + /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
335 + 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
336 + /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
337 + 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
338 + /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
339 + 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
340 + /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
341 + 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
342 + /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
343 + 0x4a497962066e6043UL, 0x705b3aab41355b44UL,
344 + /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
345 + 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
346 + /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
347 + 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
348 + /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
349 + 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
350 + /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
351 + 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
352 + /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
353 + 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
354 + /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
355 + 0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
356 + /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
357 + 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
358 + /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
359 + 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
360 + /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
361 + 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
362 + /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
363 + 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
364 + /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
365 + 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
366 + /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
367 + 0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
368 + /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
369 + 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
370 + /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
371 + 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
372 + /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
373 + 0x508e862f121692fcUL, 0x3a81907fa093c291UL,
374 + /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
375 + 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
376 + /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
377 + 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
378 + /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
379 + 0xe488de11d761e352UL, 0x0e878a01a085545cUL,
380 + /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
381 + 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
382 + /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
383 + 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
384 + /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
385 + 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
386 + /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
387 + 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
388 + /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
389 + 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
390 + /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
391 + 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
392 + /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
393 + 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
394 + /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
395 + 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
396 + /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
397 + 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
398 + /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
399 + 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
400 + /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
401 + 0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
402 + /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
403 + 0x266fd5809208f294UL, 0x5c847085619a26b9UL,
404 + /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
405 + 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
406 + /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
407 + 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
408 + /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
409 + 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
410 + /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
411 + 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
412 + /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
413 + 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
414 + /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
415 + 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
416 + /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
417 + 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
418 + /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
419 + 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
420 + /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
421 + 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
422 + /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
423 + 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
424 + /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
425 + 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
426 + /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
427 + 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
428 + /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
429 + 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
430 + /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
431 + 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
432 + /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
433 + 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
434 + /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
435 + 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
436 + /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
437 + 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
438 + /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
439 + 0x52d17436309d4253UL, 0x356f97e13efae576UL,
440 + /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
441 + 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
442 + /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
443 + 0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
444 + /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
445 + 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
446 + /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
447 + 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
448 + /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
449 + 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
450 + /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
451 + 0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
452 + /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
453 + 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
454 + /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
455 + 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
456 + /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
457 + 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
458 + /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
459 + 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
460 + /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
461 + 0x497d723f802e88e1UL, 0x30684dea602f408dUL,
462 + /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
463 + 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
464 + /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
465 + 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
466 + /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
467 + 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
468 + /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
469 + 0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
470 + /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
471 + 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
472 + /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
473 + 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
474 + /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
475 + 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
476 + /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
477 + 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
478 + /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
479 + 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
480 + /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
481 + 0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
482 + /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
483 + 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
484 + /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
485 + 0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
486 + /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
487 + 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
488 + /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
489 + 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
490 + /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
491 + 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
492 + /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
493 + 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
494 + /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
495 + 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
496 + /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
497 + 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
498 + /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
499 + 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
500 + /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
501 + 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
502 + /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
503 + 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
504 + /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
505 + 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
506 + /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
507 + 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
508 + /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
509 + 0x81004b71e33cc191UL, 0x44e6be345122803cUL,
510 + /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
511 + 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
512 + /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
513 + 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
514 + /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
515 + 0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
516 + /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
517 + 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
518 + /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
519 + 0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
520 + /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
521 + 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
522 + /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
523 + 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
524 + /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
525 + 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
526 + /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
527 + 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
528 + /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
529 + 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
530 + /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
531 + 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
532 + /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
533 + 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
534 + /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
535 + 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
536 + /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
537 + 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
538 + /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
539 + 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
540 + /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
541 + 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
542 + /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
543 + 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
544 + /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
545 + 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
546 + /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
547 + 0x33979624f0e917beUL, 0x2c018dc527356b30UL,
548 + /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
549 + 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
550 + /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
551 + 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
552 + /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
553 + 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
554 + /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
555 + 0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
556 + /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
557 + 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
558 + /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
559 + 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
560 + /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
561 + 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
562 + /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
563 + 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
564 + /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
565 + 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
566 + /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
567 + 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
568 + /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
569 + 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
570 + /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
571 + 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
572 + /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
573 + 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
574 + /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
575 + 0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
576 + /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
577 + 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
578 + /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
579 + 0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
580 + /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
581 + 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
582 + /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
583 + 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
584 + /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
585 + 0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
586 + /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
587 + 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
588 + /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
589 + 0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
590 + /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
591 + 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
592 + /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
593 + 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
594 + /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
595 + 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
596 + /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
597 + 0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
598 + /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
599 + 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
600 + /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
601 + 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
602 + /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
603 + 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
604 + /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
605 + 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
606 + /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
607 + 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
608 + /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
609 + 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
610 + /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
611 + 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
612 + /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
613 + 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
614 + /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
615 + 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
616 + /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
617 + 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
618 + /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
619 + 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
620 + /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
621 + 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
622 + /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
623 + 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
624 + /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
625 + 0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
626 + /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
627 + 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
628 + /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
629 + 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
630 + /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
631 + 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
632 + /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
633 + 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
634 + /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
635 + 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
636 + /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
637 + 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
638 + /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
639 + 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
640 + /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
641 + 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
642 + /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
643 + 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
644 +};
645 +
646 +/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
647 + * a is two 256-bit integers: a0[0:3] and a1[4:7]
648 + * b is two 256-bit integers: b0[0:3] and b1[4:7]
649 + */
650 +static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
651 + const u64 *const b)
652 +{
653 + asm volatile(
654 + "xorl %%r14d, %%r14d ;"
655 + "movq (%1), %%rdx; " /* A[0] */
656 + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
657 + "xorl %%r10d, %%r10d ;"
658 + "movq %%r8, (%0) ;"
659 + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
660 + "adox %%r10, %%r15 ;"
661 + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
662 + "adox %%r8, %%rax ;"
663 + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
664 + "adox %%r10, %%rbx ;"
665 + /******************************************/
666 + "adox %%r14, %%rcx ;"
667 +
668 + "movq 8(%1), %%rdx; " /* A[1] */
669 + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
670 + "adox %%r15, %%r8 ;"
671 + "movq %%r8, 8(%0) ;"
672 + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
673 + "adox %%r10, %%r9 ;"
674 + "adcx %%r9, %%rax ;"
675 + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
676 + "adox %%r8, %%r11 ;"
677 + "adcx %%r11, %%rbx ;"
678 + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
679 + "adox %%r10, %%r13 ;"
680 + "adcx %%r13, %%rcx ;"
681 + /******************************************/
682 + "adox %%r14, %%r15 ;"
683 + "adcx %%r14, %%r15 ;"
684 +
685 + "movq 16(%1), %%rdx; " /* A[2] */
686 + "xorl %%r10d, %%r10d ;"
687 + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
688 + "adox %%rax, %%r8 ;"
689 + "movq %%r8, 16(%0) ;"
690 + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
691 + "adox %%r10, %%r9 ;"
692 + "adcx %%r9, %%rbx ;"
693 + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
694 + "adox %%r8, %%r11 ;"
695 + "adcx %%r11, %%rcx ;"
696 + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
697 + "adox %%r10, %%r13 ;"
698 + "adcx %%r13, %%r15 ;"
699 + /******************************************/
700 + "adox %%r14, %%rax ;"
701 + "adcx %%r14, %%rax ;"
702 +
703 + "movq 24(%1), %%rdx; " /* A[3] */
704 + "xorl %%r10d, %%r10d ;"
705 + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
706 + "adox %%rbx, %%r8 ;"
707 + "movq %%r8, 24(%0) ;"
708 + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
709 + "adox %%r10, %%r9 ;"
710 + "adcx %%r9, %%rcx ;"
711 + "movq %%rcx, 32(%0) ;"
712 + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
713 + "adox %%r8, %%r11 ;"
714 + "adcx %%r11, %%r15 ;"
715 + "movq %%r15, 40(%0) ;"
716 + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
717 + "adox %%r10, %%r13 ;"
718 + "adcx %%r13, %%rax ;"
719 + "movq %%rax, 48(%0) ;"
720 + /******************************************/
721 + "adox %%r14, %%rbx ;"
722 + "adcx %%r14, %%rbx ;"
723 + "movq %%rbx, 56(%0) ;"
724 +
725 + "movq 32(%1), %%rdx; " /* C[0] */
726 + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
727 + "xorl %%r10d, %%r10d ;"
728 + "movq %%r8, 64(%0);"
729 + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
730 + "adox %%r10, %%r15 ;"
731 + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
732 + "adox %%r8, %%rax ;"
733 + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
734 + "adox %%r10, %%rbx ;"
735 + /******************************************/
736 + "adox %%r14, %%rcx ;"
737 +
738 + "movq 40(%1), %%rdx; " /* C[1] */
739 + "xorl %%r10d, %%r10d ;"
740 + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
741 + "adox %%r15, %%r8 ;"
742 + "movq %%r8, 72(%0);"
743 + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
744 + "adox %%r10, %%r9 ;"
745 + "adcx %%r9, %%rax ;"
746 + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
747 + "adox %%r8, %%r11 ;"
748 + "adcx %%r11, %%rbx ;"
749 + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
750 + "adox %%r10, %%r13 ;"
751 + "adcx %%r13, %%rcx ;"
752 + /******************************************/
753 + "adox %%r14, %%r15 ;"
754 + "adcx %%r14, %%r15 ;"
755 +
756 + "movq 48(%1), %%rdx; " /* C[2] */
757 + "xorl %%r10d, %%r10d ;"
758 + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
759 + "adox %%rax, %%r8 ;"
760 + "movq %%r8, 80(%0);"
761 + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
762 + "adox %%r10, %%r9 ;"
763 + "adcx %%r9, %%rbx ;"
764 + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
765 + "adox %%r8, %%r11 ;"
766 + "adcx %%r11, %%rcx ;"
767 + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
768 + "adox %%r10, %%r13 ;"
769 + "adcx %%r13, %%r15 ;"
770 + /******************************************/
771 + "adox %%r14, %%rax ;"
772 + "adcx %%r14, %%rax ;"
773 +
774 + "movq 56(%1), %%rdx; " /* C[3] */
775 + "xorl %%r10d, %%r10d ;"
776 + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
777 + "adox %%rbx, %%r8 ;"
778 + "movq %%r8, 88(%0);"
779 + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
780 + "adox %%r10, %%r9 ;"
781 + "adcx %%r9, %%rcx ;"
782 + "movq %%rcx, 96(%0) ;"
783 + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
784 + "adox %%r8, %%r11 ;"
785 + "adcx %%r11, %%r15 ;"
786 + "movq %%r15, 104(%0) ;"
787 + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
788 + "adox %%r10, %%r13 ;"
789 + "adcx %%r13, %%rax ;"
790 + "movq %%rax, 112(%0) ;"
791 + /******************************************/
792 + "adox %%r14, %%rbx ;"
793 + "adcx %%r14, %%rbx ;"
794 + "movq %%rbx, 120(%0) ;"
795 + :
796 + : "r"(c), "r"(a), "r"(b)
797 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
798 + "%r10", "%r11", "%r13", "%r14", "%r15");
799 +}
800 +
801 +static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
802 + const u64 *const b)
803 +{
804 + asm volatile(
805 + "movq (%1), %%rdx; " /* A[0] */
806 + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
807 + "movq %%r8, (%0) ;"
808 + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
809 + "addq %%r10, %%r15 ;"
810 + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
811 + "adcq %%r8, %%rax ;"
812 + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
813 + "adcq %%r10, %%rbx ;"
814 + /******************************************/
815 + "adcq $0, %%rcx ;"
816 +
817 + "movq 8(%1), %%rdx; " /* A[1] */
818 + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
819 + "addq %%r15, %%r8 ;"
820 + "movq %%r8, 8(%0) ;"
821 + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
822 + "adcq %%r10, %%r9 ;"
823 + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
824 + "adcq %%r8, %%r11 ;"
825 + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
826 + "adcq %%r10, %%r13 ;"
827 + /******************************************/
828 + "adcq $0, %%r15 ;"
829 +
830 + "addq %%r9, %%rax ;"
831 + "adcq %%r11, %%rbx ;"
832 + "adcq %%r13, %%rcx ;"
833 + "adcq $0, %%r15 ;"
834 +
835 + "movq 16(%1), %%rdx; " /* A[2] */
836 + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
837 + "addq %%rax, %%r8 ;"
838 + "movq %%r8, 16(%0) ;"
839 + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
840 + "adcq %%r10, %%r9 ;"
841 + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
842 + "adcq %%r8, %%r11 ;"
843 + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
844 + "adcq %%r10, %%r13 ;"
845 + /******************************************/
846 + "adcq $0, %%rax ;"
847 +
848 + "addq %%r9, %%rbx ;"
849 + "adcq %%r11, %%rcx ;"
850 + "adcq %%r13, %%r15 ;"
851 + "adcq $0, %%rax ;"
852 +
853 + "movq 24(%1), %%rdx; " /* A[3] */
854 + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
855 + "addq %%rbx, %%r8 ;"
856 + "movq %%r8, 24(%0) ;"
857 + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
858 + "adcq %%r10, %%r9 ;"
859 + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
860 + "adcq %%r8, %%r11 ;"
861 + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
862 + "adcq %%r10, %%r13 ;"
863 + /******************************************/
864 + "adcq $0, %%rbx ;"
865 +
866 + "addq %%r9, %%rcx ;"
867 + "movq %%rcx, 32(%0) ;"
868 + "adcq %%r11, %%r15 ;"
869 + "movq %%r15, 40(%0) ;"
870 + "adcq %%r13, %%rax ;"
871 + "movq %%rax, 48(%0) ;"
872 + "adcq $0, %%rbx ;"
873 + "movq %%rbx, 56(%0) ;"
874 +
875 + "movq 32(%1), %%rdx; " /* C[0] */
876 + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
877 + "movq %%r8, 64(%0) ;"
878 + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
879 + "addq %%r10, %%r15 ;"
880 + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
881 + "adcq %%r8, %%rax ;"
882 + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
883 + "adcq %%r10, %%rbx ;"
884 + /******************************************/
885 + "adcq $0, %%rcx ;"
886 +
887 + "movq 40(%1), %%rdx; " /* C[1] */
888 + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
889 + "addq %%r15, %%r8 ;"
890 + "movq %%r8, 72(%0) ;"
891 + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
892 + "adcq %%r10, %%r9 ;"
893 + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
894 + "adcq %%r8, %%r11 ;"
895 + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
896 + "adcq %%r10, %%r13 ;"
897 + /******************************************/
898 + "adcq $0, %%r15 ;"
899 +
900 + "addq %%r9, %%rax ;"
901 + "adcq %%r11, %%rbx ;"
902 + "adcq %%r13, %%rcx ;"
903 + "adcq $0, %%r15 ;"
904 +
905 + "movq 48(%1), %%rdx; " /* C[2] */
906 + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
907 + "addq %%rax, %%r8 ;"
908 + "movq %%r8, 80(%0) ;"
909 + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
910 + "adcq %%r10, %%r9 ;"
911 + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
912 + "adcq %%r8, %%r11 ;"
913 + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
914 + "adcq %%r10, %%r13 ;"
915 + /******************************************/
916 + "adcq $0, %%rax ;"
917 +
918 + "addq %%r9, %%rbx ;"
919 + "adcq %%r11, %%rcx ;"
920 + "adcq %%r13, %%r15 ;"
921 + "adcq $0, %%rax ;"
922 +
923 + "movq 56(%1), %%rdx; " /* C[3] */
924 + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
925 + "addq %%rbx, %%r8 ;"
926 + "movq %%r8, 88(%0) ;"
927 + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
928 + "adcq %%r10, %%r9 ;"
929 + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
930 + "adcq %%r8, %%r11 ;"
931 + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
932 + "adcq %%r10, %%r13 ;"
933 + /******************************************/
934 + "adcq $0, %%rbx ;"
935 +
936 + "addq %%r9, %%rcx ;"
937 + "movq %%rcx, 96(%0) ;"
938 + "adcq %%r11, %%r15 ;"
939 + "movq %%r15, 104(%0) ;"
940 + "adcq %%r13, %%rax ;"
941 + "movq %%rax, 112(%0) ;"
942 + "adcq $0, %%rbx ;"
943 + "movq %%rbx, 120(%0) ;"
944 + :
945 + : "r"(c), "r"(a), "r"(b)
946 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
947 + "%r10", "%r11", "%r13", "%r15");
948 +}
949 +
950 +static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
951 +{
952 + asm volatile(
953 + "movq (%1), %%rdx ;" /* A[0] */
954 + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
955 + "xorl %%r15d, %%r15d;"
956 + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
957 + "adcx %%r14, %%r9 ;"
958 + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
959 + "adcx %%rax, %%r10 ;"
960 + "movq 24(%1), %%rdx ;" /* A[3] */
961 + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
962 + "adcx %%rcx, %%r11 ;"
963 + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
964 + "adcx %%rax, %%rbx ;"
965 + "movq 8(%1), %%rdx ;" /* A[1] */
966 + "adcx %%r15, %%r13 ;"
967 + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
968 + "movq $0, %%r14 ;"
969 + /******************************************/
970 + "adcx %%r15, %%r14 ;"
971 +
972 + "xorl %%r15d, %%r15d;"
973 + "adox %%rax, %%r10 ;"
974 + "adcx %%r8, %%r8 ;"
975 + "adox %%rcx, %%r11 ;"
976 + "adcx %%r9, %%r9 ;"
977 + "adox %%r15, %%rbx ;"
978 + "adcx %%r10, %%r10 ;"
979 + "adox %%r15, %%r13 ;"
980 + "adcx %%r11, %%r11 ;"
981 + "adox %%r15, %%r14 ;"
982 + "adcx %%rbx, %%rbx ;"
983 + "adcx %%r13, %%r13 ;"
984 + "adcx %%r14, %%r14 ;"
985 +
986 + "movq (%1), %%rdx ;"
987 + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
988 + /*******************/
989 + "movq %%rax, 0(%0) ;"
990 + "addq %%rcx, %%r8 ;"
991 + "movq %%r8, 8(%0) ;"
992 + "movq 8(%1), %%rdx ;"
993 + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
994 + "adcq %%rax, %%r9 ;"
995 + "movq %%r9, 16(%0) ;"
996 + "adcq %%rcx, %%r10 ;"
997 + "movq %%r10, 24(%0) ;"
998 + "movq 16(%1), %%rdx ;"
999 + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1000 + "adcq %%rax, %%r11 ;"
1001 + "movq %%r11, 32(%0) ;"
1002 + "adcq %%rcx, %%rbx ;"
1003 + "movq %%rbx, 40(%0) ;"
1004 + "movq 24(%1), %%rdx ;"
1005 + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1006 + "adcq %%rax, %%r13 ;"
1007 + "movq %%r13, 48(%0) ;"
1008 + "adcq %%rcx, %%r14 ;"
1009 + "movq %%r14, 56(%0) ;"
1010 +
1011 +
1012 + "movq 32(%1), %%rdx ;" /* B[0] */
1013 + "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
1014 + "xorl %%r15d, %%r15d;"
1015 + "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
1016 + "adcx %%r14, %%r9 ;"
1017 + "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
1018 + "adcx %%rax, %%r10 ;"
1019 + "movq 56(%1), %%rdx ;" /* B[3] */
1020 + "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
1021 + "adcx %%rcx, %%r11 ;"
1022 + "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
1023 + "adcx %%rax, %%rbx ;"
1024 + "movq 40(%1), %%rdx ;" /* B[1] */
1025 + "adcx %%r15, %%r13 ;"
1026 + "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
1027 + "movq $0, %%r14 ;"
1028 + /******************************************/
1029 + "adcx %%r15, %%r14 ;"
1030 +
1031 + "xorl %%r15d, %%r15d;"
1032 + "adox %%rax, %%r10 ;"
1033 + "adcx %%r8, %%r8 ;"
1034 + "adox %%rcx, %%r11 ;"
1035 + "adcx %%r9, %%r9 ;"
1036 + "adox %%r15, %%rbx ;"
1037 + "adcx %%r10, %%r10 ;"
1038 + "adox %%r15, %%r13 ;"
1039 + "adcx %%r11, %%r11 ;"
1040 + "adox %%r15, %%r14 ;"
1041 + "adcx %%rbx, %%rbx ;"
1042 + "adcx %%r13, %%r13 ;"
1043 + "adcx %%r14, %%r14 ;"
1044 +
1045 + "movq 32(%1), %%rdx ;"
1046 + "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
1047 + /*******************/
1048 + "movq %%rax, 64(%0) ;"
1049 + "addq %%rcx, %%r8 ;"
1050 + "movq %%r8, 72(%0) ;"
1051 + "movq 40(%1), %%rdx ;"
1052 + "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
1053 + "adcq %%rax, %%r9 ;"
1054 + "movq %%r9, 80(%0) ;"
1055 + "adcq %%rcx, %%r10 ;"
1056 + "movq %%r10, 88(%0) ;"
1057 + "movq 48(%1), %%rdx ;"
1058 + "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
1059 + "adcq %%rax, %%r11 ;"
1060 + "movq %%r11, 96(%0) ;"
1061 + "adcq %%rcx, %%rbx ;"
1062 + "movq %%rbx, 104(%0) ;"
1063 + "movq 56(%1), %%rdx ;"
1064 + "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
1065 + "adcq %%rax, %%r13 ;"
1066 + "movq %%r13, 112(%0) ;"
1067 + "adcq %%rcx, %%r14 ;"
1068 + "movq %%r14, 120(%0) ;"
1069 + :
1070 + : "r"(c), "r"(a)
1071 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1072 + "%r10", "%r11", "%r13", "%r14", "%r15");
1073 +}
1074 +
1075 +static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1076 +{
1077 + asm volatile(
1078 + "movq 8(%1), %%rdx ;" /* A[1] */
1079 + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
1080 + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1081 + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1082 +
1083 + "movq 16(%1), %%rdx ;" /* A[2] */
1084 + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1085 + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1086 +
1087 + "addq %%rax, %%r9 ;"
1088 + "adcq %%rdx, %%r10 ;"
1089 + "adcq %%rcx, %%r11 ;"
1090 + "adcq %%r14, %%r15 ;"
1091 + "adcq $0, %%r13 ;"
1092 + "movq $0, %%r14 ;"
1093 + "adcq $0, %%r14 ;"
1094 +
1095 + "movq (%1), %%rdx ;" /* A[0] */
1096 + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1097 +
1098 + "addq %%rax, %%r10 ;"
1099 + "adcq %%rcx, %%r11 ;"
1100 + "adcq $0, %%r15 ;"
1101 + "adcq $0, %%r13 ;"
1102 + "adcq $0, %%r14 ;"
1103 +
1104 + "shldq $1, %%r13, %%r14 ;"
1105 + "shldq $1, %%r15, %%r13 ;"
1106 + "shldq $1, %%r11, %%r15 ;"
1107 + "shldq $1, %%r10, %%r11 ;"
1108 + "shldq $1, %%r9, %%r10 ;"
1109 + "shldq $1, %%r8, %%r9 ;"
1110 + "shlq $1, %%r8 ;"
1111 +
1112 + /*******************/
1113 + "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
1114 + /*******************/
1115 + "movq %%rax, 0(%0) ;"
1116 + "addq %%rcx, %%r8 ;"
1117 + "movq %%r8, 8(%0) ;"
1118 + "movq 8(%1), %%rdx ;"
1119 + "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
1120 + "adcq %%rax, %%r9 ;"
1121 + "movq %%r9, 16(%0) ;"
1122 + "adcq %%rcx, %%r10 ;"
1123 + "movq %%r10, 24(%0) ;"
1124 + "movq 16(%1), %%rdx ;"
1125 + "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
1126 + "adcq %%rax, %%r11 ;"
1127 + "movq %%r11, 32(%0) ;"
1128 + "adcq %%rcx, %%r15 ;"
1129 + "movq %%r15, 40(%0) ;"
1130 + "movq 24(%1), %%rdx ;"
1131 + "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
1132 + "adcq %%rax, %%r13 ;"
1133 + "movq %%r13, 48(%0) ;"
1134 + "adcq %%rcx, %%r14 ;"
1135 + "movq %%r14, 56(%0) ;"
1136 +
1137 + "movq 40(%1), %%rdx ;" /* B[1] */
1138 + "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
1139 + "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
1140 + "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
1141 +
1142 + "movq 48(%1), %%rdx ;" /* B[2] */
1143 + "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
1144 + "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
1145 +
1146 + "addq %%rax, %%r9 ;"
1147 + "adcq %%rdx, %%r10 ;"
1148 + "adcq %%rcx, %%r11 ;"
1149 + "adcq %%r14, %%r15 ;"
1150 + "adcq $0, %%r13 ;"
1151 + "movq $0, %%r14 ;"
1152 + "adcq $0, %%r14 ;"
1153 +
1154 + "movq 32(%1), %%rdx ;" /* B[0] */
1155 + "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
1156 +
1157 + "addq %%rax, %%r10 ;"
1158 + "adcq %%rcx, %%r11 ;"
1159 + "adcq $0, %%r15 ;"
1160 + "adcq $0, %%r13 ;"
1161 + "adcq $0, %%r14 ;"
1162 +
1163 + "shldq $1, %%r13, %%r14 ;"
1164 + "shldq $1, %%r15, %%r13 ;"
1165 + "shldq $1, %%r11, %%r15 ;"
1166 + "shldq $1, %%r10, %%r11 ;"
1167 + "shldq $1, %%r9, %%r10 ;"
1168 + "shldq $1, %%r8, %%r9 ;"
1169 + "shlq $1, %%r8 ;"
1170 +
1171 + /*******************/
1172 + "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
1173 + /*******************/
1174 + "movq %%rax, 64(%0) ;"
1175 + "addq %%rcx, %%r8 ;"
1176 + "movq %%r8, 72(%0) ;"
1177 + "movq 40(%1), %%rdx ;"
1178 + "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
1179 + "adcq %%rax, %%r9 ;"
1180 + "movq %%r9, 80(%0) ;"
1181 + "adcq %%rcx, %%r10 ;"
1182 + "movq %%r10, 88(%0) ;"
1183 + "movq 48(%1), %%rdx ;"
1184 + "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
1185 + "adcq %%rax, %%r11 ;"
1186 + "movq %%r11, 96(%0) ;"
1187 + "adcq %%rcx, %%r15 ;"
1188 + "movq %%r15, 104(%0) ;"
1189 + "movq 56(%1), %%rdx ;"
1190 + "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
1191 + "adcq %%rax, %%r13 ;"
1192 + "movq %%r13, 112(%0) ;"
1193 + "adcq %%rcx, %%r14 ;"
1194 + "movq %%r14, 120(%0) ;"
1195 + :
1196 + : "r"(c), "r"(a)
1197 + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1198 + "%r11", "%r13", "%r14", "%r15");
1199 +}
1200 +
1201 +static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
1202 +{
1203 + asm volatile(
1204 + "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
1205 + "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
1206 + "xorl %%ebx, %%ebx ;"
1207 + "adox (%1), %%r8 ;"
1208 + "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
1209 + "adcx %%r10, %%r9 ;"
1210 + "adox 8(%1), %%r9 ;"
1211 + "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
1212 + "adcx %%r11, %%r10 ;"
1213 + "adox 16(%1), %%r10 ;"
1214 + "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
1215 + "adcx %%rax, %%r11 ;"
1216 + "adox 24(%1), %%r11 ;"
1217 + /***************************************/
1218 + "adcx %%rbx, %%rcx ;"
1219 + "adox %%rbx, %%rcx ;"
1220 + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1221 + "adcx %%rcx, %%r8 ;"
1222 + "adcx %%rbx, %%r9 ;"
1223 + "movq %%r9, 8(%0) ;"
1224 + "adcx %%rbx, %%r10 ;"
1225 + "movq %%r10, 16(%0) ;"
1226 + "adcx %%rbx, %%r11 ;"
1227 + "movq %%r11, 24(%0) ;"
1228 + "mov $0, %%ecx ;"
1229 + "cmovc %%edx, %%ecx ;"
1230 + "addq %%rcx, %%r8 ;"
1231 + "movq %%r8, (%0) ;"
1232 +
1233 + "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
1234 + "xorl %%ebx, %%ebx ;"
1235 + "adox 64(%1), %%r8 ;"
1236 + "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
1237 + "adcx %%r10, %%r9 ;"
1238 + "adox 72(%1), %%r9 ;"
1239 + "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
1240 + "adcx %%r11, %%r10 ;"
1241 + "adox 80(%1), %%r10 ;"
1242 + "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
1243 + "adcx %%rax, %%r11 ;"
1244 + "adox 88(%1), %%r11 ;"
1245 + /****************************************/
1246 + "adcx %%rbx, %%rcx ;"
1247 + "adox %%rbx, %%rcx ;"
1248 + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1249 + "adcx %%rcx, %%r8 ;"
1250 + "adcx %%rbx, %%r9 ;"
1251 + "movq %%r9, 40(%0) ;"
1252 + "adcx %%rbx, %%r10 ;"
1253 + "movq %%r10, 48(%0) ;"
1254 + "adcx %%rbx, %%r11 ;"
1255 + "movq %%r11, 56(%0) ;"
1256 + "mov $0, %%ecx ;"
1257 + "cmovc %%edx, %%ecx ;"
1258 + "addq %%rcx, %%r8 ;"
1259 + "movq %%r8, 32(%0) ;"
1260 + :
1261 + : "r"(c), "r"(a)
1262 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1263 + "%r10", "%r11");
1264 +}
1265 +
1266 +static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
1267 +{
1268 + asm volatile(
1269 + "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
1270 + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1271 + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1272 + "addq %%r10, %%r9 ;"
1273 + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1274 + "adcq %%r11, %%r10 ;"
1275 + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1276 + "adcq %%rax, %%r11 ;"
1277 + /***************************************/
1278 + "adcq $0, %%rcx ;"
1279 + "addq (%1), %%r8 ;"
1280 + "adcq 8(%1), %%r9 ;"
1281 + "adcq 16(%1), %%r10 ;"
1282 + "adcq 24(%1), %%r11 ;"
1283 + "adcq $0, %%rcx ;"
1284 + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1285 + "addq %%rcx, %%r8 ;"
1286 + "adcq $0, %%r9 ;"
1287 + "movq %%r9, 8(%0) ;"
1288 + "adcq $0, %%r10 ;"
1289 + "movq %%r10, 16(%0) ;"
1290 + "adcq $0, %%r11 ;"
1291 + "movq %%r11, 24(%0) ;"
1292 + "mov $0, %%ecx ;"
1293 + "cmovc %%edx, %%ecx ;"
1294 + "addq %%rcx, %%r8 ;"
1295 + "movq %%r8, (%0) ;"
1296 +
1297 + "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
1298 + "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
1299 + "addq %%r10, %%r9 ;"
1300 + "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
1301 + "adcq %%r11, %%r10 ;"
1302 + "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
1303 + "adcq %%rax, %%r11 ;"
1304 + /****************************************/
1305 + "adcq $0, %%rcx ;"
1306 + "addq 64(%1), %%r8 ;"
1307 + "adcq 72(%1), %%r9 ;"
1308 + "adcq 80(%1), %%r10 ;"
1309 + "adcq 88(%1), %%r11 ;"
1310 + "adcq $0, %%rcx ;"
1311 + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1312 + "addq %%rcx, %%r8 ;"
1313 + "adcq $0, %%r9 ;"
1314 + "movq %%r9, 40(%0) ;"
1315 + "adcq $0, %%r10 ;"
1316 + "movq %%r10, 48(%0) ;"
1317 + "adcq $0, %%r11 ;"
1318 + "movq %%r11, 56(%0) ;"
1319 + "mov $0, %%ecx ;"
1320 + "cmovc %%edx, %%ecx ;"
1321 + "addq %%rcx, %%r8 ;"
1322 + "movq %%r8, 32(%0) ;"
1323 + :
1324 + : "r"(c), "r"(a)
1325 + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1326 + "%r11");
1327 +}
1328 +
1329 +static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
1330 + const u64 *const b)
1331 +{
1332 + asm volatile(
1333 + "movq (%1), %%rdx; " /* A[0] */
1334 + "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
1335 + "xorl %%r10d, %%r10d ;"
1336 + "movq %%r8, (%0) ;"
1337 + "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
1338 + "adox %%r9, %%r10 ;"
1339 + "movq %%r10, 8(%0) ;"
1340 + "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
1341 + "adox %%r11, %%r15 ;"
1342 + "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
1343 + "adox %%r13, %%r14 ;"
1344 + "movq $0, %%rax ;"
1345 + /******************************************/
1346 + "adox %%rdx, %%rax ;"
1347 +
1348 + "movq 8(%1), %%rdx; " /* A[1] */
1349 + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
1350 + "xorl %%r10d, %%r10d ;"
1351 + "adcx 8(%0), %%r8 ;"
1352 + "movq %%r8, 8(%0) ;"
1353 + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1354 + "adox %%r9, %%r10 ;"
1355 + "adcx %%r15, %%r10 ;"
1356 + "movq %%r10, 16(%0) ;"
1357 + "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
1358 + "adox %%r11, %%r15 ;"
1359 + "adcx %%r14, %%r15 ;"
1360 + "movq $0, %%r8 ;"
1361 + "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
1362 + "adox %%r13, %%r14 ;"
1363 + "adcx %%rax, %%r14 ;"
1364 + "movq $0, %%rax ;"
1365 + /******************************************/
1366 + "adox %%rdx, %%rax ;"
1367 + "adcx %%r8, %%rax ;"
1368 +
1369 + "movq 16(%1), %%rdx; " /* A[2] */
1370 + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
1371 + "xorl %%r10d, %%r10d ;"
1372 + "adcx 16(%0), %%r8 ;"
1373 + "movq %%r8, 16(%0) ;"
1374 + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1375 + "adox %%r9, %%r10 ;"
1376 + "adcx %%r15, %%r10 ;"
1377 + "movq %%r10, 24(%0) ;"
1378 + "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
1379 + "adox %%r11, %%r15 ;"
1380 + "adcx %%r14, %%r15 ;"
1381 + "movq $0, %%r8 ;"
1382 + "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
1383 + "adox %%r13, %%r14 ;"
1384 + "adcx %%rax, %%r14 ;"
1385 + "movq $0, %%rax ;"
1386 + /******************************************/
1387 + "adox %%rdx, %%rax ;"
1388 + "adcx %%r8, %%rax ;"
1389 +
1390 + "movq 24(%1), %%rdx; " /* A[3] */
1391 + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
1392 + "xorl %%r10d, %%r10d ;"
1393 + "adcx 24(%0), %%r8 ;"
1394 + "movq %%r8, 24(%0) ;"
1395 + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1396 + "adox %%r9, %%r10 ;"
1397 + "adcx %%r15, %%r10 ;"
1398 + "movq %%r10, 32(%0) ;"
1399 + "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
1400 + "adox %%r11, %%r15 ;"
1401 + "adcx %%r14, %%r15 ;"
1402 + "movq %%r15, 40(%0) ;"
1403 + "movq $0, %%r8 ;"
1404 + "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
1405 + "adox %%r13, %%r14 ;"
1406 + "adcx %%rax, %%r14 ;"
1407 + "movq %%r14, 48(%0) ;"
1408 + "movq $0, %%rax ;"
1409 + /******************************************/
1410 + "adox %%rdx, %%rax ;"
1411 + "adcx %%r8, %%rax ;"
1412 + "movq %%rax, 56(%0) ;"
1413 + :
1414 + : "r"(c), "r"(a), "r"(b)
1415 + : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
1416 + "%r13", "%r14", "%r15");
1417 +}
1418 +
1419 +static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
1420 + const u64 *const b)
1421 +{
1422 + asm volatile(
1423 + "movq (%1), %%rdx; " /* A[0] */
1424 + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
1425 + "movq %%r8, (%0) ;"
1426 + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
1427 + "addq %%r10, %%r15 ;"
1428 + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
1429 + "adcq %%r8, %%rax ;"
1430 + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
1431 + "adcq %%r10, %%rbx ;"
1432 + /******************************************/
1433 + "adcq $0, %%rcx ;"
1434 +
1435 + "movq 8(%1), %%rdx; " /* A[1] */
1436 + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
1437 + "addq %%r15, %%r8 ;"
1438 + "movq %%r8, 8(%0) ;"
1439 + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1440 + "adcq %%r10, %%r9 ;"
1441 + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
1442 + "adcq %%r8, %%r11 ;"
1443 + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
1444 + "adcq %%r10, %%r13 ;"
1445 + /******************************************/
1446 + "adcq $0, %%r15 ;"
1447 +
1448 + "addq %%r9, %%rax ;"
1449 + "adcq %%r11, %%rbx ;"
1450 + "adcq %%r13, %%rcx ;"
1451 + "adcq $0, %%r15 ;"
1452 +
1453 + "movq 16(%1), %%rdx; " /* A[2] */
1454 + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
1455 + "addq %%rax, %%r8 ;"
1456 + "movq %%r8, 16(%0) ;"
1457 + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1458 + "adcq %%r10, %%r9 ;"
1459 + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
1460 + "adcq %%r8, %%r11 ;"
1461 + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
1462 + "adcq %%r10, %%r13 ;"
1463 + /******************************************/
1464 + "adcq $0, %%rax ;"
1465 +
1466 + "addq %%r9, %%rbx ;"
1467 + "adcq %%r11, %%rcx ;"
1468 + "adcq %%r13, %%r15 ;"
1469 + "adcq $0, %%rax ;"
1470 +
1471 + "movq 24(%1), %%rdx; " /* A[3] */
1472 + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
1473 + "addq %%rbx, %%r8 ;"
1474 + "movq %%r8, 24(%0) ;"
1475 + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1476 + "adcq %%r10, %%r9 ;"
1477 + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
1478 + "adcq %%r8, %%r11 ;"
1479 + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
1480 + "adcq %%r10, %%r13 ;"
1481 + /******************************************/
1482 + "adcq $0, %%rbx ;"
1483 +
1484 + "addq %%r9, %%rcx ;"
1485 + "movq %%rcx, 32(%0) ;"
1486 + "adcq %%r11, %%r15 ;"
1487 + "movq %%r15, 40(%0) ;"
1488 + "adcq %%r13, %%rax ;"
1489 + "movq %%rax, 48(%0) ;"
1490 + "adcq $0, %%rbx ;"
1491 + "movq %%rbx, 56(%0) ;"
1492 + :
1493 + : "r"(c), "r"(a), "r"(b)
1494 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1495 + "%r10", "%r11", "%r13", "%r15");
1496 +}
1497 +
1498 +static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
1499 +{
1500 + asm volatile(
1501 + "movq (%1), %%rdx ;" /* A[0] */
1502 + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
1503 + "xorl %%r15d, %%r15d;"
1504 + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
1505 + "adcx %%r14, %%r9 ;"
1506 + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
1507 + "adcx %%rax, %%r10 ;"
1508 + "movq 24(%1), %%rdx ;" /* A[3] */
1509 + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
1510 + "adcx %%rcx, %%r11 ;"
1511 + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
1512 + "adcx %%rax, %%rbx ;"
1513 + "movq 8(%1), %%rdx ;" /* A[1] */
1514 + "adcx %%r15, %%r13 ;"
1515 + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
1516 + "movq $0, %%r14 ;"
1517 + /******************************************/
1518 + "adcx %%r15, %%r14 ;"
1519 +
1520 + "xorl %%r15d, %%r15d;"
1521 + "adox %%rax, %%r10 ;"
1522 + "adcx %%r8, %%r8 ;"
1523 + "adox %%rcx, %%r11 ;"
1524 + "adcx %%r9, %%r9 ;"
1525 + "adox %%r15, %%rbx ;"
1526 + "adcx %%r10, %%r10 ;"
1527 + "adox %%r15, %%r13 ;"
1528 + "adcx %%r11, %%r11 ;"
1529 + "adox %%r15, %%r14 ;"
1530 + "adcx %%rbx, %%rbx ;"
1531 + "adcx %%r13, %%r13 ;"
1532 + "adcx %%r14, %%r14 ;"
1533 +
1534 + "movq (%1), %%rdx ;"
1535 + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1536 + /*******************/
1537 + "movq %%rax, 0(%0) ;"
1538 + "addq %%rcx, %%r8 ;"
1539 + "movq %%r8, 8(%0) ;"
1540 + "movq 8(%1), %%rdx ;"
1541 + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1542 + "adcq %%rax, %%r9 ;"
1543 + "movq %%r9, 16(%0) ;"
1544 + "adcq %%rcx, %%r10 ;"
1545 + "movq %%r10, 24(%0) ;"
1546 + "movq 16(%1), %%rdx ;"
1547 + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1548 + "adcq %%rax, %%r11 ;"
1549 + "movq %%r11, 32(%0) ;"
1550 + "adcq %%rcx, %%rbx ;"
1551 + "movq %%rbx, 40(%0) ;"
1552 + "movq 24(%1), %%rdx ;"
1553 + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1554 + "adcq %%rax, %%r13 ;"
1555 + "movq %%r13, 48(%0) ;"
1556 + "adcq %%rcx, %%r14 ;"
1557 + "movq %%r14, 56(%0) ;"
1558 + :
1559 + : "r"(c), "r"(a)
1560 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1561 + "%r10", "%r11", "%r13", "%r14", "%r15");
1562 +}
1563 +
1564 +static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1565 +{
1566 + asm volatile(
1567 + "movq 8(%1), %%rdx ;" /* A[1] */
1568 + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
1569 + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1570 + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1571 +
1572 + "movq 16(%1), %%rdx ;" /* A[2] */
1573 + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1574 + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1575 +
1576 + "addq %%rax, %%r9 ;"
1577 + "adcq %%rdx, %%r10 ;"
1578 + "adcq %%rcx, %%r11 ;"
1579 + "adcq %%r14, %%r15 ;"
1580 + "adcq $0, %%r13 ;"
1581 + "movq $0, %%r14 ;"
1582 + "adcq $0, %%r14 ;"
1583 +
1584 + "movq (%1), %%rdx ;" /* A[0] */
1585 + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1586 +
1587 + "addq %%rax, %%r10 ;"
1588 + "adcq %%rcx, %%r11 ;"
1589 + "adcq $0, %%r15 ;"
1590 + "adcq $0, %%r13 ;"
1591 + "adcq $0, %%r14 ;"
1592 +
1593 + "shldq $1, %%r13, %%r14 ;"
1594 + "shldq $1, %%r15, %%r13 ;"
1595 + "shldq $1, %%r11, %%r15 ;"
1596 + "shldq $1, %%r10, %%r11 ;"
1597 + "shldq $1, %%r9, %%r10 ;"
1598 + "shldq $1, %%r8, %%r9 ;"
1599 + "shlq $1, %%r8 ;"
1600 +
1601 + /*******************/
1602 + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1603 + /*******************/
1604 + "movq %%rax, 0(%0) ;"
1605 + "addq %%rcx, %%r8 ;"
1606 + "movq %%r8, 8(%0) ;"
1607 + "movq 8(%1), %%rdx ;"
1608 + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1609 + "adcq %%rax, %%r9 ;"
1610 + "movq %%r9, 16(%0) ;"
1611 + "adcq %%rcx, %%r10 ;"
1612 + "movq %%r10, 24(%0) ;"
1613 + "movq 16(%1), %%rdx ;"
1614 + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1615 + "adcq %%rax, %%r11 ;"
1616 + "movq %%r11, 32(%0) ;"
1617 + "adcq %%rcx, %%r15 ;"
1618 + "movq %%r15, 40(%0) ;"
1619 + "movq 24(%1), %%rdx ;"
1620 + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1621 + "adcq %%rax, %%r13 ;"
1622 + "movq %%r13, 48(%0) ;"
1623 + "adcq %%rcx, %%r14 ;"
1624 + "movq %%r14, 56(%0) ;"
1625 + :
1626 + : "r"(c), "r"(a)
1627 + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1628 + "%r11", "%r13", "%r14", "%r15");
1629 +}
1630 +
1631 +static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1632 +{
1633 + asm volatile(
1634 + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
1635 + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1636 + "xorl %%ebx, %%ebx ;"
1637 + "adox (%1), %%r8 ;"
1638 + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1639 + "adcx %%r10, %%r9 ;"
1640 + "adox 8(%1), %%r9 ;"
1641 + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1642 + "adcx %%r11, %%r10 ;"
1643 + "adox 16(%1), %%r10 ;"
1644 + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1645 + "adcx %%rax, %%r11 ;"
1646 + "adox 24(%1), %%r11 ;"
1647 + /***************************************/
1648 + "adcx %%rbx, %%rcx ;"
1649 + "adox %%rbx, %%rcx ;"
1650 + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1651 + "adcx %%rcx, %%r8 ;"
1652 + "adcx %%rbx, %%r9 ;"
1653 + "movq %%r9, 8(%0) ;"
1654 + "adcx %%rbx, %%r10 ;"
1655 + "movq %%r10, 16(%0) ;"
1656 + "adcx %%rbx, %%r11 ;"
1657 + "movq %%r11, 24(%0) ;"
1658 + "mov $0, %%ecx ;"
1659 + "cmovc %%edx, %%ecx ;"
1660 + "addq %%rcx, %%r8 ;"
1661 + "movq %%r8, (%0) ;"
1662 + :
1663 + : "r"(c), "r"(a)
1664 + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1665 + "%r10", "%r11");
1666 +}
1667 +
1668 +static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1669 +{
1670 + asm volatile(
1671 + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
1672 + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1673 + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1674 + "addq %%r10, %%r9 ;"
1675 + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1676 + "adcq %%r11, %%r10 ;"
1677 + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1678 + "adcq %%rax, %%r11 ;"
1679 + /***************************************/
1680 + "adcq $0, %%rcx ;"
1681 + "addq (%1), %%r8 ;"
1682 + "adcq 8(%1), %%r9 ;"
1683 + "adcq 16(%1), %%r10 ;"
1684 + "adcq 24(%1), %%r11 ;"
1685 + "adcq $0, %%rcx ;"
1686 + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1687 + "addq %%rcx, %%r8 ;"
1688 + "adcq $0, %%r9 ;"
1689 + "movq %%r9, 8(%0) ;"
1690 + "adcq $0, %%r10 ;"
1691 + "movq %%r10, 16(%0) ;"
1692 + "adcq $0, %%r11 ;"
1693 + "movq %%r11, 24(%0) ;"
1694 + "mov $0, %%ecx ;"
1695 + "cmovc %%edx, %%ecx ;"
1696 + "addq %%rcx, %%r8 ;"
1697 + "movq %%r8, (%0) ;"
1698 + :
1699 + : "r"(c), "r"(a)
1700 + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1701 + "%r11");
1702 +}
1703 +
1704 +static __always_inline void
1705 +add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
1706 +{
1707 + asm volatile(
1708 + "mov $38, %%eax ;"
1709 + "xorl %%ecx, %%ecx ;"
1710 + "movq (%2), %%r8 ;"
1711 + "adcx (%1), %%r8 ;"
1712 + "movq 8(%2), %%r9 ;"
1713 + "adcx 8(%1), %%r9 ;"
1714 + "movq 16(%2), %%r10 ;"
1715 + "adcx 16(%1), %%r10 ;"
1716 + "movq 24(%2), %%r11 ;"
1717 + "adcx 24(%1), %%r11 ;"
1718 + "cmovc %%eax, %%ecx ;"
1719 + "xorl %%eax, %%eax ;"
1720 + "adcx %%rcx, %%r8 ;"
1721 + "adcx %%rax, %%r9 ;"
1722 + "movq %%r9, 8(%0) ;"
1723 + "adcx %%rax, %%r10 ;"
1724 + "movq %%r10, 16(%0) ;"
1725 + "adcx %%rax, %%r11 ;"
1726 + "movq %%r11, 24(%0) ;"
1727 + "mov $38, %%ecx ;"
1728 + "cmovc %%ecx, %%eax ;"
1729 + "addq %%rax, %%r8 ;"
1730 + "movq %%r8, (%0) ;"
1731 + :
1732 + : "r"(c), "r"(a), "r"(b)
1733 + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1734 +}
1735 +
1736 +static __always_inline void
1737 +add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
1738 +{
1739 + asm volatile(
1740 + "mov $38, %%eax ;"
1741 + "movq (%2), %%r8 ;"
1742 + "addq (%1), %%r8 ;"
1743 + "movq 8(%2), %%r9 ;"
1744 + "adcq 8(%1), %%r9 ;"
1745 + "movq 16(%2), %%r10 ;"
1746 + "adcq 16(%1), %%r10 ;"
1747 + "movq 24(%2), %%r11 ;"
1748 + "adcq 24(%1), %%r11 ;"
1749 + "mov $0, %%ecx ;"
1750 + "cmovc %%eax, %%ecx ;"
1751 + "addq %%rcx, %%r8 ;"
1752 + "adcq $0, %%r9 ;"
1753 + "movq %%r9, 8(%0) ;"
1754 + "adcq $0, %%r10 ;"
1755 + "movq %%r10, 16(%0) ;"
1756 + "adcq $0, %%r11 ;"
1757 + "movq %%r11, 24(%0) ;"
1758 + "mov $0, %%ecx ;"
1759 + "cmovc %%eax, %%ecx ;"
1760 + "addq %%rcx, %%r8 ;"
1761 + "movq %%r8, (%0) ;"
1762 + :
1763 + : "r"(c), "r"(a), "r"(b)
1764 + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1765 +}
1766 +
1767 +static __always_inline void
1768 +sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
1769 +{
1770 + asm volatile(
1771 + "mov $38, %%eax ;"
1772 + "movq (%1), %%r8 ;"
1773 + "subq (%2), %%r8 ;"
1774 + "movq 8(%1), %%r9 ;"
1775 + "sbbq 8(%2), %%r9 ;"
1776 + "movq 16(%1), %%r10 ;"
1777 + "sbbq 16(%2), %%r10 ;"
1778 + "movq 24(%1), %%r11 ;"
1779 + "sbbq 24(%2), %%r11 ;"
1780 + "mov $0, %%ecx ;"
1781 + "cmovc %%eax, %%ecx ;"
1782 + "subq %%rcx, %%r8 ;"
1783 + "sbbq $0, %%r9 ;"
1784 + "movq %%r9, 8(%0) ;"
1785 + "sbbq $0, %%r10 ;"
1786 + "movq %%r10, 16(%0) ;"
1787 + "sbbq $0, %%r11 ;"
1788 + "movq %%r11, 24(%0) ;"
1789 + "mov $0, %%ecx ;"
1790 + "cmovc %%eax, %%ecx ;"
1791 + "subq %%rcx, %%r8 ;"
1792 + "movq %%r8, (%0) ;"
1793 + :
1794 + : "r"(c), "r"(a), "r"(b)
1795 + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1796 +}
1797 +
1798 +/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
1799 +static __always_inline void
1800 +mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
1801 +{
1802 + const u64 a24 = 121666;
1803 + asm volatile(
1804 + "movq %2, %%rdx ;"
1805 + "mulx (%1), %%r8, %%r10 ;"
1806 + "mulx 8(%1), %%r9, %%r11 ;"
1807 + "addq %%r10, %%r9 ;"
1808 + "mulx 16(%1), %%r10, %%rax ;"
1809 + "adcq %%r11, %%r10 ;"
1810 + "mulx 24(%1), %%r11, %%rcx ;"
1811 + "adcq %%rax, %%r11 ;"
1812 + /**************************/
1813 + "adcq $0, %%rcx ;"
1814 + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
1815 + "imul %%rdx, %%rcx ;"
1816 + "addq %%rcx, %%r8 ;"
1817 + "adcq $0, %%r9 ;"
1818 + "movq %%r9, 8(%0) ;"
1819 + "adcq $0, %%r10 ;"
1820 + "movq %%r10, 16(%0) ;"
1821 + "adcq $0, %%r11 ;"
1822 + "movq %%r11, 24(%0) ;"
1823 + "mov $0, %%ecx ;"
1824 + "cmovc %%edx, %%ecx ;"
1825 + "addq %%rcx, %%r8 ;"
1826 + "movq %%r8, (%0) ;"
1827 + :
1828 + : "r"(c), "r"(a), "r"(a24)
1829 + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1830 + "%r11");
1831 +}
1832 +
1833 +static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1834 +{
1835 + struct {
1836 + eltfp25519_1w_buffer buffer;
1837 + eltfp25519_1w x0, x1, x2;
1838 + } __aligned(32) m;
1839 + u64 *T[4];
1840 +
1841 + T[0] = m.x0;
1842 + T[1] = c; /* x^(-1) */
1843 + T[2] = m.x1;
1844 + T[3] = m.x2;
1845 +
1846 + copy_eltfp25519_1w(T[1], a);
1847 + sqrn_eltfp25519_1w_adx(T[1], 1);
1848 + copy_eltfp25519_1w(T[2], T[1]);
1849 + sqrn_eltfp25519_1w_adx(T[2], 2);
1850 + mul_eltfp25519_1w_adx(T[0], a, T[2]);
1851 + mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
1852 + copy_eltfp25519_1w(T[2], T[1]);
1853 + sqrn_eltfp25519_1w_adx(T[2], 1);
1854 + mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1855 + copy_eltfp25519_1w(T[2], T[0]);
1856 + sqrn_eltfp25519_1w_adx(T[2], 5);
1857 + mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1858 + copy_eltfp25519_1w(T[2], T[0]);
1859 + sqrn_eltfp25519_1w_adx(T[2], 10);
1860 + mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1861 + copy_eltfp25519_1w(T[3], T[2]);
1862 + sqrn_eltfp25519_1w_adx(T[3], 20);
1863 + mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
1864 + sqrn_eltfp25519_1w_adx(T[3], 10);
1865 + mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
1866 + copy_eltfp25519_1w(T[0], T[3]);
1867 + sqrn_eltfp25519_1w_adx(T[0], 50);
1868 + mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
1869 + copy_eltfp25519_1w(T[2], T[0]);
1870 + sqrn_eltfp25519_1w_adx(T[2], 100);
1871 + mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1872 + sqrn_eltfp25519_1w_adx(T[2], 50);
1873 + mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
1874 + sqrn_eltfp25519_1w_adx(T[2], 5);
1875 + mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
1876 +
1877 + memzero_explicit(&m, sizeof(m));
1878 +}
1879 +
1880 +static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1881 +{
1882 + struct {
1883 + eltfp25519_1w_buffer buffer;
1884 + eltfp25519_1w x0, x1, x2;
1885 + } __aligned(32) m;
1886 + u64 *T[5];
1887 +
1888 + T[0] = m.x0;
1889 + T[1] = c; /* x^(-1) */
1890 + T[2] = m.x1;
1891 + T[3] = m.x2;
1892 +
1893 + copy_eltfp25519_1w(T[1], a);
1894 + sqrn_eltfp25519_1w_bmi2(T[1], 1);
1895 + copy_eltfp25519_1w(T[2], T[1]);
1896 + sqrn_eltfp25519_1w_bmi2(T[2], 2);
1897 + mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
1898 + mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
1899 + copy_eltfp25519_1w(T[2], T[1]);
1900 + sqrn_eltfp25519_1w_bmi2(T[2], 1);
1901 + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1902 + copy_eltfp25519_1w(T[2], T[0]);
1903 + sqrn_eltfp25519_1w_bmi2(T[2], 5);
1904 + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1905 + copy_eltfp25519_1w(T[2], T[0]);
1906 + sqrn_eltfp25519_1w_bmi2(T[2], 10);
1907 + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1908 + copy_eltfp25519_1w(T[3], T[2]);
1909 + sqrn_eltfp25519_1w_bmi2(T[3], 20);
1910 + mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
1911 + sqrn_eltfp25519_1w_bmi2(T[3], 10);
1912 + mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
1913 + copy_eltfp25519_1w(T[0], T[3]);
1914 + sqrn_eltfp25519_1w_bmi2(T[0], 50);
1915 + mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
1916 + copy_eltfp25519_1w(T[2], T[0]);
1917 + sqrn_eltfp25519_1w_bmi2(T[2], 100);
1918 + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1919 + sqrn_eltfp25519_1w_bmi2(T[2], 50);
1920 + mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
1921 + sqrn_eltfp25519_1w_bmi2(T[2], 5);
1922 + mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
1923 +
1924 + memzero_explicit(&m, sizeof(m));
1925 +}
1926 +
1927 +/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
1928 + * with a number such that 0 <= C < 2**255-19.
1929 + */
1930 +static __always_inline void fred_eltfp25519_1w(u64 *const c)
1931 +{
1932 + u64 tmp0 = 38, tmp1 = 19;
1933 + asm volatile(
1934 + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
1935 + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */
1936 +
1937 + /* Add either 19 or 38 to c */
1938 + "addq %4, %0 ;"
1939 + "adcq $0, %1 ;"
1940 + "adcq $0, %2 ;"
1941 + "adcq $0, %3 ;"
1942 +
1943 + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
1944 + "movl $0, %k4 ;"
1945 + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
1946 + "btrq $63, %3 ;" /* Clear bit 255 */
1947 +
1948 + /* Subtract 19 if necessary */
1949 + "subq %4, %0 ;"
1950 + "sbbq $0, %1 ;"
1951 + "sbbq $0, %2 ;"
1952 + "sbbq $0, %3 ;"
1953 +
1954 + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
1955 + "+r"(tmp1)
1956 + :
1957 + : "memory", "cc");
1958 +}
1959 +
1960 +static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
1961 +{
1962 + u64 temp;
1963 + asm volatile(
1964 + "test %9, %9 ;"
1965 + "movq %0, %8 ;"
1966 + "cmovnzq %4, %0 ;"
1967 + "cmovnzq %8, %4 ;"
1968 + "movq %1, %8 ;"
1969 + "cmovnzq %5, %1 ;"
1970 + "cmovnzq %8, %5 ;"
1971 + "movq %2, %8 ;"
1972 + "cmovnzq %6, %2 ;"
1973 + "cmovnzq %8, %6 ;"
1974 + "movq %3, %8 ;"
1975 + "cmovnzq %7, %3 ;"
1976 + "cmovnzq %8, %7 ;"
1977 + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
1978 + "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
1979 + "=r"(temp)
1980 + : "r"(bit)
1981 + : "cc"
1982 + );
1983 +}
1984 +
1985 +static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
1986 +{
1987 + asm volatile(
1988 + "test %4, %4 ;"
1989 + "cmovnzq %5, %0 ;"
1990 + "cmovnzq %6, %1 ;"
1991 + "cmovnzq %7, %2 ;"
1992 + "cmovnzq %8, %3 ;"
1993 + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
1994 + : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
1995 + : "cc"
1996 + );
1997 +}
1998 +
1999 +static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
2000 + const u8 private_key[CURVE25519_KEY_SIZE],
2001 + const u8 session_key[CURVE25519_KEY_SIZE])
2002 +{
2003 + struct {
2004 + u64 buffer[4 * NUM_WORDS_ELTFP25519];
2005 + u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2006 + u64 workspace[6 * NUM_WORDS_ELTFP25519];
2007 + u8 session[CURVE25519_KEY_SIZE];
2008 + u8 private[CURVE25519_KEY_SIZE];
2009 + } __aligned(32) m;
2010 +
2011 + int i = 0, j = 0;
2012 + u64 prev = 0;
2013 + u64 *const X1 = (u64 *)m.session;
2014 + u64 *const key = (u64 *)m.private;
2015 + u64 *const Px = m.coordinates + 0;
2016 + u64 *const Pz = m.coordinates + 4;
2017 + u64 *const Qx = m.coordinates + 8;
2018 + u64 *const Qz = m.coordinates + 12;
2019 + u64 *const X2 = Qx;
2020 + u64 *const Z2 = Qz;
2021 + u64 *const X3 = Px;
2022 + u64 *const Z3 = Pz;
2023 + u64 *const X2Z2 = Qx;
2024 + u64 *const X3Z3 = Px;
2025 +
2026 + u64 *const A = m.workspace + 0;
2027 + u64 *const B = m.workspace + 4;
2028 + u64 *const D = m.workspace + 8;
2029 + u64 *const C = m.workspace + 12;
2030 + u64 *const DA = m.workspace + 16;
2031 + u64 *const CB = m.workspace + 20;
2032 + u64 *const AB = A;
2033 + u64 *const DC = D;
2034 + u64 *const DACB = DA;
2035 +
2036 + memcpy(m.private, private_key, sizeof(m.private));
2037 + memcpy(m.session, session_key, sizeof(m.session));
2038 +
2039 + curve25519_clamp_secret(m.private);
2040 +
2041 + /* As in the draft:
2042 + * When receiving such an array, implementations of curve25519
2043 + * MUST mask the most-significant bit in the final byte. This
2044 + * is done to preserve compatibility with point formats which
2045 + * reserve the sign bit for use in other protocols and to
2046 + * increase resistance to implementation fingerprinting
2047 + */
2048 + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2049 +
2050 + copy_eltfp25519_1w(Px, X1);
2051 + setzero_eltfp25519_1w(Pz);
2052 + setzero_eltfp25519_1w(Qx);
2053 + setzero_eltfp25519_1w(Qz);
2054 +
2055 + Pz[0] = 1;
2056 + Qx[0] = 1;
2057 +
2058 + /* main-loop */
2059 + prev = 0;
2060 + j = 62;
2061 + for (i = 3; i >= 0; --i) {
2062 + while (j >= 0) {
2063 + u64 bit = (key[i] >> j) & 0x1;
2064 + u64 swap = bit ^ prev;
2065 + prev = bit;
2066 +
2067 + add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
2068 + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
2069 + add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
2070 + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
2071 + mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
2072 +
2073 + cselect(swap, A, C);
2074 + cselect(swap, B, D);
2075 +
2076 + sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
2077 + add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
2078 + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
2079 + sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2080 +
2081 + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
2082 + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
2083 +
2084 + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
2085 + add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
2086 + mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2087 + mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
2088 + --j;
2089 + }
2090 + j = 63;
2091 + }
2092 +
2093 + inv_eltfp25519_1w_adx(A, Qz);
2094 + mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
2095 + fred_eltfp25519_1w((u64 *)shared);
2096 +
2097 + memzero_explicit(&m, sizeof(m));
2098 +}
2099 +
2100 +static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
2101 + const u8 private_key[CURVE25519_KEY_SIZE])
2102 +{
2103 + struct {
2104 + u64 buffer[4 * NUM_WORDS_ELTFP25519];
2105 + u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2106 + u64 workspace[4 * NUM_WORDS_ELTFP25519];
2107 + u8 private[CURVE25519_KEY_SIZE];
2108 + } __aligned(32) m;
2109 +
2110 + const int ite[4] = { 64, 64, 64, 63 };
2111 + const int q = 3;
2112 + u64 swap = 1;
2113 +
2114 + int i = 0, j = 0, k = 0;
2115 + u64 *const key = (u64 *)m.private;
2116 + u64 *const Ur1 = m.coordinates + 0;
2117 + u64 *const Zr1 = m.coordinates + 4;
2118 + u64 *const Ur2 = m.coordinates + 8;
2119 + u64 *const Zr2 = m.coordinates + 12;
2120 +
2121 + u64 *const UZr1 = m.coordinates + 0;
2122 + u64 *const ZUr2 = m.coordinates + 8;
2123 +
2124 + u64 *const A = m.workspace + 0;
2125 + u64 *const B = m.workspace + 4;
2126 + u64 *const C = m.workspace + 8;
2127 + u64 *const D = m.workspace + 12;
2128 +
2129 + u64 *const AB = m.workspace + 0;
2130 + u64 *const CD = m.workspace + 8;
2131 +
2132 + const u64 *const P = table_ladder_8k;
2133 +
2134 + memcpy(m.private, private_key, sizeof(m.private));
2135 +
2136 + curve25519_clamp_secret(m.private);
2137 +
2138 + setzero_eltfp25519_1w(Ur1);
2139 + setzero_eltfp25519_1w(Zr1);
2140 + setzero_eltfp25519_1w(Zr2);
2141 + Ur1[0] = 1;
2142 + Zr1[0] = 1;
2143 + Zr2[0] = 1;
2144 +
2145 + /* G-S */
2146 + Ur2[3] = 0x1eaecdeee27cab34UL;
2147 + Ur2[2] = 0xadc7a0b9235d48e2UL;
2148 + Ur2[1] = 0xbbf095ae14b2edf8UL;
2149 + Ur2[0] = 0x7e94e1fec82faabdUL;
2150 +
2151 + /* main-loop */
2152 + j = q;
2153 + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2154 + while (j < ite[i]) {
2155 + u64 bit = (key[i] >> j) & 0x1;
2156 + k = (64 * i + j - q);
2157 + swap = swap ^ bit;
2158 + cswap(swap, Ur1, Ur2);
2159 + cswap(swap, Zr1, Zr2);
2160 + swap = bit;
2161 + /* Addition */
2162 + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2163 + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2164 + mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
2165 + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2166 + add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2167 + sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
2168 + mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
2169 + ++j;
2170 + }
2171 + j = 0;
2172 + }
2173 +
2174 + /* Doubling */
2175 + for (i = 0; i < q; ++i) {
2176 + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2177 + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2178 + sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */
2179 + copy_eltfp25519_1w(C, B); /* C = B */
2180 + sub_eltfp25519_1w(B, A, B); /* B = A-B */
2181 + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
2182 + add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
2183 + mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
2184 + }
2185 +
2186 + /* Convert to affine coordinates */
2187 + inv_eltfp25519_1w_adx(A, Zr1);
2188 + mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
2189 + fred_eltfp25519_1w((u64 *)session_key);
2190 +
2191 + memzero_explicit(&m, sizeof(m));
2192 +}
2193 +
2194 +static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
2195 + const u8 private_key[CURVE25519_KEY_SIZE],
2196 + const u8 session_key[CURVE25519_KEY_SIZE])
2197 +{
2198 + struct {
2199 + u64 buffer[4 * NUM_WORDS_ELTFP25519];
2200 + u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2201 + u64 workspace[6 * NUM_WORDS_ELTFP25519];
2202 + u8 session[CURVE25519_KEY_SIZE];
2203 + u8 private[CURVE25519_KEY_SIZE];
2204 + } __aligned(32) m;
2205 +
2206 + int i = 0, j = 0;
2207 + u64 prev = 0;
2208 + u64 *const X1 = (u64 *)m.session;
2209 + u64 *const key = (u64 *)m.private;
2210 + u64 *const Px = m.coordinates + 0;
2211 + u64 *const Pz = m.coordinates + 4;
2212 + u64 *const Qx = m.coordinates + 8;
2213 + u64 *const Qz = m.coordinates + 12;
2214 + u64 *const X2 = Qx;
2215 + u64 *const Z2 = Qz;
2216 + u64 *const X3 = Px;
2217 + u64 *const Z3 = Pz;
2218 + u64 *const X2Z2 = Qx;
2219 + u64 *const X3Z3 = Px;
2220 +
2221 + u64 *const A = m.workspace + 0;
2222 + u64 *const B = m.workspace + 4;
2223 + u64 *const D = m.workspace + 8;
2224 + u64 *const C = m.workspace + 12;
2225 + u64 *const DA = m.workspace + 16;
2226 + u64 *const CB = m.workspace + 20;
2227 + u64 *const AB = A;
2228 + u64 *const DC = D;
2229 + u64 *const DACB = DA;
2230 +
2231 + memcpy(m.private, private_key, sizeof(m.private));
2232 + memcpy(m.session, session_key, sizeof(m.session));
2233 +
2234 + curve25519_clamp_secret(m.private);
2235 +
2236 + /* As in the draft:
2237 + * When receiving such an array, implementations of curve25519
2238 + * MUST mask the most-significant bit in the final byte. This
2239 + * is done to preserve compatibility with point formats which
2240 + * reserve the sign bit for use in other protocols and to
2241 + * increase resistance to implementation fingerprinting
2242 + */
2243 + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2244 +
2245 + copy_eltfp25519_1w(Px, X1);
2246 + setzero_eltfp25519_1w(Pz);
2247 + setzero_eltfp25519_1w(Qx);
2248 + setzero_eltfp25519_1w(Qz);
2249 +
2250 + Pz[0] = 1;
2251 + Qx[0] = 1;
2252 +
2253 + /* main-loop */
2254 + prev = 0;
2255 + j = 62;
2256 + for (i = 3; i >= 0; --i) {
2257 + while (j >= 0) {
2258 + u64 bit = (key[i] >> j) & 0x1;
2259 + u64 swap = bit ^ prev;
2260 + prev = bit;
2261 +
2262 + add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
2263 + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
2264 + add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
2265 + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
2266 + mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
2267 +
2268 + cselect(swap, A, C);
2269 + cselect(swap, B, D);
2270 +
2271 + sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
2272 + add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
2273 + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
2274 + sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2275 +
2276 + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
2277 + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
2278 +
2279 + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
2280 + add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
2281 + mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2282 + mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
2283 + --j;
2284 + }
2285 + j = 63;
2286 + }
2287 +
2288 + inv_eltfp25519_1w_bmi2(A, Qz);
2289 + mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
2290 + fred_eltfp25519_1w((u64 *)shared);
2291 +
2292 + memzero_explicit(&m, sizeof(m));
2293 +}
2294 +
2295 +static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
2296 + const u8 private_key[CURVE25519_KEY_SIZE])
2297 +{
2298 + struct {
2299 + u64 buffer[4 * NUM_WORDS_ELTFP25519];
2300 + u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2301 + u64 workspace[4 * NUM_WORDS_ELTFP25519];
2302 + u8 private[CURVE25519_KEY_SIZE];
2303 + } __aligned(32) m;
2304 +
2305 + const int ite[4] = { 64, 64, 64, 63 };
2306 + const int q = 3;
2307 + u64 swap = 1;
2308 +
2309 + int i = 0, j = 0, k = 0;
2310 + u64 *const key = (u64 *)m.private;
2311 + u64 *const Ur1 = m.coordinates + 0;
2312 + u64 *const Zr1 = m.coordinates + 4;
2313 + u64 *const Ur2 = m.coordinates + 8;
2314 + u64 *const Zr2 = m.coordinates + 12;
2315 +
2316 + u64 *const UZr1 = m.coordinates + 0;
2317 + u64 *const ZUr2 = m.coordinates + 8;
2318 +
2319 + u64 *const A = m.workspace + 0;
2320 + u64 *const B = m.workspace + 4;
2321 + u64 *const C = m.workspace + 8;
2322 + u64 *const D = m.workspace + 12;
2323 +
2324 + u64 *const AB = m.workspace + 0;
2325 + u64 *const CD = m.workspace + 8;
2326 +
2327 + const u64 *const P = table_ladder_8k;
2328 +
2329 + memcpy(m.private, private_key, sizeof(m.private));
2330 +
2331 + curve25519_clamp_secret(m.private);
2332 +
2333 + setzero_eltfp25519_1w(Ur1);
2334 + setzero_eltfp25519_1w(Zr1);
2335 + setzero_eltfp25519_1w(Zr2);
2336 + Ur1[0] = 1;
2337 + Zr1[0] = 1;
2338 + Zr2[0] = 1;
2339 +
2340 + /* G-S */
2341 + Ur2[3] = 0x1eaecdeee27cab34UL;
2342 + Ur2[2] = 0xadc7a0b9235d48e2UL;
2343 + Ur2[1] = 0xbbf095ae14b2edf8UL;
2344 + Ur2[0] = 0x7e94e1fec82faabdUL;
2345 +
2346 + /* main-loop */
2347 + j = q;
2348 + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2349 + while (j < ite[i]) {
2350 + u64 bit = (key[i] >> j) & 0x1;
2351 + k = (64 * i + j - q);
2352 + swap = swap ^ bit;
2353 + cswap(swap, Ur1, Ur2);
2354 + cswap(swap, Zr1, Zr2);
2355 + swap = bit;
2356 + /* Addition */
2357 + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2358 + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2359 + mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
2360 + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2361 + add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2362 + sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
2363 + mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
2364 + ++j;
2365 + }
2366 + j = 0;
2367 + }
2368 +
2369 + /* Doubling */
2370 + for (i = 0; i < q; ++i) {
2371 + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2372 + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2373 + sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */
2374 + copy_eltfp25519_1w(C, B); /* C = B */
2375 + sub_eltfp25519_1w(B, A, B); /* B = A-B */
2376 + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
2377 + add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
2378 + mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
2379 + }
2380 +
2381 + /* Convert to affine coordinates */
2382 + inv_eltfp25519_1w_bmi2(A, Zr1);
2383 + mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
2384 + fred_eltfp25519_1w((u64 *)session_key);
2385 +
2386 + memzero_explicit(&m, sizeof(m));
2387 +}
2388 +
2389 +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
2390 + const u8 secret[CURVE25519_KEY_SIZE],
2391 + const u8 basepoint[CURVE25519_KEY_SIZE])
2392 +{
2393 + if (static_branch_likely(&curve25519_use_adx))
2394 + curve25519_adx(mypublic, secret, basepoint);
2395 + else if (static_branch_likely(&curve25519_use_bmi2))
2396 + curve25519_bmi2(mypublic, secret, basepoint);
2397 + else
2398 + curve25519_generic(mypublic, secret, basepoint);
2399 +}
2400 +EXPORT_SYMBOL(curve25519_arch);
2401 +
2402 +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
2403 + const u8 secret[CURVE25519_KEY_SIZE])
2404 +{
2405 + if (static_branch_likely(&curve25519_use_adx))
2406 + curve25519_adx_base(pub, secret);
2407 + else if (static_branch_likely(&curve25519_use_bmi2))
2408 + curve25519_bmi2_base(pub, secret);
2409 + else
2410 + curve25519_generic(pub, secret, curve25519_base_point);
2411 +}
2412 +EXPORT_SYMBOL(curve25519_base_arch);
2413 +
2414 +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
2415 + unsigned int len)
2416 +{
2417 + u8 *secret = kpp_tfm_ctx(tfm);
2418 +
2419 + if (!len)
2420 + curve25519_generate_secret(secret);
2421 + else if (len == CURVE25519_KEY_SIZE &&
2422 + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
2423 + memcpy(secret, buf, CURVE25519_KEY_SIZE);
2424 + else
2425 + return -EINVAL;
2426 + return 0;
2427 +}
2428 +
2429 +static int curve25519_generate_public_key(struct kpp_request *req)
2430 +{
2431 + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2432 + const u8 *secret = kpp_tfm_ctx(tfm);
2433 + u8 buf[CURVE25519_KEY_SIZE];
2434 + int copied, nbytes;
2435 +
2436 + if (req->src)
2437 + return -EINVAL;
2438 +
2439 + curve25519_base_arch(buf, secret);
2440 +
2441 + /* might want less than we've got */
2442 + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2443 + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2444 + nbytes),
2445 + buf, nbytes);
2446 + if (copied != nbytes)
2447 + return -EINVAL;
2448 + return 0;
2449 +}
2450 +
2451 +static int curve25519_compute_shared_secret(struct kpp_request *req)
2452 +{
2453 + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2454 + const u8 *secret = kpp_tfm_ctx(tfm);
2455 + u8 public_key[CURVE25519_KEY_SIZE];
2456 + u8 buf[CURVE25519_KEY_SIZE];
2457 + int copied, nbytes;
2458 +
2459 + if (!req->src)
2460 + return -EINVAL;
2461 +
2462 + copied = sg_copy_to_buffer(req->src,
2463 + sg_nents_for_len(req->src,
2464 + CURVE25519_KEY_SIZE),
2465 + public_key, CURVE25519_KEY_SIZE);
2466 + if (copied != CURVE25519_KEY_SIZE)
2467 + return -EINVAL;
2468 +
2469 + curve25519_arch(buf, secret, public_key);
2470 +
2471 + /* might want less than we've got */
2472 + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2473 + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2474 + nbytes),
2475 + buf, nbytes);
2476 + if (copied != nbytes)
2477 + return -EINVAL;
2478 + return 0;
2479 +}
2480 +
2481 +static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
2482 +{
2483 + return CURVE25519_KEY_SIZE;
2484 +}
2485 +
2486 +static struct kpp_alg curve25519_alg = {
2487 + .base.cra_name = "curve25519",
2488 + .base.cra_driver_name = "curve25519-x86",
2489 + .base.cra_priority = 200,
2490 + .base.cra_module = THIS_MODULE,
2491 + .base.cra_ctxsize = CURVE25519_KEY_SIZE,
2492 +
2493 + .set_secret = curve25519_set_secret,
2494 + .generate_public_key = curve25519_generate_public_key,
2495 + .compute_shared_secret = curve25519_compute_shared_secret,
2496 + .max_size = curve25519_max_size,
2497 +};
2498 +
2499 +static int __init curve25519_mod_init(void)
2500 +{
2501 +	/* The ADX routines also use BMI2's mulx, and every ADX CPU has
2502 +	 * BMI2, so prefer ADX (as curve25519_arch does) and require both.
2503 +	 */
2504 +	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
2505 +		static_branch_enable(&curve25519_use_adx);
2506 +	else if (boot_cpu_has(X86_FEATURE_BMI2))
2507 +		static_branch_enable(&curve25519_use_bmi2);
2508 +	else
2509 +		return 0;
2510 +	return crypto_register_kpp(&curve25519_alg);
2511 +}
2509 +
2510 +static void __exit curve25519_mod_exit(void)
2511 +{
2512 + if (boot_cpu_has(X86_FEATURE_BMI2) ||
2513 + boot_cpu_has(X86_FEATURE_ADX))
2514 + crypto_unregister_kpp(&curve25519_alg);
2515 +}
2516 +
2517 +module_init(curve25519_mod_init);
2518 +module_exit(curve25519_mod_exit);
2519 +
2520 +MODULE_ALIAS_CRYPTO("curve25519");
2521 +MODULE_ALIAS_CRYPTO("curve25519-x86");
2522 +MODULE_LICENSE("GPL v2");
2523 --- a/crypto/Kconfig
2524 +++ b/crypto/Kconfig
2525 @@ -269,6 +269,12 @@ config CRYPTO_CURVE25519
2526 select CRYPTO_KPP
2527 select CRYPTO_LIB_CURVE25519_GENERIC
2528
2529 +config CRYPTO_CURVE25519_X86
2530 + tristate "x86_64 accelerated Curve25519 scalar multiplication library"
2531 + depends on X86 && 64BIT
2532 + select CRYPTO_LIB_CURVE25519_GENERIC
2533 + select CRYPTO_ARCH_HAVE_LIB_CURVE25519
2534 +
2535 comment "Authenticated Encryption with Associated Data"
2536
2537 config CRYPTO_CCM