kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0055-crypto-x86-curve25519-leave-r12-as-spare-register.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Sun, 1 Mar 2020 16:06:56 +0800
4 Subject: [PATCH] crypto: x86/curve25519 - leave r12 as spare register
5
6 commit dc7fc3a53ae158263196b1892b672aedf67796c5 upstream.
7
8 This updates to the newer register selection proved by HACL*, which
9 uses %rbx in place of %r12. This leads to a more compact instruction
10 encoding and saves around 100 cycles.
11
12 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
13 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
14 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
15 ---
16 arch/x86/crypto/curve25519-x86_64.c | 110 ++++++++++++++--------------
17 1 file changed, 55 insertions(+), 55 deletions(-)
18
19 --- a/arch/x86/crypto/curve25519-x86_64.c
20 +++ b/arch/x86/crypto/curve25519-x86_64.c
21 @@ -167,28 +167,28 @@ static inline void fmul(u64 *out, const
22 " movq 0(%1), %%rdx;"
23 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
24 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
25 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
26 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
27 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
28 " adox %%rdx, %%rax;"
29 /* Compute src1[1] * src2 */
30 " movq 8(%1), %%rdx;"
31 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
32 - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
33 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
34 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
35 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
36 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
37 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
38 /* Compute src1[2] * src2 */
39 " movq 16(%1), %%rdx;"
40 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
41 - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
42 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
43 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
44 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
45 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
46 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
47 /* Compute src1[3] * src2 */
48 " movq 24(%1), %%rdx;"
49 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
50 - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
51 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
52 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
53 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
54 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
55 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
56 /* Line up pointers */
57 @@ -202,11 +202,11 @@ static inline void fmul(u64 *out, const
58 " mulxq 32(%1), %%r8, %%r13;"
59 " xor %3, %3;"
60 " adoxq 0(%1), %%r8;"
61 - " mulxq 40(%1), %%r9, %%r12;"
62 + " mulxq 40(%1), %%r9, %%rbx;"
63 " adcx %%r13, %%r9;"
64 " adoxq 8(%1), %%r9;"
65 " mulxq 48(%1), %%r10, %%r13;"
66 - " adcx %%r12, %%r10;"
67 + " adcx %%rbx, %%r10;"
68 " adoxq 16(%1), %%r10;"
69 " mulxq 56(%1), %%r11, %%rax;"
70 " adcx %%r13, %%r11;"
71 @@ -231,7 +231,7 @@ static inline void fmul(u64 *out, const
72 " movq %%r8, 0(%0);"
73 : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
74 :
75 - : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
76 + : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
77 );
78 }
79
80 @@ -248,28 +248,28 @@ static inline void fmul2(u64 *out, const
81 " movq 0(%1), %%rdx;"
82 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
83 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
84 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
85 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
86 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
87 " adox %%rdx, %%rax;"
88 /* Compute src1[1] * src2 */
89 " movq 8(%1), %%rdx;"
90 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
91 - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
92 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
93 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
94 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
95 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
96 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
97 /* Compute src1[2] * src2 */
98 " movq 16(%1), %%rdx;"
99 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
100 - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
101 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
102 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
103 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
104 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
105 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
106 /* Compute src1[3] * src2 */
107 " movq 24(%1), %%rdx;"
108 " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
109 - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
110 - " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
111 + " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
112 + " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
113 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
114 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
115
116 @@ -279,28 +279,28 @@ static inline void fmul2(u64 *out, const
117 " movq 32(%1), %%rdx;"
118 " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
119 " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
120 - " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
121 + " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
122 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
123 " adox %%rdx, %%rax;"
124 /* Compute src1[1] * src2 */
125 " movq 40(%1), %%rdx;"
126 " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
127 - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
128 - " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
129 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
130 + " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
131 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
132 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
133 /* Compute src1[2] * src2 */
134 " movq 48(%1), %%rdx;"
135 " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
136 - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
137 - " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
138 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
139 + " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
140 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
141 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
142 /* Compute src1[3] * src2 */
143 " movq 56(%1), %%rdx;"
144 " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
145 - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
146 - " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
147 + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
148 + " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
149 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
150 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
151 /* Line up pointers */
152 @@ -314,11 +314,11 @@ static inline void fmul2(u64 *out, const
153 " mulxq 32(%1), %%r8, %%r13;"
154 " xor %3, %3;"
155 " adoxq 0(%1), %%r8;"
156 - " mulxq 40(%1), %%r9, %%r12;"
157 + " mulxq 40(%1), %%r9, %%rbx;"
158 " adcx %%r13, %%r9;"
159 " adoxq 8(%1), %%r9;"
160 " mulxq 48(%1), %%r10, %%r13;"
161 - " adcx %%r12, %%r10;"
162 + " adcx %%rbx, %%r10;"
163 " adoxq 16(%1), %%r10;"
164 " mulxq 56(%1), %%r11, %%rax;"
165 " adcx %%r13, %%r11;"
166 @@ -347,11 +347,11 @@ static inline void fmul2(u64 *out, const
167 " mulxq 96(%1), %%r8, %%r13;"
168 " xor %3, %3;"
169 " adoxq 64(%1), %%r8;"
170 - " mulxq 104(%1), %%r9, %%r12;"
171 + " mulxq 104(%1), %%r9, %%rbx;"
172 " adcx %%r13, %%r9;"
173 " adoxq 72(%1), %%r9;"
174 " mulxq 112(%1), %%r10, %%r13;"
175 - " adcx %%r12, %%r10;"
176 + " adcx %%rbx, %%r10;"
177 " adoxq 80(%1), %%r10;"
178 " mulxq 120(%1), %%r11, %%rax;"
179 " adcx %%r13, %%r11;"
180 @@ -376,7 +376,7 @@ static inline void fmul2(u64 *out, const
181 " movq %%r8, 32(%0);"
182 : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
183 :
184 - : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
185 + : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
186 );
187 }
188
189 @@ -388,11 +388,11 @@ static inline void fmul_scalar(u64 *out,
190 asm volatile(
191 /* Compute the raw multiplication of f1*f2 */
192 " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
193 - " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
194 + " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
195 " add %%rcx, %%r9;"
196 " mov $0, %%rcx;"
197 " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
198 - " adcx %%r12, %%r10;"
199 + " adcx %%rbx, %%r10;"
200 " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
201 " adcx %%r13, %%r11;"
202 " adcx %%rcx, %%rax;"
203 @@ -419,7 +419,7 @@ static inline void fmul_scalar(u64 *out,
204 " movq %%r8, 0(%1);"
205 : "+&r" (f2_r)
206 : "r" (out), "r" (f1)
207 - : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
208 + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
209 );
210 }
211
212 @@ -520,8 +520,8 @@ static inline void fsqr(u64 *out, const
213 " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
214 " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
215 " movq 24(%1), %%rdx;" /* f[3] */
216 - " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
217 - " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
218 + " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
219 + " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
220 " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
221 " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
222
223 @@ -531,12 +531,12 @@ static inline void fsqr(u64 *out, const
224 " adcx %%r8, %%r8;"
225 " adox %%rcx, %%r11;"
226 " adcx %%r9, %%r9;"
227 - " adox %%r15, %%r12;"
228 + " adox %%r15, %%rbx;"
229 " adcx %%r10, %%r10;"
230 " adox %%r15, %%r13;"
231 " adcx %%r11, %%r11;"
232 " adox %%r15, %%r14;"
233 - " adcx %%r12, %%r12;"
234 + " adcx %%rbx, %%rbx;"
235 " adcx %%r13, %%r13;"
236 " adcx %%r14, %%r14;"
237
238 @@ -549,7 +549,7 @@ static inline void fsqr(u64 *out, const
239 " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
240 " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
241 " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
242 - " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
243 + " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
244 " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
245 " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
246 " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
247 @@ -565,11 +565,11 @@ static inline void fsqr(u64 *out, const
248 " mulxq 32(%1), %%r8, %%r13;"
249 " xor %%rcx, %%rcx;"
250 " adoxq 0(%1), %%r8;"
251 - " mulxq 40(%1), %%r9, %%r12;"
252 + " mulxq 40(%1), %%r9, %%rbx;"
253 " adcx %%r13, %%r9;"
254 " adoxq 8(%1), %%r9;"
255 " mulxq 48(%1), %%r10, %%r13;"
256 - " adcx %%r12, %%r10;"
257 + " adcx %%rbx, %%r10;"
258 " adoxq 16(%1), %%r10;"
259 " mulxq 56(%1), %%r11, %%rax;"
260 " adcx %%r13, %%r11;"
261 @@ -594,7 +594,7 @@ static inline void fsqr(u64 *out, const
262 " movq %%r8, 0(%0);"
263 : "+&r" (tmp), "+&r" (f), "+&r" (out)
264 :
265 - : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
266 + : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
267 );
268 }
269
270 @@ -611,8 +611,8 @@ static inline void fsqr2(u64 *out, const
271 " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
272 " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
273 " movq 24(%1), %%rdx;" /* f[3] */
274 - " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
275 - " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
276 + " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
277 + " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
278 " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
279 " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
280
281 @@ -622,12 +622,12 @@ static inline void fsqr2(u64 *out, const
282 " adcx %%r8, %%r8;"
283 " adox %%rcx, %%r11;"
284 " adcx %%r9, %%r9;"
285 - " adox %%r15, %%r12;"
286 + " adox %%r15, %%rbx;"
287 " adcx %%r10, %%r10;"
288 " adox %%r15, %%r13;"
289 " adcx %%r11, %%r11;"
290 " adox %%r15, %%r14;"
291 - " adcx %%r12, %%r12;"
292 + " adcx %%rbx, %%rbx;"
293 " adcx %%r13, %%r13;"
294 " adcx %%r14, %%r14;"
295
296 @@ -640,7 +640,7 @@ static inline void fsqr2(u64 *out, const
297 " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
298 " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
299 " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
300 - " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
301 + " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
302 " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
303 " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
304 " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
305 @@ -651,8 +651,8 @@ static inline void fsqr2(u64 *out, const
306 " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
307 " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
308 " movq 56(%1), %%rdx;" /* f[3] */
309 - " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
310 - " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
311 + " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
312 + " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
313 " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
314 " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
315
316 @@ -662,12 +662,12 @@ static inline void fsqr2(u64 *out, const
317 " adcx %%r8, %%r8;"
318 " adox %%rcx, %%r11;"
319 " adcx %%r9, %%r9;"
320 - " adox %%r15, %%r12;"
321 + " adox %%r15, %%rbx;"
322 " adcx %%r10, %%r10;"
323 " adox %%r15, %%r13;"
324 " adcx %%r11, %%r11;"
325 " adox %%r15, %%r14;"
326 - " adcx %%r12, %%r12;"
327 + " adcx %%rbx, %%rbx;"
328 " adcx %%r13, %%r13;"
329 " adcx %%r14, %%r14;"
330
331 @@ -680,7 +680,7 @@ static inline void fsqr2(u64 *out, const
332 " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
333 " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
334 " adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
335 - " adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
336 + " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
337 " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
338 " adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
339 " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
340 @@ -694,11 +694,11 @@ static inline void fsqr2(u64 *out, const
341 " mulxq 32(%1), %%r8, %%r13;"
342 " xor %%rcx, %%rcx;"
343 " adoxq 0(%1), %%r8;"
344 - " mulxq 40(%1), %%r9, %%r12;"
345 + " mulxq 40(%1), %%r9, %%rbx;"
346 " adcx %%r13, %%r9;"
347 " adoxq 8(%1), %%r9;"
348 " mulxq 48(%1), %%r10, %%r13;"
349 - " adcx %%r12, %%r10;"
350 + " adcx %%rbx, %%r10;"
351 " adoxq 16(%1), %%r10;"
352 " mulxq 56(%1), %%r11, %%rax;"
353 " adcx %%r13, %%r11;"
354 @@ -727,11 +727,11 @@ static inline void fsqr2(u64 *out, const
355 " mulxq 96(%1), %%r8, %%r13;"
356 " xor %%rcx, %%rcx;"
357 " adoxq 64(%1), %%r8;"
358 - " mulxq 104(%1), %%r9, %%r12;"
359 + " mulxq 104(%1), %%r9, %%rbx;"
360 " adcx %%r13, %%r9;"
361 " adoxq 72(%1), %%r9;"
362 " mulxq 112(%1), %%r10, %%r13;"
363 - " adcx %%r12, %%r10;"
364 + " adcx %%rbx, %%r10;"
365 " adoxq 80(%1), %%r10;"
366 " mulxq 120(%1), %%r11, %%rax;"
367 " adcx %%r13, %%r11;"
368 @@ -756,7 +756,7 @@ static inline void fsqr2(u64 *out, const
369 " movq %%r8, 32(%0);"
370 : "+&r" (tmp), "+&r" (f), "+&r" (out)
371 :
372 - : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
373 + : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
374 );
375 }
376