kernel: 5.4: import wireguard backport
target/linux/generic/backport-5.4/080-wireguard-0063-crypto-curve25519-x86_64-Use-XORL-r32-32.patch
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Thu, 27 Aug 2020 19:30:58 +0200
Subject: [PATCH] crypto: curve25519-x86_64 - Use XORL r32,r32

commit db719539fd3889836900bf912755aa30a5985e9a upstream.

x86_64 zero extends 32bit operations, so for 64bit operands,
XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
a REX prefix byte when legacy registers are used.

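As a quick illustration (a stand-alone user-space sketch, not part
of this patch; the file name demo.c is hypothetical), the "%k"
operand modifier used in the hunks below selects the 32-bit name of
a register, and zero extension makes the 32-bit XOR clear the full
64-bit value:

    /* demo.c - build with: gcc -O2 demo.c */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t x = ~0ULL;

        /* Emits e.g. "xor %eax, %eax" (31 c0, two bytes) instead
         * of "xor %rax, %rax" (48 31 c0, three bytes with the
         * REX.W prefix); the CPU zero extends the 32-bit result,
         * so all 64 bits of x are cleared. */
        asm("xor %k0, %k0" : "+r"(x));

        printf("x = %llu\n", (unsigned long long)x); /* prints 0 */
        return 0;
    }

Note that r8-r15 always carry a REX prefix, so the byte saving only
materializes for the legacy registers (rax, rbx, rcx, rdx, rsi, rdi,
rbp, rsp); the 32-bit form is never larger either way.
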
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
 1 file changed, 34 insertions(+), 34 deletions(-)

--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, c

 asm volatile(
 /* Clear registers to propagate the carry bit */
- " xor %%r8, %%r8;"
- " xor %%r9, %%r9;"
- " xor %%r10, %%r10;"
- " xor %%r11, %%r11;"
- " xor %1, %1;"
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"

 /* Begin addition chain */
 " addq 0(%3), %0;"
@@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const
 " cmovc %0, %%rax;"

 /* Step 2: Add carry*38 to the original sum */
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
 " add %%rax, %%r8;"
 " adcx %%rcx, %%r9;"
 " movq %%r9, 8(%1);"
@@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const

 /* Compute src1[0] * src2 */
 " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;"
 /* Compute src1[1] * src2 */
 " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
 /* Compute src1[2] * src2 */
 " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
 /* Compute src1[3] * src2 */
 " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const
 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 " mov $38, %%rdx;"
 " mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
 " adoxq 0(%1), %%r8;"
 " mulxq 40(%1), %%r9, %%rbx;"
 " adcx %%r13, %%r9;"
@@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const

 /* Compute src1[0] * src2 */
 " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;"
 /* Compute src1[1] * src2 */
 " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
 /* Compute src1[2] * src2 */
 " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
 /* Compute src1[3] * src2 */
 " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
 " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
 " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
 " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const

 /* Compute src1[0] * src2 */
 " movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
 " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;"
 /* Compute src1[1] * src2 */
 " movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
 " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
 /* Compute src1[2] * src2 */
 " movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
 " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
 /* Compute src1[3] * src2 */
 " movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
 " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
 " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
 " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
@@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const
 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 " mov $38, %%rdx;"
 " mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
 " adoxq 0(%1), %%r8;"
 " mulxq 40(%1), %%r9, %%rbx;"
 " adcx %%r13, %%r9;"
@@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const
 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 " mov $38, %%rdx;"
 " mulxq 96(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
 " adoxq 64(%1), %%r8;"
 " mulxq 104(%1), %%r9, %%rbx;"
 " adcx %%r13, %%r9;"
@@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const

 /* Step 1: Compute all partial products */
 " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
 " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
 " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
 " movq 24(%1), %%rdx;" /* f[3] */
@@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const
 " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

 /* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
 " adox %%rax, %%r10;"
 " adcx %%r8, %%r8;"
 " adox %%rcx, %%r11;"
@@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const
 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 " mov $38, %%rdx;"
 " mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
 " adoxq 0(%1), %%r8;"
 " mulxq 40(%1), %%r9, %%rbx;"
 " adcx %%r13, %%r9;"
@@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const
 asm volatile(
 /* Step 1: Compute all partial products */
 " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
 " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
 " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
 " movq 24(%1), %%rdx;" /* f[3] */
@@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const
 " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

 /* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
 " adox %%rax, %%r10;"
 " adcx %%r8, %%r8;"
 " adox %%rcx, %%r11;"
@@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const

 /* Step 1: Compute all partial products */
 " movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
 " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
 " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
 " movq 56(%1), %%rdx;" /* f[3] */
@@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const
 " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

 /* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
 " adox %%rax, %%r10;"
 " adcx %%r8, %%r8;"
 " adox %%rcx, %%r11;"
@@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const
 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 " mov $38, %%rdx;"
 " mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
 " adoxq 0(%1), %%r8;"
 " mulxq 40(%1), %%r9, %%rbx;"
 " adcx %%r13, %%r9;"
@@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const
 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 " mov $38, %%rdx;"
 " mulxq 96(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
 " adoxq 64(%1), %%r8;"
 " mulxq 104(%1), %%r9, %%rbx;"
 " adcx %%r13, %%r9;"