kernel: 5.4: import wireguard backport
target/linux/generic/backport-5.4/080-wireguard-0069-crypto-arm64-chacha-simplify-tail-block-handling.patch
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 6 Nov 2020 17:39:38 +0100
Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling

commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.

Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
 1 file changed, 69 insertions(+), 124 deletions(-)

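[ Illustrative note, not part of the upstream commit or of the applied diff:
  the tail-handling trick described in the commit message is easier to see
  outside of NEON assembly. The C sketch below is an assumption-laden
  illustration only; the names chunk_xor, xor_tail_chunks and CHUNK are
  hypothetical and do not exist in the kernel. It models one full 64-byte
  block followed by a partial block: every load and store is a full 64 bytes
  wide, the load that would run past the end of the source is redirected to
  the last 64 bytes of the buffer, and the store of the penultimate block is
  issued after the overlapping tail store so it repairs the bytes that store
  clobbered.

	/* Illustrative sketch only -- not kernel code. */
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define CHUNK 64

	/* dst[i] = src[i] ^ ks[i] for exactly CHUNK bytes. */
	static void chunk_xor(uint8_t *dst, const uint8_t *src, const uint8_t *ks)
	{
		for (size_t i = 0; i < CHUNK; i++)
			dst[i] = src[i] ^ ks[i];
	}

	/*
	 * XOR `len` bytes of keystream into dst, with CHUNK <= len < 2 * CHUNK.
	 * ks holds two full keystream blocks (2 * CHUNK bytes).
	 */
	static void xor_tail_chunks(uint8_t *dst, const uint8_t *src,
				    const uint8_t *ks, size_t len)
	{
		size_t tail = len - CHUNK;	/* bytes in the partial block */
		size_t skip = CHUNK - tail;	/* overlap with the first block */
		uint8_t first[CHUNK], window[CHUNK], shifted[CHUNK];

		/* Full first block, kept aside so it can be stored last. */
		chunk_xor(first, src, ks);

		/*
		 * Shift the second keystream block so its first `tail` bytes
		 * line up with the end of the buffer; the prefix stays zero,
		 * like an out-of-range tbl lookup in the NEON code.
		 */
		memset(shifted, 0, skip);
		memcpy(shifted + skip, ks + CHUNK, tail);

		/* Overlapping load at src + len - 64: never reads past the end. */
		chunk_xor(window, src + tail, shifted);

		/* Overlapping store first: its first `skip` bytes are wrong... */
		memcpy(dst + tail, window, CHUNK);
		/* ...then the penultimate block store repairs them. */
		memcpy(dst, first, CHUNK);
	}

  Because both chunk_xor() calls read the source before either memcpy()
  writes the destination, the sketch also works when dst == src, mirroring
  how the assembly performs all of its clamped loads before its stores. ]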
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
 adr_l x10, .Lpermute
 and x5, x4, #63
 add x10, x10, x5
- add x11, x10, #64

 //
 // This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
 zip2 v31.4s, v14.4s, v15.4s
 eor a15, a15, w9

- mov x3, #64
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
 subs x5, x4, #128
- add x6, x5, x2
- csel x3, x3, xzr, ge
- csel x2, x2, x6, ge
+ csel x2, x2, x3, ge

 // interleave 64-bit words in state n, n+2
 zip1 v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
 zip1 v8.2d, v17.2d, v19.2d
 zip2 v12.2d, v17.2d, v19.2d
 stp a2, a3, [x1, #-56]
- ld1 {v16.16b-v19.16b}, [x2], x3

 subs x6, x4, #192
- ccmp x3, xzr, #4, lt
- add x7, x6, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x7, eq
+ ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge

 zip1 v1.2d, v20.2d, v22.2d
 zip2 v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
 zip1 v9.2d, v21.2d, v23.2d
 zip2 v13.2d, v21.2d, v23.2d
 stp a6, a7, [x1, #-40]
- ld1 {v20.16b-v23.16b}, [x2], x3

 subs x7, x4, #256
- ccmp x3, xzr, #4, lt
- add x8, x7, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x8, eq
+ ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge

 zip1 v2.2d, v24.2d, v26.2d
 zip2 v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
 zip1 v10.2d, v25.2d, v27.2d
 zip2 v14.2d, v25.2d, v27.2d
 stp a10, a11, [x1, #-24]
- ld1 {v24.16b-v27.16b}, [x2], x3

 subs x8, x4, #320
- ccmp x3, xzr, #4, lt
- add x9, x8, x2
- csel x2, x2, x9, eq
+ ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge

 zip1 v3.2d, v28.2d, v30.2d
 zip2 v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 )
 zip1 v11.2d, v29.2d, v31.2d
 zip2 v15.2d, v29.2d, v31.2d
 stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
 ld1 {v28.16b-v31.16b}, [x2]

 // xor with corresponding input, write to output
- tbnz x5, #63, 0f
 eor v16.16b, v16.16b, v0.16b
 eor v17.16b, v17.16b, v1.16b
 eor v18.16b, v18.16b, v2.16b
 eor v19.16b, v19.16b, v3.16b
- st1 {v16.16b-v19.16b}, [x1], #64
- cbz x5, .Lout

- tbnz x6, #63, 1f
+ tbnz x6, #63, .Lt192
+
 eor v20.16b, v20.16b, v4.16b
 eor v21.16b, v21.16b, v5.16b
 eor v22.16b, v22.16b, v6.16b
 eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1], #64
- cbz x6, .Lout

- tbnz x7, #63, 2f
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
 eor v24.16b, v24.16b, v8.16b
 eor v25.16b, v25.16b, v9.16b
 eor v26.16b, v26.16b, v10.16b
 eor v27.16b, v27.16b, v11.16b
- st1 {v24.16b-v27.16b}, [x1], #64
- cbz x7, .Lout

- tbnz x8, #63, 3f
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
 eor v28.16b, v28.16b, v12.16b
 eor v29.16b, v29.16b, v13.16b
 eor v30.16b, v30.16b, v14.16b
 eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
 st1 {v28.16b-v31.16b}, [x1]

 .Lout: frame_pop
 ret

- // fewer than 128 bytes of in/output
-0: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- sub x2, x1, #64
- add x1, x1, x5
- ld1 {v16.16b-v19.16b}, [x2]
- tbl v4.16b, {v0.16b-v3.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v5.16b, {v0.16b-v3.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v6.16b, {v0.16b-v3.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v7.16b, {v0.16b-v3.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
 // fewer than 192 bytes of in/output
-1: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- add x1, x1, x6
- tbl v0.16b, {v4.16b-v7.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v1.16b, {v4.16b-v7.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v2.16b, {v4.16b-v7.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v3.16b, {v4.16b-v7.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v1.16b
- eor v22.16b, v22.16b, v2.16b
- eor v23.16b, v23.16b, v3.16b
- st1 {v20.16b-v23.16b}, [x1]
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
 b .Lout

+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
 // fewer than 256 bytes of in/output
-2: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x7
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbx v24.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v8.16b-v11.16b}, v4.16b
- tbx v25.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v8.16b-v11.16b}, v4.16b
- tbx v26.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v4.16b
- tbx v27.16b, {v20.16b-v23.16b}, v5.16b
-
- eor v24.16b, v24.16b, v0.16b
- eor v25.16b, v25.16b, v1.16b
- eor v26.16b, v26.16b, v2.16b
- eor v27.16b, v27.16b, v3.16b
- st1 {v24.16b-v27.16b}, [x1]
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
 b .Lout

 // fewer than 320 bytes of in/output
-3: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x8
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbx v28.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v12.16b-v15.16b}, v4.16b
- tbx v29.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v12.16b-v15.16b}, v4.16b
- tbx v30.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v4.16b
- tbx v31.16b, {v24.16b-v27.16b}, v5.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b

 eor v28.16b, v28.16b, v0.16b
 eor v29.16b, v29.16b, v1.16b
 eor v30.16b, v30.16b, v2.16b
 eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x1]
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
 b .Lout
 ENDPROC(chacha_4block_xor_neon)

@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
 .align L1_CACHE_SHIFT
 .Lpermute:
 .set .Li, 0
- .rept 192
+ .rept 128
 .byte (.Li - 64)
 .set .Li, .Li + 1
 .endr