kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] / target / linux / generic / backport-5.4 / 080-wireguard-0004-crypto-x86-chacha-expose-SIMD-ChaCha-routine-as-libr.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Ard Biesheuvel <ardb@kernel.org>
3 Date: Fri, 8 Nov 2019 13:22:10 +0100
4 Subject: [PATCH] crypto: x86/chacha - expose SIMD ChaCha routine as library
5 function
6
7 commit 84e03fa39fbe95a5567d43bff458c6d3b3a23ad1 upstream.
8
9 Wire the existing x86 SIMD ChaCha code into the new ChaCha library
10 interface, so that users of the library interface will get the
11 accelerated version when available.
12
13 Given that calls into the library API will always go through the
14 routines in this module if it is enabled, switch to static keys
15 to select the optimal implementation available (which may be none
16 at all, in which case we defer to the generic implementation for
17 all invocations).
18
19 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
20 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
21 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
22 ---
23 arch/x86/crypto/chacha_glue.c | 91 +++++++++++++++++++++++++----------
24 crypto/Kconfig | 1 +
25 include/crypto/chacha.h | 6 +++
26 3 files changed, 73 insertions(+), 25 deletions(-)
27
28 --- a/arch/x86/crypto/chacha_glue.c
29 +++ b/arch/x86/crypto/chacha_glue.c
30 @@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u
31 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
32 unsigned int len, int nrounds);
33 asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
34 -#ifdef CONFIG_AS_AVX2
35 +
36 asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
37 unsigned int len, int nrounds);
38 asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
39 unsigned int len, int nrounds);
40 asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
41 unsigned int len, int nrounds);
42 -static bool chacha_use_avx2;
43 -#ifdef CONFIG_AS_AVX512
44 +
45 asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
46 unsigned int len, int nrounds);
47 asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
48 unsigned int len, int nrounds);
49 asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
50 unsigned int len, int nrounds);
51 -static bool chacha_use_avx512vl;
52 -#endif
53 -#endif
54 +
55 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
56 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
57 +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
58
59 static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
60 {
61 @@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsig
62 static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
63 unsigned int bytes, int nrounds)
64 {
65 -#ifdef CONFIG_AS_AVX2
66 -#ifdef CONFIG_AS_AVX512
67 - if (chacha_use_avx512vl) {
68 + if (IS_ENABLED(CONFIG_AS_AVX512) &&
69 + static_branch_likely(&chacha_use_avx512vl)) {
70 while (bytes >= CHACHA_BLOCK_SIZE * 8) {
71 chacha_8block_xor_avx512vl(state, dst, src, bytes,
72 nrounds);
73 @@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8
74 return;
75 }
76 }
77 -#endif
78 - if (chacha_use_avx2) {
79 +
80 + if (IS_ENABLED(CONFIG_AS_AVX2) &&
81 + static_branch_likely(&chacha_use_avx2)) {
82 while (bytes >= CHACHA_BLOCK_SIZE * 8) {
83 chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
84 bytes -= CHACHA_BLOCK_SIZE * 8;
85 @@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8
86 return;
87 }
88 }
89 -#endif
90 +
91 while (bytes >= CHACHA_BLOCK_SIZE * 4) {
92 chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
93 bytes -= CHACHA_BLOCK_SIZE * 4;
94 @@ -123,6 +123,43 @@ static void chacha_dosimd(u32 *state, u8
95 }
96 }
97
98 +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
99 +{ /* Exported HChaCha entry point: runs hchacha_block_ssse3() when SIMD is enabled and usable, otherwise hchacha_block_generic(). */
100 + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); /* continue with the CHACHA_STATE_ALIGN-aligned address of the caller's state */
101 +
102 + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { /* key is only enabled by module init when SSSE3 is present */
103 + hchacha_block_generic(state, stream, nrounds);
104 + } else {
105 + kernel_fpu_begin(); /* the SSSE3 call is bracketed by kernel_fpu_begin()/kernel_fpu_end() */
106 + hchacha_block_ssse3(state, stream, nrounds);
107 + kernel_fpu_end();
108 + }
109 +}
110 +EXPORT_SYMBOL(hchacha_block_arch);
111 +
112 +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
113 +{ /* Exported init hook: aligns the state pointer and defers to chacha_init_generic(); no SIMD code is involved here. */
114 + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); /* same alignment step as the other *_arch entry points, so all of them address the same words */
115 +
116 + chacha_init_generic(state, key, iv);
117 +}
118 +EXPORT_SYMBOL(chacha_init_arch);
119 +
120 +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
121 + int nrounds)
122 +{ /* Exported ChaCha entry point: hands the request to chacha_dosimd() or to chacha_crypt_generic(). */
123 + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); /* continue with the CHACHA_STATE_ALIGN-aligned address of the caller's state */
124 +
125 + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || /* generic path when SIMD is disabled or not usable right now ... */
126 + bytes <= CHACHA_BLOCK_SIZE) /* ... and also for inputs of at most one block */
127 + return chacha_crypt_generic(state, dst, src, bytes, nrounds);
128 +
129 + kernel_fpu_begin(); /* NOTE(review): the whole request, whatever its size, runs inside this single fpu section - confirm callers bound 'bytes' or that chunking is added elsewhere */
130 + chacha_dosimd(state, dst, src, bytes, nrounds);
131 + kernel_fpu_end();
132 +}
133 +EXPORT_SYMBOL(chacha_crypt_arch);
134 +
135 static int chacha_simd_stream_xor(struct skcipher_request *req,
136 const struct chacha_ctx *ctx, const u8 *iv)
137 {
138 @@ -143,7 +180,8 @@ static int chacha_simd_stream_xor(struct
139 if (nbytes < walk.total)
140 nbytes = round_down(nbytes, walk.stride);
141
142 - if (!crypto_simd_usable()) {
143 + if (!static_branch_likely(&chacha_use_simd) ||
144 + !crypto_simd_usable()) {
145 chacha_crypt_generic(state, walk.dst.virt.addr,
146 walk.src.virt.addr, nbytes,
147 ctx->nrounds);
148 @@ -246,18 +284,21 @@ static struct skcipher_alg algs[] = {
149 static int __init chacha_simd_mod_init(void)
150 {
151 if (!boot_cpu_has(X86_FEATURE_SSSE3))
152 - return -ENODEV;
153 + return 0; /* no SSSE3: init now succeeds with all three static keys left off and no skciphers registered. NOTE(review): the module exit path is not visible here - confirm it does not unregister algs in this case */
154
155 -#ifdef CONFIG_AS_AVX2
156 - chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
157 - boot_cpu_has(X86_FEATURE_AVX2) &&
158 - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
159 -#ifdef CONFIG_AS_AVX512
160 - chacha_use_avx512vl = chacha_use_avx2 &&
161 - boot_cpu_has(X86_FEATURE_AVX512VL) &&
162 - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
163 -#endif
164 -#endif
165 + static_branch_enable(&chacha_use_simd); /* SSSE3 was confirmed above: enable the baseline SIMD path */
166 +
167 + if (IS_ENABLED(CONFIG_AS_AVX2) &&
168 + boot_cpu_has(X86_FEATURE_AVX) &&
169 + boot_cpu_has(X86_FEATURE_AVX2) &&
170 + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
171 + static_branch_enable(&chacha_use_avx2); /* requires CONFIG_AS_AVX2, the AVX and AVX2 CPU flags, and the SSE|YMM xfeatures */
172 +
173 + if (IS_ENABLED(CONFIG_AS_AVX512) &&
174 + boot_cpu_has(X86_FEATURE_AVX512VL) &&
175 + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
176 + static_branch_enable(&chacha_use_avx512vl); /* nested inside the AVX2 branch, so never enabled without AVX2 */
177 + }
178 return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
179 }
180
181 --- a/crypto/Kconfig
182 +++ b/crypto/Kconfig
183 @@ -1418,6 +1418,7 @@ config CRYPTO_CHACHA20_X86_64
184 depends on X86 && 64BIT
185 select CRYPTO_BLKCIPHER
186 select CRYPTO_LIB_CHACHA_GENERIC
187 + select CRYPTO_ARCH_HAVE_LIB_CHACHA
188 help
189 SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
190 XChaCha20, and XChaCha12 stream ciphers.
191 --- a/include/crypto/chacha.h
192 +++ b/include/crypto/chacha.h
193 @@ -25,6 +25,12 @@
194 #define CHACHA_BLOCK_SIZE 64
195 #define CHACHAPOLY_IV_SIZE 12
196
197 +#ifdef CONFIG_X86_64 /* number of u32 words in a ChaCha state buffer; larger on x86-64 */
198 +#define CHACHA_STATE_WORDS ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32)) /* 12 extra bytes (3 words); presumably slack for the PTR_ALIGN(state, CHACHA_STATE_ALIGN) done by the x86 glue - TODO confirm against CHACHA_STATE_ALIGN */
199 +#else
200 +#define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32)) /* exactly one block: 64 bytes = 16 words */
201 +#endif
202 +
203 /* 192-bit nonce, then 64-bit stream position */
204 #define XCHACHA_IV_SIZE 32
205