kernel: 5.4: import wireguard backport
[openwrt/openwrt.git] target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 8 Jul 2020 12:11:18 +0300
Subject: [PATCH] crypto: x86/chacha-sse3 - use unaligned loads for state array

commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.

Because the x86 port does not support allocating objects on the stack
with an alignment that exceeds 8 bytes, we have a rather ugly hack in
the x86 ChaCha code to ensure that the state array is aligned to 16
bytes, allowing the SSE3 implementation of the algorithm to use
aligned loads.

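The hack in question over-allocates the on-stack buffer and rounds the
pointer up to a 16-byte boundary by hand. As a rough userspace sketch
of that trick (illustrative names only; plain pointer arithmetic stands
in for the kernel's PTR_ALIGN() helper):

  /* Sketch of the stack-alignment workaround; not kernel code. */
  #include <stdint.h>
  #include <stdio.h>

  #define STATE_WORDS 16
  #define STATE_ALIGN 16 /* bytes, as movdqa requires */

  int main(void)
  {
          /* Two spare words are enough slack to reach the next
           * 16-byte boundary from an 8-byte-aligned start. */
          uint32_t state_buf[STATE_WORDS + 2] __attribute__((aligned(8)));
          uintptr_t addr = (uintptr_t)state_buf;

          /* Round up to the next multiple of STATE_ALIGN. */
          uint32_t *state = (uint32_t *)((addr + STATE_ALIGN - 1) &
                                         ~(uintptr_t)(STATE_ALIGN - 1));

          printf("buffer at %p, aligned state at %p\n",
                 (void *)state_buf, (void *)state);
          return 0;
  }

With the rounding gone, the state can simply be declared as
u32 state[CHACHA_STATE_WORDS] __aligned(8), which is what the diff
below does.
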
Given that the performance benefit of using aligned loads appears to
be limited (~0.25% for 1k blocks using tcrypt on a Core i7-8650U), and
that this hack has leaked into generic ChaCha code, let's just remove
it.

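For reference, the aligned-versus-unaligned distinction made by the
movdqa -> movdqu change in this patch can also be seen from C via SSE2
intrinsics; the snippet below is purely illustrative and not part of
the patch. _mm_load_si128() requires a 16-byte-aligned pointer, while
_mm_loadu_si128() accepts any address:

  #include <emmintrin.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          /* Only 8-byte alignment is guaranteed here, mirroring the
           * old on-stack ChaCha state buffer. */
          uint32_t state[16] __attribute__((aligned(8)));
          for (int i = 0; i < 16; i++)
                  state[i] = i;

          /* Unaligned load: safe whatever the actual alignment is.
           * An aligned load (_mm_load_si128, i.e. movdqa) could fault
           * if &state[0] happened not to be 16-byte aligned. */
          __m128i row0 = _mm_loadu_si128((const __m128i *)&state[0]);

          uint32_t out[4];
          _mm_storeu_si128((__m128i *)out, row0);
          printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
          return 0;
  }
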
Cc: Martin Willi <martin@strongswan.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin Willi <martin@strongswan.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
 arch/x86/crypto/chacha_glue.c         | 17 ++---------------
 include/crypto/chacha.h               |  4 ----
 3 files changed, 10 insertions(+), 27 deletions(-)

--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
 	FRAME_BEGIN
 
 	# x0..3 = s0..3
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
 	movdqa		%xmm0,%xmm8
 	movdqa		%xmm1,%xmm9
 	movdqa		%xmm2,%xmm10
@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
 	# %edx: nrounds
 	FRAME_BEGIN
 
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
 
 	mov		%edx,%r8d
 	call		chacha_permute
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -14,8 +14,6 @@
 #include <linux/module.h>
 #include <asm/simd.h>
 
-#define CHACHA_STATE_ALIGN 16
-
 asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
 
 void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
 {
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
 		hchacha_block_generic(state, stream, nrounds);
 	} else {
@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
 
 void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
 {
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
 	chacha_init_generic(state, key, iv);
 }
 EXPORT_SYMBOL(chacha_init_arch);
@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
 void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
 		       int nrounds)
 {
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
 	    bytes <= CHACHA_BLOCK_SIZE)
 		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
 static int chacha_simd_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
-	u32 *state, state_buf[16 + 2] __aligned(8);
+	u32 state[CHACHA_STATE_WORDS] __aligned(8);
 	struct skcipher_walk walk;
 	int err;
 
 	err = skcipher_walk_virt(&walk, req, false);
 
-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
-
 	chacha_init_generic(state, ctx->key, iv);
 
 	while (walk.nbytes > 0) {
@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 *state, state_buf[16 + 2] __aligned(8);
+	u32 state[CHACHA_STATE_WORDS] __aligned(8);
 	struct chacha_ctx subctx;
 	u8 real_iv[16];
 
-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 	chacha_init_generic(state, ctx->key, req->iv);
 
 	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,11 +25,7 @@
 #define CHACHA_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12
 
-#ifdef CONFIG_X86_64
-#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
-#else
 #define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
-#endif
 
 /* 192-bit nonce, then 64-bit stream position */
 #define XCHACHA_IV_SIZE		32