/* Path: blob/main/contrib/bearssl/src/symcipher/chacha20_sse2.c */
/*
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_SSE2

/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	/*
	 * If using 64-bit mode, then SSE2 opcodes should be automatically
	 * available, since they are part of the ABI.
	 *
	 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
	 */

#if BR_amd64
	return &br_chacha20_sse2_run;
#else

	/*
	 * SSE2 support is indicated by bit 26 in EDX.
	 */
	if (br_cpuid(0, 0, 0, 0x04000000)) {
		return &br_chacha20_sse2_run;
	} else {
		return 0;
	}
#endif
}

BR_TARGETS_X86_UP

/*
 * ChaCha20 keystream + XOR over 'len' bytes of 'data', using the
 * 256-bit 'key', the 96-bit nonce 'iv' and starting block counter 'cc'.
 * Returns the block counter value after processing (low 32 bits).
 * See bearssl_block.h for the full contract.
 */
/* see bearssl_block.h */
BR_TARGET("sse2")
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1;
	__m128i iw, cw;
	__m128i one;

	/* ChaCha20 constant words: "expand 32-byte k" as little-endian. */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	/* State words 12..15: 32-bit block counter followed by the 96-bit IV. */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		/* 10 iterations of (even round + odd round) = 20 rounds. */
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state words. Rotations are emulated with
			 * shift pairs since SSE2 has no 32-bit rotate.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter (lowest 32-bit lane of iw only).
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Partial final block: spill the keystream to a
			 * stack buffer and XOR byte by byte.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp + 0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			/* Full 64-byte block: XOR in place, 16 bytes at a time. */
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf + 0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf + 0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * Return the updated block counter (low lane of iw).
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16() twice.
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	/* SSE2 not compiled in: no implementation available. */
	return 0;
}

#endif