Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/symcipher/chacha20_sse2.c
39482 views
1
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25
#define BR_ENABLE_INTRINSICS 1
26
#include "inner.h"
27
28
#if BR_SSE2
29
30
/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */
34
35
/* see bearssl_block.h */
36
br_chacha20_run
37
br_chacha20_sse2_get(void)
38
{
39
/*
40
* If using 64-bit mode, then SSE2 opcodes should be automatically
41
* available, since they are part of the ABI.
42
*
43
* In 32-bit mode, we use CPUID to detect the SSE2 feature.
44
*/
45
46
#if BR_amd64
47
return &br_chacha20_sse2_run;
48
#else
49
50
/*
51
* SSE2 support is indicated by bit 26 in EDX.
52
*/
53
if (br_cpuid(0, 0, 0, 0x04000000)) {
54
return &br_chacha20_sse2_run;
55
} else {
56
return 0;
57
}
58
#endif
59
}
60
61
BR_TARGETS_X86_UP
62
63
/* see bearssl_block.h */
64
BR_TARGET("sse2")
65
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	/*
	 * ChaCha20 encryption/decryption with SSE2 intrinsics.
	 *
	 *   key    32 bytes of key material (read with two 16-byte loads)
	 *   iv     12 bytes of nonce
	 *   cc     block counter to use for the first 64-byte block
	 *   data   buffer which is XORed in place with the key stream
	 *   len    number of bytes to process in data
	 *
	 * Returned value is the block counter after processing; it is
	 * incremented once per generated block, including a final
	 * partial block. If len is 0, cc is returned unchanged.
	 */
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1;
	__m128i iw, cw;
	__m128i one;

	/*
	 * First row of the initial state: the ChaCha constant words
	 * ("expand 32-byte k" in little-endian encoding).
	 */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));

	/*
	 * Last row of the initial state is the block counter followed
	 * by the three nonce words.
	 */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);

	/*
	 * Adding 'one' increments only the lowest 32-bit lane (the block
	 * counter); the 32-bit lane addition wraps without carrying into
	 * the nonce words.
	 */
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state words. SSE2 has no 32-bit rotate, so
			 * each rotation is a left shift ORed with the
			 * complementary right shift.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words: s1, s2 and s3 have
			 * their lanes rotated by one, two and three
			 * positions, respectively.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter.
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Final partial block: write the key stream to a
			 * temporary buffer and XOR only the remaining
			 * bytes, so that nothing beyond data[len-1] is
			 * read or written.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp +  0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf +  0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf +  0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16() and reassemble the
	 * 32-bit counter from its two 16-bit halves.
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
225
226
BR_TARGETS_X86_DOWN
227
228
#else
229
230
/* see bearssl_block.h */
231
br_chacha20_run
232
br_chacha20_sse2_get(void)
233
{
234
return 0;
235
}
236
237
#endif
238
239