Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/aesni/intel_sha1.c
39478 views
1
/*******************************************************************************
2
* Copyright (c) 2013, Intel Corporation
3
*
4
* All rights reserved.
5
*
6
* Redistribution and use in source and binary forms, with or without
7
* modification, are permitted provided that the following conditions are
8
* met:
9
*
10
* * Redistributions of source code must retain the above copyright
11
* notice, this list of conditions and the following disclaimer.
12
*
13
* * Redistributions in binary form must reproduce the above copyright
14
* notice, this list of conditions and the following disclaimer in the
15
* documentation and/or other materials provided with the
16
* distribution.
17
*
18
* * Neither the name of the Intel Corporation nor the names of its
19
* contributors may be used to endorse or promote products derived from
20
* this software without specific prior written permission.
21
*
22
*
23
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
24
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
27
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
********************************************************************************
35
*
36
* Intel SHA Extensions optimized implementation of a SHA-1 update function
37
*
38
* The function takes a pointer to the current hash values, a pointer to the
39
* input data, and a number of 64 byte blocks to process. Once all blocks have
40
* been processed, the digest pointer is updated with the resulting hash value.
41
* The function only processes complete blocks, there is no functionality to
42
* store partial blocks. All message padding and hash value initialization must
43
* be done outside the update function.
44
*
45
* The indented lines in the loop are instructions related to rounds processing.
46
* The non-indented lines are instructions related to the message schedule.
47
*
48
* Author: Sean Gulley <[email protected]>
49
* Date: July 2013
50
*
51
********************************************************************************
52
*
53
* Example compiler command line:
54
* icc intel_sha_extensions_sha1_intrinsic.c
55
* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
56
*
57
*******************************************************************************/
58
59
#include <sys/types.h>
60
#include <crypto/aesni/aesni_os.h>
61
#include <crypto/aesni/sha_sse.h>
62
63
#include <immintrin.h>
64
65
/*
 * Run the SHA-1 compression function over num_blks consecutive 64-byte
 * blocks using the Intel SHA extension intrinsics.
 *
 * digest   - five 32-bit chaining words; read on entry and overwritten with
 *            the updated state on return.  No particular alignment is
 *            required: both the load and the store are unaligned accesses.
 * data     - num_blks * 64 bytes of message input, read with unaligned
 *            loads.
 * num_blks - number of complete blocks to process.  With 0 the loop is
 *            skipped and the digest words are written back unchanged.
 *
 * Message padding and initialization of the chaining value are left to the
 * caller.  Inside the loop, the deeper-indented statements advance the round
 * state and the remaining statements compute the message schedule.
 */
void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
	__m128i abcd, e0, e1;
	__m128i abcd_save, e_save;
	__m128i msg0, msg1, msg2, msg3;
	__m128i shuf_mask;

	// Byte i of the shuffled result is byte 15-i of the source, i.e. a full
	// 16-byte reversal of each loaded message quarter.
	shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);

	// Load initial hash values: digest[0..3] with their lane order reversed
	// (0x1B), digest[4] in the top lane of an otherwise zero vector.
	abcd = _mm_loadu_si128((const __m128i *)digest);
	e0 = _mm_set_epi64x(0, 0);
	e0 = _mm_insert_epi32(e0, (int)digest[4], 3);
	abcd = _mm_shuffle_epi32(abcd, 0x1B);

	while (num_blks > 0) {
		// Save hash values for addition after rounds
		abcd_save = abcd;
		e_save = e0;

		// Rounds 0-3
		msg0 = _mm_loadu_si128((const __m128i *)data);
		msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
			e0 = _mm_add_epi32(e0, msg0);
			e1 = abcd;
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);

		// Rounds 4-7
		msg1 = _mm_loadu_si128((const __m128i *)(data + 16));
		msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
			e1 = _mm_sha1nexte_epu32(e1, msg1);
			e0 = abcd;
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);

		// Rounds 8-11
		msg2 = _mm_loadu_si128((const __m128i *)(data + 32));
		msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
			e0 = _mm_sha1nexte_epu32(e0, msg2);
			e1 = abcd;
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		// Rounds 12-15
		msg3 = _mm_loadu_si128((const __m128i *)(data + 48));
		msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
			e1 = _mm_sha1nexte_epu32(e1, msg3);
			e0 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		// Rounds 16-19
			e0 = _mm_sha1nexte_epu32(e0, msg0);
			e1 = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		// Rounds 20-23
			e1 = _mm_sha1nexte_epu32(e1, msg1);
			e0 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		// Rounds 24-27
			e0 = _mm_sha1nexte_epu32(e0, msg2);
			e1 = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		// Rounds 28-31
			e1 = _mm_sha1nexte_epu32(e1, msg3);
			e0 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		// Rounds 32-35
			e0 = _mm_sha1nexte_epu32(e0, msg0);
			e1 = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		// Rounds 36-39
			e1 = _mm_sha1nexte_epu32(e1, msg1);
			e0 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		// Rounds 40-43
			e0 = _mm_sha1nexte_epu32(e0, msg2);
			e1 = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		// Rounds 44-47
			e1 = _mm_sha1nexte_epu32(e1, msg3);
			e0 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		// Rounds 48-51
			e0 = _mm_sha1nexte_epu32(e0, msg0);
			e1 = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		// Rounds 52-55
			e1 = _mm_sha1nexte_epu32(e1, msg1);
			e0 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
		msg3 = _mm_xor_si128(msg3, msg1);

		// Rounds 56-59
			e0 = _mm_sha1nexte_epu32(e0, msg2);
			e1 = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
		msg0 = _mm_xor_si128(msg0, msg2);

		// Rounds 60-63
			e1 = _mm_sha1nexte_epu32(e1, msg3);
			e0 = abcd;
		msg0 = _mm_sha1msg2_epu32(msg0, msg3);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
		msg2 = _mm_sha1msg1_epu32(msg2, msg3);
		msg1 = _mm_xor_si128(msg1, msg3);

		// Rounds 64-67
			e0 = _mm_sha1nexte_epu32(e0, msg0);
			e1 = abcd;
		msg1 = _mm_sha1msg2_epu32(msg1, msg0);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
		msg3 = _mm_sha1msg1_epu32(msg3, msg0);
		msg2 = _mm_xor_si128(msg2, msg0);

		// Rounds 68-71
			e1 = _mm_sha1nexte_epu32(e1, msg1);
			e0 = abcd;
		msg2 = _mm_sha1msg2_epu32(msg2, msg1);
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
		msg3 = _mm_xor_si128(msg3, msg1);

		// Rounds 72-75
			e0 = _mm_sha1nexte_epu32(e0, msg2);
			e1 = abcd;
		msg3 = _mm_sha1msg2_epu32(msg3, msg2);
			abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);

		// Rounds 76-79
			e1 = _mm_sha1nexte_epu32(e1, msg3);
			e0 = abcd;
			abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);

		// Add current hash values with previously saved
		e0 = _mm_sha1nexte_epu32(e0, e_save);
		abcd = _mm_add_epi32(abcd, abcd_save);

		data += 64;
		num_blks--;
	}

	// Undo the lane reversal and write the state back.  The store is
	// unaligned to match the unaligned load above: digest is a plain
	// uint32_t pointer and carries no 16-byte alignment guarantee.
	abcd = _mm_shuffle_epi32(abcd, 0x1B);
	_mm_storeu_si128((__m128i *)digest, abcd);
	digest[4] = (uint32_t)_mm_extract_epi32(e0, 3);
}
260
261
262