CoCalc -- intel

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/aesni/intel_sha1.c
³⁹⁴⁷⁸ views
1
/*******************************************************************************
2
* Copyright (c) 2013, Intel Corporation 
3
* 
4
* All rights reserved. 
5
* 
6
* Redistribution and use in source and binary forms, with or without
7
* modification, are permitted provided that the following conditions are
8
* met: 
9
* 
10
* * Redistributions of source code must retain the above copyright
11
*   notice, this list of conditions and the following disclaimer.  
12
* 
13
* * Redistributions in binary form must reproduce the above copyright
14
*   notice, this list of conditions and the following disclaimer in the
15
*   documentation and/or other materials provided with the
16
*   distribution. 
17
* 
18
* * Neither the name of the Intel Corporation nor the names of its
19
*   contributors may be used to endorse or promote products derived from
20
*   this software without specific prior written permission. 
21
* 
22
* 
23
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
24
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
27
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
********************************************************************************
35
*
36
* Intel SHA Extensions optimized implementation of a SHA-1 update function 
37
* 
38
* The function takes a pointer to the current hash values, a pointer to the 
39
* input data, and a number of 64 byte blocks to process.  Once all blocks have 
40
* been processed, the digest pointer is  updated with the resulting hash value.
41
* The function only processes complete blocks, there is no functionality to 
42
* store partial blocks.  All message padding and hash value initialization must
43
* be done outside the update function.  
44
* 
45
* The indented lines in the loop are instructions related to rounds processing.
46
* The non-indented lines are instructions related to the message schedule.
47
* 
48
* Author: Sean Gulley <[email protected]>
49
* Date:   July 2013
50
*
51
********************************************************************************
52
*
53
* Example compiler command line:
54
* icc intel_sha_extensions_sha1_intrinsic.c
55
* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
56
*
57
*******************************************************************************/
58

59
#include <sys/types.h>
60
#include <crypto/aesni/aesni_os.h>
61
#include <crypto/aesni/sha_sse.h>
62

63
#include <immintrin.h>
64

65
/*
 * Run the SHA-1 compression function over consecutive 64-byte blocks using
 * the Intel SHA extensions.
 *
 * digest   - five 32-bit state words; read on entry and overwritten with the
 *            updated state on return.  No particular alignment is required:
 *            the state is both loaded and stored with unaligned accesses.
 * data     - input message, num_blks * 64 bytes; read with unaligned loads.
 * num_blks - number of complete 64-byte blocks to process.  When it is 0 the
 *            state is written back unchanged.
 *
 * Only whole blocks are handled; padding and state initialization are the
 * caller's job.
 *
 * Inside the loop, the more deeply indented lines belong to round processing
 * and the less indented lines belong to the message schedule.
 */
void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
   __m128i abcd, e0, e1;
   __m128i abcd_save, e_save;
   __m128i msg0, msg1, msg2, msg3;
   __m128i shuf_mask;

   // Control mask for _mm_shuffle_epi8 that reverses all 16 bytes of a vector
   shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);

   // Load initial hash values: digest[0..3] go into abcd with their 32-bit
   // lanes reversed (0x1B), digest[4] goes into the top lane of an otherwise
   // zero e0.
   abcd      = _mm_loadu_si128((const __m128i*) digest);
   e0        = _mm_setzero_si128();
   e0        = _mm_insert_epi32(e0, (int) digest[4], 3);
   abcd      = _mm_shuffle_epi32(abcd, 0x1B);

   while (num_blks > 0) {
      // Save hash values for addition after rounds
      abcd_save = abcd;
      e_save    = e0;

      // Rounds 0-3
      msg0 = _mm_loadu_si128((const __m128i*) data);
      msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
         e0   = _mm_add_epi32(e0, msg0);
         e1   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);

      // Rounds 4-7
      msg1 = _mm_loadu_si128((const __m128i*) (data + 16));
      msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);

      // Rounds 8-11
      msg2 = _mm_loadu_si128((const __m128i*) (data + 32));
      msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 12-15
      msg3 = _mm_loadu_si128((const __m128i*) (data + 48));
      msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 16-19
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 20-23
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 24-27
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 28-31
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 32-35
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 36-39
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 40-43
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 44-47
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 48-51
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 52-55
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 56-59
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 60-63
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 64-67
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 68-71
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 72-75
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);

      // Rounds 76-79
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);

      // Add current hash values with previously saved
      e0   = _mm_sha1nexte_epu32(e0, e_save);
      abcd = _mm_add_epi32(abcd, abcd_save);

      data += 64;
      num_blks--;
   }

   // Undo the lane reversal and write the state back.  The store is
   // unaligned, matching the load above: digest is a uint32_t pointer and is
   // not guaranteed to be 16-byte aligned.
   abcd = _mm_shuffle_epi32(abcd, 0x1B);
   _mm_storeu_si128((__m128i*) digest, abcd);
   digest[4] = (uint32_t) _mm_extract_epi32(e0, 3);
}
260

261

262
Product

Resources

Company