Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/symcrypt/lib/aes-pattern.c
15010 views
//
// aes-pattern.c
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// This file contains "pattern" code for AES-related functions. It's not intended to be compiled
// directly; rather it is included by other aes-*.c files which define the macros used here.
//
9
10
#if 0
11
#pragma makedep header
12
#endif
13
14
#if SYMCRYPT_CPU_ARM64
15
16
VOID
17
SYMCRYPT_CALL
18
SYMCRYPT_AesCtrMsbXxNeon(
19
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
20
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
21
_In_reads_( cbData ) PCBYTE pbSrc,
22
_Out_writes_( cbData ) PBYTE pbDst,
23
SIZE_T cbData )
24
{
25
__n128 chain = *(__n128 *)pbChainingValue;
26
const __n128 * pSrc = (const __n128 *) pbSrc;
27
__n128 * pDst = (__n128 *) pbDst;
28
29
const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
30
const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
31
const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
32
33
__n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
34
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
35
36
cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);
37
38
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
39
ctr0 = vrev64q_u8( chain );
40
ctr1 = VADDQ_UXX( ctr0, chainIncrement1 );
41
ctr2 = VADDQ_UXX( ctr0, chainIncrement2 );
42
ctr3 = VADDQ_UXX( ctr1, chainIncrement2 );
43
ctr4 = VADDQ_UXX( ctr2, chainIncrement2 );
44
ctr5 = VADDQ_UXX( ctr3, chainIncrement2 );
45
ctr6 = VADDQ_UXX( ctr4, chainIncrement2 );
46
ctr7 = VADDQ_UXX( ctr5, chainIncrement2 );
47
48
/*
49
while cbData >= 5 * block
50
generate 8 blocks of key stream
51
if cbData < 8 * block
52
break;
53
process 8 blocks
54
if cbData >= 5 * block
55
process 5-7 blocks
56
done
57
if cbData >= 2 * block
58
generate 4 blocks of key stream
59
process 2-4 blocks
60
done
61
if cbData == 1 block
62
generate 1 block of key stream
63
process block
64
*/
65
while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
66
{
67
c0 = vrev64q_u8( ctr0 );
68
c1 = vrev64q_u8( ctr1 );
69
c2 = vrev64q_u8( ctr2 );
70
c3 = vrev64q_u8( ctr3 );
71
c4 = vrev64q_u8( ctr4 );
72
c5 = vrev64q_u8( ctr5 );
73
c6 = vrev64q_u8( ctr6 );
74
c7 = vrev64q_u8( ctr7 );
75
76
ctr0 = VADDQ_UXX( ctr0, chainIncrement8 );
77
ctr1 = VADDQ_UXX( ctr1, chainIncrement8 );
78
ctr2 = VADDQ_UXX( ctr2, chainIncrement8 );
79
ctr3 = VADDQ_UXX( ctr3, chainIncrement8 );
80
ctr4 = VADDQ_UXX( ctr4, chainIncrement8 );
81
ctr5 = VADDQ_UXX( ctr5, chainIncrement8 );
82
ctr6 = VADDQ_UXX( ctr6, chainIncrement8 );
83
ctr7 = VADDQ_UXX( ctr7, chainIncrement8 );
84
85
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
86
87
if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
88
{
89
break;
90
}
91
92
pDst[0] = veorq_u64( pSrc[0], c0 );
93
pDst[1] = veorq_u64( pSrc[1], c1 );
94
pDst[2] = veorq_u64( pSrc[2], c2 );
95
pDst[3] = veorq_u64( pSrc[3], c3 );
96
pDst[4] = veorq_u64( pSrc[4], c4 );
97
pDst[5] = veorq_u64( pSrc[5], c5 );
98
pDst[6] = veorq_u64( pSrc[6], c6 );
99
pDst[7] = veorq_u64( pSrc[7], c7 );
100
101
pDst += 8;
102
pSrc += 8;
103
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
104
}
105
106
//
107
// At this point we have one of the two following cases:
108
// - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. ctr0-ctr7 is set to (c0+8)-(c7+8)
109
// - cbData < 5 * 16 and we have no blocks of key stream, and ctr0-ctr7 set to the next 8 counters to use
110
//
111
112
if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // quick exit of function if the request was a multiple of 8 blocks
113
{
114
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
115
{
116
//
117
// We already have the key stream
118
//
119
pDst[0] = veorq_u64( pSrc[0], c0 );
120
pDst[1] = veorq_u64( pSrc[1], c1 );
121
pDst[2] = veorq_u64( pSrc[2], c2 );
122
pDst[3] = veorq_u64( pSrc[3], c3 );
123
pDst[4] = veorq_u64( pSrc[4], c4 );
124
chain = VSUBQ_UXX( ctr5, chainIncrement8 );
125
126
if( cbData >= 96 )
127
{
128
chain = VSUBQ_UXX( ctr6, chainIncrement8 );
129
pDst[5] = veorq_u64( pSrc[5], c5 );
130
if( cbData >= 112 )
131
{
132
chain = VSUBQ_UXX( ctr7, chainIncrement8 );
133
pDst[6] = veorq_u64( pSrc[6], c6 );
134
}
135
}
136
}
137
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
138
{
139
// Produce 4 blocks of key stream
140
141
chain = ctr2; // chain is only incremented by 2 for now
142
143
c0 = vrev64q_u8( ctr0 );
144
c1 = vrev64q_u8( ctr1 );
145
c2 = vrev64q_u8( ctr2 );
146
c3 = vrev64q_u8( ctr3 );
147
148
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
149
150
pDst[0] = veorq_u64( pSrc[0], c0 );
151
pDst[1] = veorq_u64( pSrc[1], c1 );
152
if( cbData >= 48 )
153
{
154
chain = ctr3;
155
pDst[2] = veorq_u64( pSrc[2], c2 );
156
if( cbData >= 64 )
157
{
158
chain = ctr4;
159
pDst[3] = veorq_u64( pSrc[3], c3 );
160
}
161
}
162
}
163
else
164
{
165
// Exactly 1 block to process
166
chain = ctr1;
167
168
c0 = vrev64q_u8( ctr0 );
169
170
AES_ENCRYPT_1( pExpandedKey, c0 );
171
pDst[0] = veorq_u64( pSrc[0], c0 );
172
}
173
}
174
else
175
{
176
chain = ctr0;
177
}
178
179
chain = vrev64q_u8( chain );
180
*(__n128 *)pbChainingValue = chain;
181
}
182
183
#endif // SYMCRYPT_CPU_ARM64
184
185
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
186
187
VOID
188
SYMCRYPT_CALL
189
SYMCRYPT_AesCtrMsbXxXmm(
190
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
191
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
192
_In_reads_( cbData ) PCBYTE pbSrc,
193
_Out_writes_( cbData ) PBYTE pbDst,
194
SIZE_T cbData )
195
{
196
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
197
198
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
199
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
200
201
__m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
202
__m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
203
__m128i chainIncrement3 = _mm_set_epi32( 0, 0, 0, 3 );
204
//__m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 );
205
206
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
207
208
cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);
209
210
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
211
212
/*
213
while cbData >= 5 * block
214
generate 8 blocks of key stream
215
if cbData < 8 * block
216
break;
217
process 8 blocks
218
if cbData >= 5 * block
219
process 5-7 blocks
220
done
221
if cbData > 1 block
222
generate 4 blocks of key stream
223
process 2-4 blocks
224
done
225
if cbData == 1 block
226
generate 1 block of key stream
227
process block
228
*/
229
while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
230
{
231
c0 = chain;
232
c1 = MM_ADD_EPIXX( chain, chainIncrement1 );
233
c2 = MM_ADD_EPIXX( chain, chainIncrement2 );
234
c3 = MM_ADD_EPIXX( c1, chainIncrement2 );
235
c4 = MM_ADD_EPIXX( c2, chainIncrement2 );
236
c5 = MM_ADD_EPIXX( c3, chainIncrement2 );
237
c6 = MM_ADD_EPIXX( c4, chainIncrement2 );
238
c7 = MM_ADD_EPIXX( c5, chainIncrement2 );
239
chain = MM_ADD_EPIXX( c6, chainIncrement2 );
240
241
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
242
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
243
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
244
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
245
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
246
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
247
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
248
c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
249
250
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
251
252
if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
253
{
254
break;
255
}
256
257
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
258
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
259
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
260
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
261
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ) ) ) );
262
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ) ) ) );
263
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ) ) ) );
264
_mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112 ) ) ) );
265
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
266
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
267
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
268
}
269
270
//
271
// At this point we have one of the two following cases:
272
// - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. chain is set to c7 + 1
273
// - cbData < 5 * 16 and we have no blocks of key stream, with chain the next value to use
274
//
275
276
if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // quick exit of function if the request was a multiple of 8 blocks
277
{
278
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
279
{
280
//
281
// We already have the key stream
282
//
283
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
284
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
285
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
286
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
287
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ) ) ) );
288
chain = MM_SUB_EPIXX( chain, chainIncrement3 );
289
290
if( cbData >= 96 )
291
{
292
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
293
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ) ) ) );
294
if( cbData >= 112 )
295
{
296
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
297
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ) ) ) );
298
}
299
}
300
}
301
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
302
{
303
// Produce 4 blocks of key stream
304
305
c0 = chain;
306
c1 = MM_ADD_EPIXX( chain, chainIncrement1 );
307
c2 = MM_ADD_EPIXX( chain, chainIncrement2 );
308
c3 = MM_ADD_EPIXX( c1, chainIncrement2 );
309
chain = c2; // chain is only incremented by 2 for now
310
311
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
312
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
313
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
314
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
315
316
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
317
318
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
319
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
320
if( cbData >= 48 )
321
{
322
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
323
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
324
if( cbData >= 64 )
325
{
326
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
327
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
328
}
329
}
330
}
331
else
332
{
333
// Exactly 1 block to process
334
c0 = chain;
335
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
336
337
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
338
339
AES_ENCRYPT_1( pExpandedKey, c0 );
340
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
341
}
342
}
343
344
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
345
_mm_storeu_si128( (__m128i *) pbChainingValue, chain );
346
}
347
348
#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
349
350