Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/symcrypt/lib/aes-ymm.c
15010 views
1
//
2
// aes-ymm.c code for AES implementation
3
//
4
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
5
//
6
// All YMM code for AES operations
7
// Requires compiler support for aesni, pclmulqdq, avx2, vaes and vpclmulqdq
8
//
9
10
#include "precomp.h"
11
12
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
13
14
#ifdef __clang__
15
#pragma clang attribute push (__attribute__((target("avx2,pclmul,vaes,vpclmulqdq"))), apply_to=function)
16
#else
17
#pragma GCC push_options
18
#pragma GCC target("avx2,pclmul,vaes,vpclmulqdq")
19
#endif
20
21
#include "xtsaes_definitions.h"
22
#include "ghash_definitions.h"
23
24
//
// AES_ENCRYPT_YMM_2048
//
// Encrypts the eight __m256i values c0..c7 in place. Each __m256i carries two
// 128-bit lanes, so one expansion processes 16 AES blocks (8 * 256 = 2048 bits).
//
//  pExpandedKey    pointer to an expanded key exposing RoundKey (first round key)
//                  and lastEncRoundKey (final encryption round key).
//  c0..c7          __m256i lvalues; read as input, overwritten with the result.
//
// Every 128-bit round key is broadcast to both lanes so that one VAES instruction
// applies the same round to two blocks at once.
//
// The expansion is a plain compound statement (no trailing semicolon) and the key
// argument is parenthesized, so any pointer-valued expression may be passed.
// NOTE: the locals keyPtr, keyLimit and roundkeys shadow same-named identifiers of
// the caller; do not pass arguments that are spelled with those names.
//
#define AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m256i roundkeys; \
    \
    keyPtr = (pExpandedKey)->RoundKey; \
    keyLimit = (pExpandedKey)->lastEncRoundKey; \
    \
    /* Initial whitening: xor with the first round key. */ \
    /* _mm256_broadcastsi128_si256 requires AVX2 */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
    \
    /* _mm256_xor_si256 requires AVX2 */ \
    c0 = _mm256_xor_si256( c0, roundkeys ); \
    c1 = _mm256_xor_si256( c1, roundkeys ); \
    c2 = _mm256_xor_si256( c2, roundkeys ); \
    c3 = _mm256_xor_si256( c3, roundkeys ); \
    c4 = _mm256_xor_si256( c4, roundkeys ); \
    c5 = _mm256_xor_si256( c5, roundkeys ); \
    c6 = _mm256_xor_si256( c6, roundkeys ); \
    c7 = _mm256_xor_si256( c7, roundkeys ); \
    \
    /* Middle rounds: runs at least once, stops when keyPtr reaches the last key. */ \
    do \
    { \
        roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
        keyPtr ++; \
        c0 = _mm256_aesenc_epi128( c0, roundkeys ); \
        c1 = _mm256_aesenc_epi128( c1, roundkeys ); \
        c2 = _mm256_aesenc_epi128( c2, roundkeys ); \
        c3 = _mm256_aesenc_epi128( c3, roundkeys ); \
        c4 = _mm256_aesenc_epi128( c4, roundkeys ); \
        c5 = _mm256_aesenc_epi128( c5, roundkeys ); \
        c6 = _mm256_aesenc_epi128( c6, roundkeys ); \
        c7 = _mm256_aesenc_epi128( c7, roundkeys ); \
    } while( keyPtr < keyLimit ); \
    \
    /* Final round uses the key at lastEncRoundKey. */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    \
    c0 = _mm256_aesenclast_epi128( c0, roundkeys ); \
    c1 = _mm256_aesenclast_epi128( c1, roundkeys ); \
    c2 = _mm256_aesenclast_epi128( c2, roundkeys ); \
    c3 = _mm256_aesenclast_epi128( c3, roundkeys ); \
    c4 = _mm256_aesenclast_epi128( c4, roundkeys ); \
    c5 = _mm256_aesenclast_epi128( c5, roundkeys ); \
    c6 = _mm256_aesenclast_epi128( c6, roundkeys ); \
    c7 = _mm256_aesenclast_epi128( c7, roundkeys ); \
}
72
73
//
// AES_DECRYPT_YMM_2048
//
// Decrypts the eight __m256i values c0..c7 in place. Each __m256i carries two
// 128-bit lanes, so one expansion processes 16 AES blocks (8 * 256 = 2048 bits).
//
//  pExpandedKey    pointer to an expanded key exposing lastEncRoundKey (where the
//                  decryption key walk starts) and lastDecRoundKey (final
//                  decryption round key).
//  c0..c7          __m256i lvalues; read as input, overwritten with the result.
//
// The round keys are read in increasing address order from lastEncRoundKey up to
// lastDecRoundKey; each is broadcast to both 128-bit lanes.
//
// The expansion is a plain compound statement (no trailing semicolon) and the key
// argument is parenthesized, so any pointer-valued expression may be passed.
// NOTE: the locals keyPtr, keyLimit and roundkeys shadow same-named identifiers of
// the caller; do not pass arguments that are spelled with those names.
//
#define AES_DECRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m256i roundkeys; \
    \
    keyPtr = (pExpandedKey)->lastEncRoundKey; \
    keyLimit = (pExpandedKey)->lastDecRoundKey; \
    \
    /* Initial whitening: xor with the first key of the decryption walk. */ \
    /* _mm256_broadcastsi128_si256 requires AVX2 */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
    \
    /* _mm256_xor_si256 requires AVX2 */ \
    c0 = _mm256_xor_si256( c0, roundkeys ); \
    c1 = _mm256_xor_si256( c1, roundkeys ); \
    c2 = _mm256_xor_si256( c2, roundkeys ); \
    c3 = _mm256_xor_si256( c3, roundkeys ); \
    c4 = _mm256_xor_si256( c4, roundkeys ); \
    c5 = _mm256_xor_si256( c5, roundkeys ); \
    c6 = _mm256_xor_si256( c6, roundkeys ); \
    c7 = _mm256_xor_si256( c7, roundkeys ); \
    \
    /* Middle rounds: runs at least once, stops when keyPtr reaches the last key. */ \
    do \
    { \
        roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
        keyPtr ++; \
        c0 = _mm256_aesdec_epi128( c0, roundkeys ); \
        c1 = _mm256_aesdec_epi128( c1, roundkeys ); \
        c2 = _mm256_aesdec_epi128( c2, roundkeys ); \
        c3 = _mm256_aesdec_epi128( c3, roundkeys ); \
        c4 = _mm256_aesdec_epi128( c4, roundkeys ); \
        c5 = _mm256_aesdec_epi128( c5, roundkeys ); \
        c6 = _mm256_aesdec_epi128( c6, roundkeys ); \
        c7 = _mm256_aesdec_epi128( c7, roundkeys ); \
    } while( keyPtr < keyLimit ); \
    \
    /* Final round uses the key at lastDecRoundKey. */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    \
    c0 = _mm256_aesdeclast_epi128( c0, roundkeys ); \
    c1 = _mm256_aesdeclast_epi128( c1, roundkeys ); \
    c2 = _mm256_aesdeclast_epi128( c2, roundkeys ); \
    c3 = _mm256_aesdeclast_epi128( c3, roundkeys ); \
    c4 = _mm256_aesdeclast_epi128( c4, roundkeys ); \
    c5 = _mm256_aesdeclast_epi128( c5, roundkeys ); \
    c6 = _mm256_aesdeclast_epi128( c6, roundkeys ); \
    c7 = _mm256_aesdeclast_epi128( c7, roundkeys ); \
}
121
122
VOID
123
SYMCRYPT_CALL
124
SymCryptXtsAesEncryptDataUnitYmm_2048(
125
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
126
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock,
127
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch,
128
_In_reads_( cbData ) PCBYTE pbSrc,
129
_Out_writes_( cbData ) PBYTE pbDst,
130
SIZE_T cbData )
131
{
132
__m128i t0, t1, t2, t3, t4, t5, t6, t7;
133
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
134
__m128i XTS_ALPHA_MASK;
135
__m256i XTS_ALPHA_MULTIPLIER_Ymm;
136
137
// Load tweaks into big T
138
__m256i T0, T1, T2, T3, T4, T5, T6, T7;
139
140
SIZE_T cbDataMain; // number of bytes to handle in the main loop
141
SIZE_T cbDataTail; // number of bytes to handle in the tail loop
142
143
// To simplify logic and unusual size processing, we handle all
144
// data not a multiple of 16 blocks in the tail loop
145
cbDataTail = cbData & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1);
146
// Additionally, so that ciphertext stealing logic does not rely on
147
// reading back from the destination buffer, when we have a non-zero
148
// tail, we ensure that we handle at least 1 whole block in the tail
149
cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (16*SYMCRYPT_AES_BLOCK_SIZE) : 0;
150
cbDataMain = cbData - cbDataTail;
151
152
SYMCRYPT_ASSERT(cbDataMain <= cbData);
153
SYMCRYPT_ASSERT(cbDataTail <= cbData);
154
SYMCRYPT_ASSERT((cbDataMain & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
155
156
if( cbDataMain == 0 )
157
{
158
SymCryptXtsAesEncryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
159
return;
160
}
161
162
t0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
163
XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
164
XTS_ALPHA_MULTIPLIER_Ymm = _mm256_set_epi64x( 0, 0x87, 0, 0x87 );
165
166
// Do not stall.
167
XTS_MUL_ALPHA4( t0, t4 );
168
XTS_MUL_ALPHA ( t0, t1 );
169
XTS_MUL_ALPHA ( t4, t5 );
170
XTS_MUL_ALPHA ( t1, t2 );
171
XTS_MUL_ALPHA ( t5, t6 );
172
XTS_MUL_ALPHA ( t2, t3 );
173
XTS_MUL_ALPHA ( t6, t7 );
174
175
T0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), t1, 1 ); // AVX
176
T1 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), t3, 1 );
177
T2 = _mm256_insertf128_si256( _mm256_castsi128_si256( t4 ), t5, 1 );
178
T3 = _mm256_insertf128_si256( _mm256_castsi128_si256( t6 ), t7, 1 );
179
XTS_MUL_ALPHA8_YMM(T0, T4);
180
XTS_MUL_ALPHA8_YMM(T1, T5);
181
XTS_MUL_ALPHA8_YMM(T2, T6);
182
XTS_MUL_ALPHA8_YMM(T3, T7);
183
184
for(;;)
185
{
186
c0 = _mm256_xor_si256( T0, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 0 ) ) );
187
c1 = _mm256_xor_si256( T1, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 2*SYMCRYPT_AES_BLOCK_SIZE ) ) );
188
c2 = _mm256_xor_si256( T2, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 4*SYMCRYPT_AES_BLOCK_SIZE ) ) );
189
c3 = _mm256_xor_si256( T3, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 6*SYMCRYPT_AES_BLOCK_SIZE ) ) );
190
c4 = _mm256_xor_si256( T4, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 8*SYMCRYPT_AES_BLOCK_SIZE ) ) );
191
c5 = _mm256_xor_si256( T5, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 10*SYMCRYPT_AES_BLOCK_SIZE ) ) );
192
c6 = _mm256_xor_si256( T6, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 12*SYMCRYPT_AES_BLOCK_SIZE ) ) );
193
c7 = _mm256_xor_si256( T7, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 14*SYMCRYPT_AES_BLOCK_SIZE ) ) );
194
195
pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;
196
197
AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
198
199
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 0 ), _mm256_xor_si256( c0, T0 ) );
200
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 2*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c1, T1 ) );
201
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 4*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c2, T2 ) );
202
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 6*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c3, T3 ) );
203
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 8*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c4, T4 ) );
204
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 10*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c5, T5 ) );
205
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 12*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c6, T6 ) );
206
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 14*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c7, T7 ) );
207
208
pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;
209
210
cbDataMain -= 16 * SYMCRYPT_AES_BLOCK_SIZE;
211
if( cbDataMain < 16 * SYMCRYPT_AES_BLOCK_SIZE )
212
{
213
break;
214
}
215
216
XTS_MUL_ALPHA16_YMM(T0, T0);
217
XTS_MUL_ALPHA16_YMM(T1, T1);
218
XTS_MUL_ALPHA16_YMM(T2, T2);
219
XTS_MUL_ALPHA16_YMM(T3, T3);
220
XTS_MUL_ALPHA16_YMM(T4, T4);
221
XTS_MUL_ALPHA16_YMM(T5, T5);
222
XTS_MUL_ALPHA16_YMM(T6, T6);
223
XTS_MUL_ALPHA16_YMM(T7, T7);
224
}
225
226
// We won't do another 16-block set so we don't update the tweak blocks
227
228
if( cbDataTail > 0 )
229
{
230
//
231
// This is a rare case: the data unit length is not a multiple of 256 bytes.
232
// We do this in the Xmm implementation.
233
// Fix up the tweak block first
234
//
235
t7 = _mm256_extracti128_si256 ( T7, 1 /* Highest 128 bits */ ); // AVX2
236
_mm256_zeroupper();
237
XTS_MUL_ALPHA( t7, t0 );
238
_mm_storeu_si128( (__m128i *) pbTweakBlock, t0 );
239
240
SymCryptXtsAesEncryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
241
}
242
else {
243
_mm256_zeroupper();
244
}
245
}
246
247
VOID
248
SYMCRYPT_CALL
249
SymCryptXtsAesDecryptDataUnitYmm_2048(
250
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
251
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock,
252
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch,
253
_In_reads_( cbData ) PCBYTE pbSrc,
254
_Out_writes_( cbData ) PBYTE pbDst,
255
SIZE_T cbData )
256
{
257
__m128i t0, t1, t2, t3, t4, t5, t6, t7;
258
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
259
__m128i XTS_ALPHA_MASK;
260
__m256i XTS_ALPHA_MULTIPLIER_Ymm;
261
262
// Load tweaks into big T
263
__m256i T0, T1, T2, T3, T4, T5, T6, T7;
264
265
SIZE_T cbDataMain; // number of bytes to handle in the main loop
266
SIZE_T cbDataTail; // number of bytes to handle in the tail loop
267
268
// To simplify logic and unusual size processing, we handle all
269
// data not a multiple of 16 blocks in the tail loop
270
cbDataTail = cbData & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1);
271
// Additionally, so that ciphertext stealing logic does not rely on
272
// reading back from the destination buffer, when we have a non-zero
273
// tail, we ensure that we handle at least 1 whole block in the tail
274
cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (16*SYMCRYPT_AES_BLOCK_SIZE) : 0;
275
cbDataMain = cbData - cbDataTail;
276
277
SYMCRYPT_ASSERT(cbDataMain <= cbData);
278
SYMCRYPT_ASSERT(cbDataTail <= cbData);
279
SYMCRYPT_ASSERT((cbDataMain & ((16*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
280
281
if( cbDataMain == 0 )
282
{
283
SymCryptXtsAesDecryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
284
return;
285
}
286
287
t0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
288
XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
289
XTS_ALPHA_MULTIPLIER_Ymm = _mm256_set_epi64x( 0, 0x87, 0, 0x87 );
290
291
// Do not stall.
292
XTS_MUL_ALPHA4( t0, t4 );
293
XTS_MUL_ALPHA ( t0, t1 );
294
XTS_MUL_ALPHA ( t4, t5 );
295
XTS_MUL_ALPHA ( t1, t2 );
296
XTS_MUL_ALPHA ( t5, t6 );
297
XTS_MUL_ALPHA ( t2, t3 );
298
XTS_MUL_ALPHA ( t6, t7 );
299
300
T0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), t1, 1); // AVX
301
T1 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), t3, 1);
302
T2 = _mm256_insertf128_si256( _mm256_castsi128_si256( t4 ), t5, 1);
303
T3 = _mm256_insertf128_si256( _mm256_castsi128_si256( t6 ), t7, 1);
304
XTS_MUL_ALPHA8_YMM(T0, T4);
305
XTS_MUL_ALPHA8_YMM(T1, T5);
306
XTS_MUL_ALPHA8_YMM(T2, T6);
307
XTS_MUL_ALPHA8_YMM(T3, T7);
308
309
for(;;)
310
{
311
c0 = _mm256_xor_si256( T0, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 0 ) ) );
312
c1 = _mm256_xor_si256( T1, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 2*SYMCRYPT_AES_BLOCK_SIZE ) ) );
313
c2 = _mm256_xor_si256( T2, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 4*SYMCRYPT_AES_BLOCK_SIZE ) ) );
314
c3 = _mm256_xor_si256( T3, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 6*SYMCRYPT_AES_BLOCK_SIZE ) ) );
315
c4 = _mm256_xor_si256( T4, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 8*SYMCRYPT_AES_BLOCK_SIZE ) ) );
316
c5 = _mm256_xor_si256( T5, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 10*SYMCRYPT_AES_BLOCK_SIZE ) ) );
317
c6 = _mm256_xor_si256( T6, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 12*SYMCRYPT_AES_BLOCK_SIZE ) ) );
318
c7 = _mm256_xor_si256( T7, _mm256_loadu_si256( ( __m256i * ) ( pbSrc + 14*SYMCRYPT_AES_BLOCK_SIZE ) ) );
319
320
pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;
321
322
AES_DECRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
323
324
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 0 ), _mm256_xor_si256( c0, T0 ) );
325
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 2*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c1, T1 ) );
326
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 4*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c2, T2 ) );
327
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 6*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c3, T3 ) );
328
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 8*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c4, T4 ) );
329
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 10*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c5, T5 ) );
330
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 12*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c6, T6 ) );
331
_mm256_storeu_si256( ( __m256i * ) ( pbDst + 14*SYMCRYPT_AES_BLOCK_SIZE ), _mm256_xor_si256( c7, T7 ) );
332
333
pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;
334
335
cbDataMain -= 16 * SYMCRYPT_AES_BLOCK_SIZE;
336
if( cbDataMain < 16 * SYMCRYPT_AES_BLOCK_SIZE )
337
{
338
break;
339
}
340
341
XTS_MUL_ALPHA16_YMM(T0, T0);
342
XTS_MUL_ALPHA16_YMM(T1, T1);
343
XTS_MUL_ALPHA16_YMM(T2, T2);
344
XTS_MUL_ALPHA16_YMM(T3, T3);
345
XTS_MUL_ALPHA16_YMM(T4, T4);
346
XTS_MUL_ALPHA16_YMM(T5, T5);
347
XTS_MUL_ALPHA16_YMM(T6, T6);
348
XTS_MUL_ALPHA16_YMM(T7, T7);
349
}
350
351
// We won't do another 16-block set so we don't update the tweak blocks
352
353
if( cbDataTail > 0 )
354
{
355
//
356
// This is a rare case: the data unit length is not a multiple of 256 bytes.
357
// We do this in the Xmm implementation.
358
// Fix up the tweak block first
359
//
360
t7 = _mm256_extracti128_si256 ( T7, 1 /* Highest 128 bits */ ); // AVX2
361
_mm256_zeroupper();
362
XTS_MUL_ALPHA( t7, t0 );
363
_mm_storeu_si128( (__m128i *) pbTweakBlock, t0 );
364
365
SymCryptXtsAesDecryptDataUnitXmm( pExpandedKey, pbTweakBlock, pbScratch, pbSrc, pbDst, cbDataTail );
366
}
367
else {
368
_mm256_zeroupper();
369
}
370
}
371
372
//
// AES_FULLROUND_16_GHASH_2_Ymm
//
// One stitched step: applies a single AES encryption round to the 16 blocks held
// in c0..c7, and accumulates the carry-less products for 2 GHASH input blocks.
//
//  roundkeys               __m256i lvalue used as scratch for the broadcast key.
//  keyPtr                  round-key pointer lvalue; advanced by one round key.
//  c0..c7                  __m256i lvalues holding the AES state (updated).
//  r0, t0, t1              __m256i scratch lvalues.
//  gHashPointer            byte pointer lvalue to the GHASH input; 32 bytes are
//                          read and the pointer is advanced by 32.
//  byteReverseOrder        shuffle mask applied to the loaded GHASH input.
//  gHashExpandedKeyTable   table passed to GHASH_H_POWER / GHASH_Hx_POWER.
//  todo                    lvalue index passed to those macros; decremented by 2.
//  resl, resm, resh        __m256i accumulators: products of the low 64-bit halves
//                          go to resl, of the high halves to resh, and the product
//                          of (hi ^ lo) with the Hx entry goes to resm.
//
// GHASH_H_POWER and GHASH_Hx_POWER are defined outside this file; the resm term
// looks like the middle term of a Karatsuba-style multiplication - TODO confirm
// against ghash_definitions.h.
//
// Loads use const-qualified casts so that pointers to const data can be passed
// without discarding the qualifier.
//
#define AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    /* AES: one full round on all 16 blocks */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) (keyPtr) ) ); \
    keyPtr ++; \
    c0 = _mm256_aesenc_epi128( c0, roundkeys ); \
    c1 = _mm256_aesenc_epi128( c1, roundkeys ); \
    c2 = _mm256_aesenc_epi128( c2, roundkeys ); \
    c3 = _mm256_aesenc_epi128( c3, roundkeys ); \
    c4 = _mm256_aesenc_epi128( c4, roundkeys ); \
    c5 = _mm256_aesenc_epi128( c5, roundkeys ); \
    c6 = _mm256_aesenc_epi128( c6, roundkeys ); \
    c7 = _mm256_aesenc_epi128( c7, roundkeys ); \
    \
    /* GHASH: fetch two input blocks and byte-reverse each 128-bit lane */ \
    r0 = _mm256_loadu_si256( (const __m256i *) (gHashPointer) ); \
    r0 = _mm256_shuffle_epi8( r0, byteReverseOrder ); \
    gHashPointer += 32; \
    \
    /* low * low -> resl, high * high -> resh */ \
    t1 = _mm256_loadu_si256( (const __m256i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \
    t0 = _mm256_clmulepi64_epi128( r0, t1, 0x00 ); \
    t1 = _mm256_clmulepi64_epi128( r0, t1, 0x11 ); \
    \
    resl = _mm256_xor_si256( resl, t0 ); \
    resh = _mm256_xor_si256( resh, t1 ); \
    \
    /* (high ^ low) * Hx -> resm */ \
    t0 = _mm256_srli_si256( r0, 8 ); \
    r0 = _mm256_xor_si256( r0, t0 ); \
    t1 = _mm256_loadu_si256( (const __m256i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \
    t1 = _mm256_clmulepi64_epi128( r0, t1, 0x00 ); \
    \
    resm = _mm256_xor_si256( resm, t1 ); \
    todo -= 2; \
}
404
405
//
// AES_GCM_ENCRYPT_16_Ymm
//
// Encrypts the 16 blocks held in c0..c7 in place while hashing 16 GHASH input
// blocks: the first 8 AES rounds are each stitched with 2 GHASH blocks through
// AES_FULLROUND_16_GHASH_2_Ymm, the remaining rounds are plain AES.
//
//  pExpandedKey            pointer to an expanded key exposing RoundKey and
//                          lastEncRoundKey.
//  c0..c7                  __m256i lvalues; input blocks, overwritten with the
//                          AES output.
//  gHashPointer            byte pointer lvalue to the GHASH input; advanced by
//                          8 * 32 = 256 bytes in total.
//  byteReverseOrder        shuffle mask applied to each loaded GHASH input.
//  gHashExpandedKeyTable   table passed on to the GHASH_H*_POWER macros.
//  todo                    lvalue; decremented by 8 * 2 = 16 in total.
//  resl, resm, resh        __m256i GHASH accumulators (updated).
//
// The trailing do/while always executes at least one round, so the key schedule
// must contain at least 9 rounds before the final one; the original comment lists
// AES-128, AES-192 and AES-256 as the supported variants.
//
// The expansion is a plain compound statement (no trailing semicolon) and the key
// argument is parenthesized. NOTE: the locals keyPtr, keyLimit, roundkeys, t0, t1,
// r0 and aesEncryptGhashLoop shadow same-named identifiers of the caller; do not
// pass arguments that are spelled with those names.
//
#define AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m256i roundkeys; \
    __m256i t0, t1; \
    __m256i r0; \
    int aesEncryptGhashLoop; \
    \
    keyPtr = (pExpandedKey)->RoundKey; \
    keyLimit = (pExpandedKey)->lastEncRoundKey; \
    \
    /* Initial whitening: xor with the first round key. */ \
    /* _mm256_broadcastsi128_si256 requires AVX2 */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    keyPtr ++; \
    \
    /* _mm256_xor_si256 requires AVX2 */ \
    c0 = _mm256_xor_si256( c0, roundkeys ); \
    c1 = _mm256_xor_si256( c1, roundkeys ); \
    c2 = _mm256_xor_si256( c2, roundkeys ); \
    c3 = _mm256_xor_si256( c3, roundkeys ); \
    c4 = _mm256_xor_si256( c4, roundkeys ); \
    c5 = _mm256_xor_si256( c5, roundkeys ); \
    c6 = _mm256_xor_si256( c6, roundkeys ); \
    c7 = _mm256_xor_si256( c7, roundkeys ); \
    \
    /* Do 8(x2) full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < 4; aesEncryptGhashLoop++ ) \
    { \
        AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
        AES_FULLROUND_16_GHASH_2_Ymm( roundkeys, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
    } \
    \
    /* Remaining middle rounds, AES only. */ \
    do \
    { \
        roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
        keyPtr ++; \
        c0 = _mm256_aesenc_epi128( c0, roundkeys ); \
        c1 = _mm256_aesenc_epi128( c1, roundkeys ); \
        c2 = _mm256_aesenc_epi128( c2, roundkeys ); \
        c3 = _mm256_aesenc_epi128( c3, roundkeys ); \
        c4 = _mm256_aesenc_epi128( c4, roundkeys ); \
        c5 = _mm256_aesenc_epi128( c5, roundkeys ); \
        c6 = _mm256_aesenc_epi128( c6, roundkeys ); \
        c7 = _mm256_aesenc_epi128( c7, roundkeys ); \
    } while( keyPtr < keyLimit ); \
    \
    /* Final round uses the key at lastEncRoundKey. */ \
    roundkeys = _mm256_broadcastsi128_si256( *( (const __m128i *) keyPtr ) ); \
    \
    c0 = _mm256_aesenclast_epi128( c0, roundkeys ); \
    c1 = _mm256_aesenclast_epi128( c1, roundkeys ); \
    c2 = _mm256_aesenclast_epi128( c2, roundkeys ); \
    c3 = _mm256_aesenclast_epi128( c3, roundkeys ); \
    c4 = _mm256_aesenclast_epi128( c4, roundkeys ); \
    c5 = _mm256_aesenclast_epi128( c5, roundkeys ); \
    c6 = _mm256_aesenclast_epi128( c6, roundkeys ); \
    c7 = _mm256_aesenclast_epi128( c7, roundkeys ); \
}
463
464
VOID
465
SYMCRYPT_CALL
466
SymCryptAesGcmEncryptStitchedYmm_2048(
467
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
468
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
469
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
470
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
471
_In_reads_( cbData ) PCBYTE pbSrc,
472
_Out_writes_( cbData ) PBYTE pbDst,
473
SIZE_T cbData )
474
{
475
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
476
477
__m128i BYTE_REVERSE_ORDER_xmm = _mm_set_epi8(
478
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
479
__m256i BYTE_REVERSE_ORDER = _mm256_set_epi64x( 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f );
480
__m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
481
482
__m256i chainIncrementUpper1 = _mm256_set_epi64x( 0, 1, 0, 0 );
483
__m256i chainIncrement2 = _mm256_set_epi64x( 0, 2, 0, 2 );
484
__m256i chainIncrement4 = _mm256_set_epi64x( 0, 4, 0, 4 );
485
__m256i chainIncrement16 = _mm256_set_epi64x( 0, 16, 0, 16 );
486
487
__m256i ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
488
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
489
__m256i r0, r1, r2, r3, r4, r5, r6, r7;
490
__m256i Hi, Hix;
491
492
__m128i state;
493
__m128i a0_xmm, a1_xmm, a2_xmm;
494
__m256i a0, a1, a2;
495
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
496
SIZE_T todo;
497
PCBYTE pbGhashSrc = pbDst;
498
499
SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
500
SYMCRYPT_ASSERT( nBlocks >= GCM_YMM_MINBLOCKS );
501
502
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
503
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );
504
505
state = _mm_loadu_si128( (__m128i *) pState );
506
ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX
507
ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 );
508
ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 );
509
ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 );
510
ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 );
511
ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 );
512
ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 );
513
ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 );
514
ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 );
515
516
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
517
a0 = a1 = a2 = _mm256_setzero_si256();
518
519
c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER );
520
c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER );
521
c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER );
522
c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER );
523
c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER );
524
c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER );
525
c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
526
c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );
527
528
ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
529
ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
530
ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
531
ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
532
ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
533
ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
534
ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
535
ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );
536
537
AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
538
539
_mm256_storeu_si256( (__m256i *) (pbDst + 0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 0) ) ) );
540
_mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) );
541
_mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) );
542
_mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) );
543
_mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) );
544
_mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) );
545
_mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) );
546
_mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) );
547
548
pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;
549
pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;
550
551
while( nBlocks >= 2*GCM_YMM_MINBLOCKS )
552
{
553
c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER );
554
c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER );
555
c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER );
556
c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER );
557
c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER );
558
c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER );
559
c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
560
c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );
561
562
ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
563
ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
564
ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
565
ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
566
ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
567
ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
568
ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
569
ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );
570
571
AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
572
573
_mm256_storeu_si256( (__m256i *) (pbDst + 0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 0) ) ) );
574
_mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) );
575
_mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) );
576
_mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) );
577
_mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) );
578
_mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) );
579
_mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) );
580
_mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) );
581
582
pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;
583
pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;
584
nBlocks -= 16;
585
586
if ( todo == 0 )
587
{
588
a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ ));
589
a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ ));
590
a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ ));
591
592
a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ ));
593
a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ ));
594
a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ ));
595
CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm );
596
MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state );
597
598
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
599
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
600
a0 = a1 = a2 = _mm256_setzero_si256();
601
}
602
}
603
604
r0 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 0) ), BYTE_REVERSE_ORDER );
605
r1 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 32) ), BYTE_REVERSE_ORDER );
606
r2 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 64) ), BYTE_REVERSE_ORDER );
607
r3 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc + 96) ), BYTE_REVERSE_ORDER );
608
r4 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +128) ), BYTE_REVERSE_ORDER );
609
r5 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +160) ), BYTE_REVERSE_ORDER );
610
r6 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +192) ), BYTE_REVERSE_ORDER );
611
r7 = _mm256_shuffle_epi8( _mm256_loadu_si256( (__m256i *) (pbGhashSrc +224) ), BYTE_REVERSE_ORDER );
612
613
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 0) );
614
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 0) );
615
CLMUL_ACC_3_Ymm( r0, Hi, Hix, a0, a1, a2 );
616
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 2) );
617
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 2) );
618
CLMUL_ACC_3_Ymm( r1, Hi, Hix, a0, a1, a2 );
619
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 4) );
620
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 4) );
621
CLMUL_ACC_3_Ymm( r2, Hi, Hix, a0, a1, a2 );
622
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 6) );
623
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 6) );
624
CLMUL_ACC_3_Ymm( r3, Hi, Hix, a0, a1, a2 );
625
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo - 8) );
626
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo - 8) );
627
CLMUL_ACC_3_Ymm( r4, Hi, Hix, a0, a1, a2 );
628
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo -10) );
629
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -10) );
630
CLMUL_ACC_3_Ymm( r5, Hi, Hix, a0, a1, a2 );
631
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo -12) );
632
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -12) );
633
CLMUL_ACC_3_Ymm( r6, Hi, Hix, a0, a1, a2 );
634
Hi = _mm256_loadu_si256( (__m256i *) &GHASH_H_POWER(expandedKeyTable, todo -14) );
635
Hix = _mm256_loadu_si256( (__m256i *) &GHASH_Hx_POWER(expandedKeyTable, todo -14) );
636
CLMUL_ACC_3_Ymm( r7, Hi, Hix, a0, a1, a2 );
637
638
a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ ));
639
a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ ));
640
a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ ));
641
642
a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ ));
643
a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ ));
644
a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ ));
645
CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm );
646
MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state );
647
648
chain = _mm256_extracti128_si256 ( ctr0, 0 /* Lowest 128 bits */ );
649
_mm256_zeroupper();
650
651
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );
652
_mm_storeu_si128((__m128i *) pbChainingValue, chain );
653
_mm_storeu_si128((__m128i *) pState, state );
654
655
cbData &= ( GCM_YMM_MINBLOCKS*SYMCRYPT_AES_BLOCK_SIZE ) - 1;
656
SYMCRYPT_ASSERT( cbData == (nBlocks-16)*SYMCRYPT_AES_BLOCK_SIZE );
657
if ( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
658
{
659
SymCryptAesGcmEncryptStitchedXmm( pExpandedKey, pbChainingValue, expandedKeyTable, pState, pbSrc, pbDst, cbData);
660
}
661
}
662
663
VOID
664
SYMCRYPT_CALL
665
SymCryptAesGcmDecryptStitchedYmm_2048(
666
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
667
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
668
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
669
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
670
_In_reads_( cbData ) PCBYTE pbSrc,
671
_Out_writes_( cbData ) PBYTE pbDst,
672
SIZE_T cbData )
673
{
674
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
675
676
__m128i BYTE_REVERSE_ORDER_xmm = _mm_set_epi8(
677
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
678
__m256i BYTE_REVERSE_ORDER = _mm256_set_epi64x( 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f );
679
__m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
680
681
__m256i chainIncrementUpper1 = _mm256_set_epi64x( 0, 1, 0, 0 );
682
__m256i chainIncrement2 = _mm256_set_epi64x( 0, 2, 0, 2 );
683
__m256i chainIncrement4 = _mm256_set_epi64x( 0, 4, 0, 4 );
684
__m256i chainIncrement16 = _mm256_set_epi64x( 0, 16, 0, 16 );
685
686
__m256i ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
687
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
688
689
__m128i state;
690
__m128i a0_xmm, a1_xmm, a2_xmm;
691
__m256i a0, a1, a2;
692
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
693
SIZE_T todo;
694
PCBYTE pbGhashSrc = pbSrc;
695
696
SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
697
SYMCRYPT_ASSERT( nBlocks >= GCM_YMM_MINBLOCKS );
698
699
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
700
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );
701
702
state = _mm_loadu_si128( (__m128i *) pState );
703
ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX
704
ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 );
705
ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 );
706
ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 );
707
ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 );
708
ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 );
709
ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 );
710
ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 );
711
ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 );
712
713
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
714
a0 = a1 = a2 = _mm256_setzero_si256();
715
716
while( nBlocks >= GCM_YMM_MINBLOCKS )
717
{
718
c0 = _mm256_shuffle_epi8( ctr0, BYTE_REVERSE_ORDER );
719
c1 = _mm256_shuffle_epi8( ctr1, BYTE_REVERSE_ORDER );
720
c2 = _mm256_shuffle_epi8( ctr2, BYTE_REVERSE_ORDER );
721
c3 = _mm256_shuffle_epi8( ctr3, BYTE_REVERSE_ORDER );
722
c4 = _mm256_shuffle_epi8( ctr4, BYTE_REVERSE_ORDER );
723
c5 = _mm256_shuffle_epi8( ctr5, BYTE_REVERSE_ORDER );
724
c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
725
c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );
726
727
ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
728
ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
729
ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
730
ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
731
ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
732
ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
733
ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
734
ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );
735
736
AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
737
738
_mm256_storeu_si256( (__m256i *) (pbDst + 0), _mm256_xor_si256( c0, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 0) ) ) );
739
_mm256_storeu_si256( (__m256i *) (pbDst + 32), _mm256_xor_si256( c1, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 32) ) ) );
740
_mm256_storeu_si256( (__m256i *) (pbDst + 64), _mm256_xor_si256( c2, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 64) ) ) );
741
_mm256_storeu_si256( (__m256i *) (pbDst + 96), _mm256_xor_si256( c3, _mm256_loadu_si256( ( __m256i * ) (pbSrc + 96) ) ) );
742
_mm256_storeu_si256( (__m256i *) (pbDst +128), _mm256_xor_si256( c4, _mm256_loadu_si256( ( __m256i * ) (pbSrc +128) ) ) );
743
_mm256_storeu_si256( (__m256i *) (pbDst +160), _mm256_xor_si256( c5, _mm256_loadu_si256( ( __m256i * ) (pbSrc +160) ) ) );
744
_mm256_storeu_si256( (__m256i *) (pbDst +192), _mm256_xor_si256( c6, _mm256_loadu_si256( ( __m256i * ) (pbSrc +192) ) ) );
745
_mm256_storeu_si256( (__m256i *) (pbDst +224), _mm256_xor_si256( c7, _mm256_loadu_si256( ( __m256i * ) (pbSrc +224) ) ) );
746
747
pbDst += 16 * SYMCRYPT_AES_BLOCK_SIZE;
748
pbSrc += 16 * SYMCRYPT_AES_BLOCK_SIZE;
749
nBlocks -= 16;
750
751
if ( todo == 0 )
752
{
753
a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 0 /* Lowest 128 bits */ ));
754
a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 0 /* Lowest 128 bits */ ));
755
a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 0 /* Lowest 128 bits */ ));
756
757
a0_xmm = _mm_xor_si128( a0_xmm, _mm256_extracti128_si256 ( a0, 1 /* Highest 128 bits */ ));
758
a1_xmm = _mm_xor_si128( a1_xmm, _mm256_extracti128_si256 ( a1, 1 /* Highest 128 bits */ ));
759
a2_xmm = _mm_xor_si128( a2_xmm, _mm256_extracti128_si256 ( a2, 1 /* Highest 128 bits */ ));
760
CLMUL_3_POST( a0_xmm, a1_xmm, a2_xmm );
761
MODREDUCE( vMultiplicationConstant, a0_xmm, a1_xmm, a2_xmm, state );
762
763
if ( nBlocks > 0 )
764
{
765
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS ) & ~(GCM_YMM_MINBLOCKS-1);
766
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
767
a0 = a1 = a2 = _mm256_setzero_si256();
768
}
769
}
770
}
771
772
chain = _mm256_extracti128_si256 ( ctr0, 0 /* Lowest 128 bits */ );
773
_mm256_zeroupper();
774
775
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER_xmm );
776
_mm_storeu_si128((__m128i *) pbChainingValue, chain );
777
_mm_storeu_si128((__m128i *) pState, state );
778
779
cbData &= ( GCM_YMM_MINBLOCKS*SYMCRYPT_AES_BLOCK_SIZE ) - 1;
780
SYMCRYPT_ASSERT( cbData == nBlocks*SYMCRYPT_AES_BLOCK_SIZE );
781
if ( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
782
{
783
SymCryptAesGcmDecryptStitchedXmm( pExpandedKey, pbChainingValue, expandedKeyTable, pState, pbSrc, pbDst, cbData);
784
}
785
}
786
787
// Undo the per-function target options (avx2, pclmul, vaes, vpclmulqdq) that were pushed at
// the top of this file, using the compiler-specific counterpart of that push.
#ifdef __clang__
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif

#endif // CPU_X86 | CPU_AMD64