Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/symcrypt/lib/aes-xmm.c
15010 views
//
// aes-xmm.c   code for AES implementation
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// All XMM code for AES operations
// Requires compiler support for ssse3, aesni and pclmulqdq
//
9
10
#include "precomp.h"
11
12
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
13
14
#include "xtsaes_definitions.h"
15
#include "ghash_definitions.h"
16
17
#ifdef __clang__
18
#pragma clang attribute push (__attribute__((target("ssse3,aes,pclmul"))), apply_to=function)
19
#else
20
#pragma GCC push_options
21
#pragma GCC target("ssse3,aes,pclmul")
22
#endif
23
24
VOID
25
SYMCRYPT_CALL
26
SymCryptAes4SboxXmm( _In_reads_(4) PCBYTE pIn, _Out_writes_(4) PBYTE pOut )
27
{
28
__m128i x;
29
x = _mm_set1_epi32( *(int *) pIn );
30
31
x = _mm_aeskeygenassist_si128( x, 0 );
32
33
// Could use _mm_storeu_si32( pOut, x ) but it is missing from some headers and _mm_store_ss will be as fast
34
_mm_store_ss( (float *) pOut, _mm_castsi128_ps(x) );
35
}
36
37
VOID
38
SYMCRYPT_CALL
39
SymCryptAesCreateDecryptionRoundKeyXmm(
40
_In_reads_(16) PCBYTE pEncryptionRoundKey,
41
_Out_writes_(16) PBYTE pDecryptionRoundKey )
42
{
43
//
44
// On x86 our key structure is only 4-aligned (the best we can do) so we use unaligned load/stores.
45
// On Amd64 our round keys are aligned, but recent CPUs have fast unaligned load/store if the address is
46
// actually aligned properly.
47
//
48
_mm_storeu_si128( (__m128i *) pDecryptionRoundKey, _mm_aesimc_si128( _mm_loadu_si128( (__m128i *)pEncryptionRoundKey ) ) );
49
}
50
51
//
// The latency of the AES instructions increased to 8 cycles in Ivy Bridge,
// and went back to 7 in Haswell.
// We use 8-parallel code to expose the maximum parallelism to the CPU.
// On x86 it will introduce some register spilling, but the load/stores
// should be able to hide behind the AES instruction latencies.
// Silvermont x86 CPUs have AES-NI with latency = 8 and throughput = 5, so there
// the CPU parallelism is low.
// For things like BitLocker that is fine, but other uses, such as GCM & AES_CTR_DRBG,
// use odd sizes.
// We try to do 5-8 blocks in 8-parallel code, 2-4 blocks in 4-parallel code, and
// 1 block in 1-parallel code.
// This is a compromise; the big cores can do 8 parallel in about the time of a 4-parallel,
// but Silvermont cannot and would pay a big price on small requests if we only use 8-parallel.
// Doing only 8-parallel and then 1-parallel would penalize the big cores a lot.
//
// We used to have 7-parallel code, but common request sizes are not multiples of 7
// blocks so we ended up doing a lot of extra work. This is especially expensive on
// Silvermont where the extra work isn't hidden in the latencies.
//
71
72
//
// AES_ENCRYPT_1( pExpandedKey, c0 )
//
// Encrypts the single block held in the __m128i lvalue c0, in place.
// Round keys are read from pExpandedKey->RoundKey[0] up to and including
// pExpandedKey->lastEncRoundKey.
//
// Shape of the unrolling: one XOR with round key 0, one AESENC, then a loop doing
// two AESENC per iteration until keyPtr reaches lastEncRoundKey, then AESENCLAST.
// The loop therefore assumes the number of round keys between RoundKey[2] and
// lastEncRoundKey is even and non-zero.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names. c0 is evaluated several times.
//
#define AES_ENCRYPT_1( pExpandedKey, c0 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = &(pExpandedKey)->RoundKey[0]; \
    keyLimit = (pExpandedKey)->lastEncRoundKey; \
    \
    /* Initial whitening with round key 0 */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    c0 = _mm_xor_si128( c0, roundkey ); \
    \
    /* One round outside the loop so the loop can do two rounds per iteration */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesenc_si128( c0, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* keyPtr now points at the last encryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    \
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
}
104
105
106
//
// AES_ENCRYPT_1_CHAIN( pExpandedKey, cipherState, mergedLastRoundKey )
//
// Perform AES encryption without the first round key and with a specified last round key.
//
// For algorithms where performance is dominated by a chain of dependent AES rounds
// (i.e. CBC encryption, CCM, CMAC) we can gain a reasonable performance uplift by computing
// (last round key ^ next plaintext block ^ first round key) off the critical path and using
// this computed value in place of the last round key in the AESENCLAST instruction.
//
// cipherState must be an __m128i lvalue that already has round key 0 XORed in; it is
// updated in place. Round keys are read from RoundKey[1] up to (not including)
// lastEncRoundKey; mergedLastRoundKey is evaluated exactly once, in the final AESENCLAST.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names.
//
#define AES_ENCRYPT_1_CHAIN( pExpandedKey, cipherState, mergedLastRoundKey ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = &(pExpandedKey)->RoundKey[1]; \
    keyLimit = (pExpandedKey)->lastEncRoundKey; \
    \
    /* One round outside the loop so the loop can do two rounds per iteration */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    cipherState = _mm_aesenc_si128( cipherState, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        cipherState = _mm_aesenc_si128( cipherState, roundkey ); \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        cipherState = _mm_aesenc_si128( cipherState, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    cipherState = _mm_aesenclast_si128( cipherState, (mergedLastRoundKey) ); \
}
137
138
//
// AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 )
//
// Encrypts the four blocks held in the __m128i lvalues c0..c3, in place, with
// the rounds of all four blocks interleaved. Round keys are read from
// pExpandedKey->RoundKey[0] up to and including pExpandedKey->lastEncRoundKey,
// one round key per loop iteration.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names. Each cN is evaluated several times.
//
#define AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = &(pExpandedKey)->RoundKey[0]; \
    keyLimit = (pExpandedKey)->lastEncRoundKey; \
    \
    /* Initial whitening with round key 0 */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* keyPtr now points at the last encryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    \
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
}
172
173
//
// AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 )
//
// Encrypts the eight blocks held in the __m128i lvalues c0..c7, in place, with
// the rounds of all eight blocks interleaved. Round keys are read from
// pExpandedKey->RoundKey[0] up to and including pExpandedKey->lastEncRoundKey,
// one round key per loop iteration.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names. Each cN is evaluated several times.
//
#define AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = &(pExpandedKey)->RoundKey[0]; \
    keyLimit = (pExpandedKey)->lastEncRoundKey; \
    \
    /* Initial whitening with round key 0 */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    c4 = _mm_xor_si128( c4, roundkey ); \
    c5 = _mm_xor_si128( c5, roundkey ); \
    c6 = _mm_xor_si128( c6, roundkey ); \
    c7 = _mm_xor_si128( c7, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
        c4 = _mm_aesenc_si128( c4, roundkey ); \
        c5 = _mm_aesenc_si128( c5, roundkey ); \
        c6 = _mm_aesenc_si128( c6, roundkey ); \
        c7 = _mm_aesenc_si128( c7, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* keyPtr now points at the last encryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    \
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
    c4 = _mm_aesenclast_si128( c4, roundkey ); \
    c5 = _mm_aesenclast_si128( c5, roundkey ); \
    c6 = _mm_aesenclast_si128( c6, roundkey ); \
    c7 = _mm_aesenclast_si128( c7, roundkey ); \
}
219
220
//
// AES_DECRYPT_1( pExpandedKey, c0 )
//
// Decrypts the single block held in the __m128i lvalue c0, in place.
// Round keys are read forward in memory, starting at pExpandedKey->lastEncRoundKey
// and ending at (and including) pExpandedKey->lastDecRoundKey.
//
// Shape of the unrolling: one XOR with the first key, one AESDEC, then a loop doing
// two AESDEC per iteration until keyPtr reaches lastDecRoundKey, then AESDECLAST.
// The loop therefore assumes the number of round keys between the second key after
// lastEncRoundKey and lastDecRoundKey is even and non-zero.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names. c0 is evaluated several times.
//
#define AES_DECRYPT_1( pExpandedKey, c0 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = (pExpandedKey)->lastEncRoundKey; \
    keyLimit = (pExpandedKey)->lastDecRoundKey; \
    \
    /* Initial whitening with the last encryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    c0 = _mm_xor_si128( c0, roundkey ); \
    \
    /* One round outside the loop so the loop can do two rounds per iteration */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesdec_si128( c0, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* keyPtr now points at the last decryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    \
    c0 = _mm_aesdeclast_si128( c0, roundkey ); \
}
252
253
//
// AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 )
//
// Decrypts the four blocks held in the __m128i lvalues c0..c3, in place, with
// the rounds of all four blocks interleaved. Round keys are read forward in memory,
// starting at pExpandedKey->lastEncRoundKey and ending at (and including)
// pExpandedKey->lastDecRoundKey, one round key per loop iteration.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names. Each cN is evaluated several times.
//
#define AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = (pExpandedKey)->lastEncRoundKey; \
    keyLimit = (pExpandedKey)->lastDecRoundKey; \
    \
    /* Initial whitening with the last encryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
        c1 = _mm_aesdec_si128( c1, roundkey ); \
        c2 = _mm_aesdec_si128( c2, roundkey ); \
        c3 = _mm_aesdec_si128( c3, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* keyPtr now points at the last decryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    \
    c0 = _mm_aesdeclast_si128( c0, roundkey ); \
    c1 = _mm_aesdeclast_si128( c1, roundkey ); \
    c2 = _mm_aesdeclast_si128( c2, roundkey ); \
    c3 = _mm_aesdeclast_si128( c3, roundkey ); \
}
287
288
//
// AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 )
//
// Decrypts the eight blocks held in the __m128i lvalues c0..c7, in place, with
// the rounds of all eight blocks interleaved. Round keys are read forward in memory,
// starting at pExpandedKey->lastEncRoundKey and ending at (and including)
// pExpandedKey->lastDecRoundKey, one round key per loop iteration.
//
// The macro expands to a braced block declaring keyPtr, keyLimit and roundkey;
// arguments must not use those names. Each cN is evaluated several times.
//
#define AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    \
    keyPtr = (pExpandedKey)->lastEncRoundKey; \
    keyLimit = (pExpandedKey)->lastDecRoundKey; \
    \
    /* Initial whitening with the last encryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    keyPtr ++; \
    \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    c4 = _mm_xor_si128( c4, roundkey ); \
    c5 = _mm_xor_si128( c5, roundkey ); \
    c6 = _mm_xor_si128( c6, roundkey ); \
    c7 = _mm_xor_si128( c7, roundkey ); \
    \
    do \
    { \
        roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesdec_si128( c0, roundkey ); \
        c1 = _mm_aesdec_si128( c1, roundkey ); \
        c2 = _mm_aesdec_si128( c2, roundkey ); \
        c3 = _mm_aesdec_si128( c3, roundkey ); \
        c4 = _mm_aesdec_si128( c4, roundkey ); \
        c5 = _mm_aesdec_si128( c5, roundkey ); \
        c6 = _mm_aesdec_si128( c6, roundkey ); \
        c7 = _mm_aesdec_si128( c7, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* keyPtr now points at the last decryption round key */ \
    roundkey = _mm_loadu_si128( (const __m128i *) keyPtr ); \
    \
    c0 = _mm_aesdeclast_si128( c0, roundkey ); \
    c1 = _mm_aesdeclast_si128( c1, roundkey ); \
    c2 = _mm_aesdeclast_si128( c2, roundkey ); \
    c3 = _mm_aesdeclast_si128( c3, roundkey ); \
    c4 = _mm_aesdeclast_si128( c4, roundkey ); \
    c5 = _mm_aesdeclast_si128( c5, roundkey ); \
    c6 = _mm_aesdeclast_si128( c6, roundkey ); \
    c7 = _mm_aesdeclast_si128( c7, roundkey ); \
}
334
335
336
//
337
// The EncryptXmm code is tested through the CFB mode encryption which has no further optimizations.
338
//
339
VOID
340
SYMCRYPT_CALL
341
SymCryptAesEncryptXmm(
342
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
343
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PCBYTE pbSrc,
344
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbDst )
345
{
346
__m128i c;
347
348
c = _mm_loadu_si128( ( __m128i * ) pbSrc);
349
350
AES_ENCRYPT_1( pExpandedKey, c );
351
352
_mm_storeu_si128( (__m128i *) pbDst, c );
353
}
354
355
//
356
// The DecryptXmm code is tested through the EcbDecrypt calls which has no further optimizations.
357
//
358
VOID
359
SYMCRYPT_CALL
360
SymCryptAesDecryptXmm(
361
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
362
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PCBYTE pbSrc,
363
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbDst )
364
{
365
__m128i c;
366
367
c = _mm_loadu_si128( ( __m128i * ) pbSrc);
368
369
AES_DECRYPT_1( pExpandedKey, c );
370
371
_mm_storeu_si128( (__m128i *) pbDst, c );
372
}
373
374
// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
375
#pragma warning(push)
376
#pragma warning( disable: 6001 4701 )
377
#pragma runtime_checks( "u", off )
378
VOID
379
SYMCRYPT_CALL
380
SymCryptAesEcbEncryptXmm(
381
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
382
_In_reads_( cbData ) PCBYTE pbSrc,
383
_Out_writes_( cbData ) PBYTE pbDst,
384
SIZE_T cbData )
385
{
386
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
387
388
while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
389
{
390
c0 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ));
391
c1 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ));
392
c2 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ));
393
c3 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ));
394
c4 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ));
395
c5 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ));
396
c6 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ));
397
c7 = _mm_loadu_si128( ( __m128i * ) (pbSrc +112 ));
398
399
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
400
401
_mm_storeu_si128( (__m128i *) (pbDst + 0 ), c0 );
402
_mm_storeu_si128( (__m128i *) (pbDst + 16 ), c1 );
403
_mm_storeu_si128( (__m128i *) (pbDst + 32 ), c2 );
404
_mm_storeu_si128( (__m128i *) (pbDst + 48 ), c3 );
405
_mm_storeu_si128( (__m128i *) (pbDst + 64 ), c4 );
406
_mm_storeu_si128( (__m128i *) (pbDst + 80 ), c5 );
407
_mm_storeu_si128( (__m128i *) (pbDst + 96 ), c6 );
408
_mm_storeu_si128( (__m128i *) (pbDst +112 ), c7 );
409
410
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
411
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
412
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
413
}
414
415
if( cbData < 16 )
416
{
417
return;
418
}
419
420
c0 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ));
421
if( cbData >= 32 )
422
{
423
c1 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ));
424
if( cbData >= 48 )
425
{
426
c2 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ));
427
if( cbData >= 64 )
428
{
429
c3 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ));
430
if( cbData >= 80 )
431
{
432
c4 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ));
433
if( cbData >= 96 )
434
{
435
c5 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ));
436
if( cbData >= 112 )
437
{
438
c6 = _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ));
439
}
440
}
441
}
442
}
443
}
444
}
445
446
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
447
{
448
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
449
}
450
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
451
{
452
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
453
}
454
else
455
{
456
AES_ENCRYPT_1( pExpandedKey, c0 );
457
}
458
459
_mm_storeu_si128( (__m128i *) (pbDst + 0 ), c0 );
460
if( cbData >= 32 )
461
{
462
_mm_storeu_si128( (__m128i *) (pbDst + 16 ), c1 );
463
if( cbData >= 48 )
464
{
465
_mm_storeu_si128( (__m128i *) (pbDst + 32 ), c2 );
466
if( cbData >= 64 )
467
{
468
_mm_storeu_si128( (__m128i *) (pbDst + 48 ), c3 );
469
if( cbData >= 80 )
470
{
471
_mm_storeu_si128( (__m128i *) (pbDst + 64 ), c4 );
472
if( cbData >= 96 )
473
{
474
_mm_storeu_si128( (__m128i *) (pbDst + 80 ), c5 );
475
if( cbData >= 112 )
476
{
477
_mm_storeu_si128( (__m128i *) (pbDst + 96 ), c6 );
478
}
479
}
480
}
481
}
482
}
483
}
484
}
485
#pragma runtime_checks( "u", restore )
486
#pragma warning( pop )
487
488
489
490
VOID
491
SYMCRYPT_CALL
492
SymCryptAesCbcEncryptXmm(
493
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
494
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
495
_In_reads_( cbData ) PCBYTE pbSrc,
496
_Out_writes_( cbData ) PBYTE pbDst,
497
SIZE_T cbData )
498
{
499
__m128i c = _mm_loadu_si128( (__m128i *) pbChainingValue );
500
__m128i rk0 = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] );
501
__m128i rkLast = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
502
__m128i d;
503
504
if (cbData < SYMCRYPT_AES_BLOCK_SIZE)
505
return;
506
507
// This algorithm is dominated by chain of dependent AES rounds, so we want to avoid XOR
508
// instructions on the critical path where possible
509
// We can compute (last round key ^ next plaintext block ^ first round key) off the critical
510
// path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
511
// the main loop
512
d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), rk0 );
513
c = _mm_xor_si128( c, d );
514
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
515
cbData -= SYMCRYPT_AES_BLOCK_SIZE;
516
517
while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
518
{
519
d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), rk0 );
520
AES_ENCRYPT_1_CHAIN( pExpandedKey, c, _mm_xor_si128(d, rkLast ) );
521
_mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128(c, d) );
522
523
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
524
pbDst += SYMCRYPT_AES_BLOCK_SIZE;
525
cbData -= SYMCRYPT_AES_BLOCK_SIZE;
526
}
527
AES_ENCRYPT_1_CHAIN( pExpandedKey, c, rkLast );
528
_mm_storeu_si128( (__m128i *) pbDst, c );
529
_mm_storeu_si128( (__m128i *) pbChainingValue, c );
530
}
531
532
// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
533
#pragma warning(push)
534
#pragma warning( disable: 6001 4701 )
535
#pragma runtime_checks( "u", off )
536
VOID
537
SYMCRYPT_CALL
538
SymCryptAesCbcDecryptXmm(
539
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
540
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
541
_In_reads_( cbData ) PCBYTE pbSrc,
542
_Out_writes_( cbData ) PBYTE pbDst,
543
SIZE_T cbData )
544
{
545
__m128i chain;
546
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
547
__m128i d0, d1, d2, d3, d4, d5, d6, d7;
548
549
if( cbData < SYMCRYPT_AES_BLOCK_SIZE )
550
{
551
return;
552
}
553
554
chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
555
556
//
557
// First we do all multiples of 8 blocks
558
//
559
560
while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
561
{
562
d0 = c0 = _mm_loadu_si128( (__m128i *) (pbSrc + 0 * SYMCRYPT_AES_BLOCK_SIZE ) );
563
d1 = c1 = _mm_loadu_si128( (__m128i *) (pbSrc + 1 * SYMCRYPT_AES_BLOCK_SIZE ) );
564
d2 = c2 = _mm_loadu_si128( (__m128i *) (pbSrc + 2 * SYMCRYPT_AES_BLOCK_SIZE ) );
565
d3 = c3 = _mm_loadu_si128( (__m128i *) (pbSrc + 3 * SYMCRYPT_AES_BLOCK_SIZE ) );
566
d4 = c4 = _mm_loadu_si128( (__m128i *) (pbSrc + 4 * SYMCRYPT_AES_BLOCK_SIZE ) );
567
d5 = c5 = _mm_loadu_si128( (__m128i *) (pbSrc + 5 * SYMCRYPT_AES_BLOCK_SIZE ) );
568
d6 = c6 = _mm_loadu_si128( (__m128i *) (pbSrc + 6 * SYMCRYPT_AES_BLOCK_SIZE ) );
569
d7 = c7 = _mm_loadu_si128( (__m128i *) (pbSrc + 7 * SYMCRYPT_AES_BLOCK_SIZE ) );
570
571
AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
572
573
c0 = _mm_xor_si128( c0, chain );
574
c1 = _mm_xor_si128( c1, d0 );
575
c2 = _mm_xor_si128( c2, d1 );
576
c3 = _mm_xor_si128( c3, d2 );
577
c4 = _mm_xor_si128( c4, d3 );
578
c5 = _mm_xor_si128( c5, d4 );
579
c6 = _mm_xor_si128( c6, d5 );
580
c7 = _mm_xor_si128( c7, d6 );
581
chain = d7;
582
583
_mm_storeu_si128( (__m128i *) (pbDst + 0 * SYMCRYPT_AES_BLOCK_SIZE ), c0 );
584
_mm_storeu_si128( (__m128i *) (pbDst + 1 * SYMCRYPT_AES_BLOCK_SIZE ), c1 );
585
_mm_storeu_si128( (__m128i *) (pbDst + 2 * SYMCRYPT_AES_BLOCK_SIZE ), c2 );
586
_mm_storeu_si128( (__m128i *) (pbDst + 3 * SYMCRYPT_AES_BLOCK_SIZE ), c3 );
587
_mm_storeu_si128( (__m128i *) (pbDst + 4 * SYMCRYPT_AES_BLOCK_SIZE ), c4 );
588
_mm_storeu_si128( (__m128i *) (pbDst + 5 * SYMCRYPT_AES_BLOCK_SIZE ), c5 );
589
_mm_storeu_si128( (__m128i *) (pbDst + 6 * SYMCRYPT_AES_BLOCK_SIZE ), c6 );
590
_mm_storeu_si128( (__m128i *) (pbDst + 7 * SYMCRYPT_AES_BLOCK_SIZE ), c7 );
591
592
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
593
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
594
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
595
}
596
597
if( cbData >= 16 )
598
{
599
//
600
// There is remaining work to be done
601
//
602
d0 = c0 = _mm_loadu_si128( (__m128i *) (pbSrc + 0 * SYMCRYPT_AES_BLOCK_SIZE ) );
603
if( cbData >= 32 )
604
{
605
d1 = c1 = _mm_loadu_si128( (__m128i *) (pbSrc + 1 * SYMCRYPT_AES_BLOCK_SIZE ) );
606
if( cbData >= 48 )
607
{
608
d2 = c2 = _mm_loadu_si128( (__m128i *) (pbSrc + 2 * SYMCRYPT_AES_BLOCK_SIZE ) );
609
if( cbData >= 64 )
610
{
611
d3 = c3 = _mm_loadu_si128( (__m128i *) (pbSrc + 3 * SYMCRYPT_AES_BLOCK_SIZE ) );
612
if( cbData >= 80 )
613
{
614
d4 = c4 = _mm_loadu_si128( (__m128i *) (pbSrc + 4 * SYMCRYPT_AES_BLOCK_SIZE ) );
615
if( cbData >= 96 )
616
{
617
d5 = c5 = _mm_loadu_si128( (__m128i *) (pbSrc + 5 * SYMCRYPT_AES_BLOCK_SIZE ) );
618
if( cbData >= 112 )
619
{
620
d6 = c6 = _mm_loadu_si128( (__m128i *) (pbSrc + 6 * SYMCRYPT_AES_BLOCK_SIZE ) );
621
}
622
}
623
}
624
}
625
}
626
}
627
628
//
629
// Decrypt 1, 4, or 8 blocks in AES-CBC mode. This might decrypt uninitialized registers,
630
// but those will not be used when we store the results.
631
//
632
if( cbData > 4 * SYMCRYPT_AES_BLOCK_SIZE )
633
{
634
AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
635
c0 = _mm_xor_si128( c0, chain );
636
c1 = _mm_xor_si128( c1, d0 );
637
c2 = _mm_xor_si128( c2, d1 );
638
c3 = _mm_xor_si128( c3, d2 );
639
c4 = _mm_xor_si128( c4, d3 );
640
c5 = _mm_xor_si128( c5, d4 );
641
c6 = _mm_xor_si128( c6, d5 );
642
}
643
else if( cbData > SYMCRYPT_AES_BLOCK_SIZE )
644
{
645
AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 );
646
c0 = _mm_xor_si128( c0, chain );
647
c1 = _mm_xor_si128( c1, d0 );
648
c2 = _mm_xor_si128( c2, d1 );
649
c3 = _mm_xor_si128( c3, d2 );
650
} else
651
{
652
AES_DECRYPT_1( pExpandedKey, c0 );
653
c0 = _mm_xor_si128( c0, chain );
654
}
655
656
chain = _mm_loadu_si128( (__m128i *) (pbSrc + cbData - SYMCRYPT_AES_BLOCK_SIZE ) );
657
_mm_storeu_si128( (__m128i *) (pbDst + 0 * SYMCRYPT_AES_BLOCK_SIZE ), c0 );
658
if( cbData >= 32 )
659
{
660
_mm_storeu_si128( (__m128i *) (pbDst + 1 * SYMCRYPT_AES_BLOCK_SIZE ), c1 );
661
if( cbData >= 48 )
662
{
663
_mm_storeu_si128( (__m128i *) (pbDst + 2 * SYMCRYPT_AES_BLOCK_SIZE ), c2 );
664
if( cbData >= 64 )
665
{
666
_mm_storeu_si128( (__m128i *) (pbDst + 3 * SYMCRYPT_AES_BLOCK_SIZE ), c3 );
667
if( cbData >= 80 )
668
{
669
_mm_storeu_si128( (__m128i *) (pbDst + 4 * SYMCRYPT_AES_BLOCK_SIZE ), c4 );
670
if( cbData >= 96 )
671
{
672
_mm_storeu_si128( (__m128i *) (pbDst + 5 * SYMCRYPT_AES_BLOCK_SIZE ), c5 );
673
if( cbData >= 112 )
674
{
675
_mm_storeu_si128( (__m128i *) (pbDst + 6 * SYMCRYPT_AES_BLOCK_SIZE ), c6 );
676
}
677
}
678
}
679
}
680
}
681
}
682
}
683
684
_mm_storeu_si128( (__m128i *) pbChainingValue, chain );
685
686
return;
687
}
688
#pragma runtime_checks( "u", restore )
689
#pragma warning( pop )
690
691
VOID
692
SYMCRYPT_CALL
693
SymCryptAesCbcMacXmm(
694
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
695
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
696
_In_reads_( cbData ) PCBYTE pbData,
697
SIZE_T cbData )
698
{
699
__m128i c = _mm_loadu_si128( (__m128i *) pbChainingValue );
700
__m128i rk0 = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] );
701
__m128i rkLast = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
702
__m128i d, rk0AndLast;
703
704
if (cbData < SYMCRYPT_AES_BLOCK_SIZE)
705
return;
706
707
// This algorithm is dominated by chain of dependent AES rounds, so we want to avoid XOR
708
// instructions on the critical path where possible
709
// We can compute (last round key ^ next plaintext block ^ first round key) off the critical
710
// path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
711
// the main loop
712
d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbData ), rk0 );
713
c = _mm_xor_si128( c, d );
714
pbData += SYMCRYPT_AES_BLOCK_SIZE;
715
cbData -= SYMCRYPT_AES_BLOCK_SIZE;
716
717
// As we don't compute ciphertext here, we only need to XOR rk0 and rkLast once
718
rk0AndLast = _mm_xor_si128( rk0, rkLast );
719
720
while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
721
{
722
d = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbData ), rk0AndLast );
723
AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d );
724
725
pbData += SYMCRYPT_AES_BLOCK_SIZE;
726
cbData -= SYMCRYPT_AES_BLOCK_SIZE;
727
}
728
AES_ENCRYPT_1_CHAIN( pExpandedKey, c, rkLast );
729
_mm_storeu_si128( (__m128i *) pbChainingValue, c );
730
}
731
732
733
//
// aes-pattern.c is included twice below, each time with a different binding of
// SYMCRYPT_AesCtrMsbXxXmm / MM_ADD_EPIXX / MM_SUB_EPIXX: once with the 64-bit lane
// add/sub intrinsics under the name SymCryptAesCtrMsb64Xmm, and once with the 32-bit
// lane add/sub intrinsics under the name SymCryptAesCtrMsb32Xmm.
// NOTE(review): what aes-pattern.c does with these three macros is not visible from
// this file - presumably it defines the function named by SYMCRYPT_AesCtrMsbXxXmm;
// confirm against aes-pattern.c.
//
// Warning 4701 and the VC++ "u" runtime check are disabled around both inclusions.
//
#pragma warning(push)
#pragma warning( disable:4701 ) // "Use of uninitialized variable"
#pragma runtime_checks( "u", off )

// Instantiation 1: 64-bit lane arithmetic
#define SYMCRYPT_AesCtrMsbXxXmm SymCryptAesCtrMsb64Xmm
#define MM_ADD_EPIXX            _mm_add_epi64
#define MM_SUB_EPIXX            _mm_sub_epi64

#include "aes-pattern.c"

#undef SYMCRYPT_AesCtrMsbXxXmm
#undef MM_ADD_EPIXX
#undef MM_SUB_EPIXX

// Instantiation 2: 32-bit lane arithmetic
#define SYMCRYPT_AesCtrMsbXxXmm SymCryptAesCtrMsb32Xmm
#define MM_ADD_EPIXX            _mm_add_epi32
#define MM_SUB_EPIXX            _mm_sub_epi32

#include "aes-pattern.c"

#undef SYMCRYPT_AesCtrMsbXxXmm
#undef MM_ADD_EPIXX
#undef MM_SUB_EPIXX

#pragma runtime_checks( "u", restore )
#pragma warning(pop)
759
760
/*
761
if( cbData >= 16 )
762
{
763
if( cbData >= 32 )
764
{
765
if( cbData >= 48 )
766
{
767
if( cbData >= 64 )
768
{
769
if( cbData >= 80 )
770
{
771
if( cbData >= 96 )
772
{
773
if( cbData >= 112 )
774
{
775
}
776
}
777
}
778
}
779
}
780
}
781
}
782
*/
783
784
VOID
785
SYMCRYPT_CALL
786
SymCryptXtsAesEncryptDataUnitXmm(
787
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
788
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock,
789
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch,
790
_In_reads_( cbData ) PCBYTE pbSrc,
791
_Out_writes_( cbData ) PBYTE pbDst,
792
SIZE_T cbData )
793
{
794
__m128i t0;
795
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
796
__m128i roundkey, firstRoundKey, lastRoundKey;
797
__m128i XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
798
SYMCRYPT_GF128_ELEMENT* tweakBuffer = (SYMCRYPT_GF128_ELEMENT*) pbScratch;
799
800
const BYTE (*keyPtr)[4][4];
801
const BYTE (*keyLimit)[4][4] = pExpandedKey->lastEncRoundKey;
802
UINT64 lastTweakLow, lastTweakHigh;
803
int aesEncryptXtsLoop;
804
805
SIZE_T cbDataMain; // number of bytes to handle in the main loop
806
SIZE_T cbDataTail; // number of bytes to handle in the tail loop
807
808
SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);
809
810
// To simplify logic and unusual size processing, we handle all
811
// data not a multiple of 8 blocks in the tail loop
812
cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
813
// Additionally, so that ciphertext stealing logic does not rely on
814
// reading back from the destination buffer, when we have a non-zero
815
// tail, we ensure that we handle at least 1 whole block in the tail
816
//
817
// Note that our caller has ensured we have at least 1 whole block
818
// to process, this is checked in debug build
819
// This means that cbDataTail is in [1,15] at this point iff there are
820
// at least 8 whole blocks to process; so the below does not cause
821
// cbDataTail or cbDataMain to exceed cbData
822
cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
823
cbDataMain = cbData - cbDataTail;
824
825
SYMCRYPT_ASSERT(cbDataMain <= cbData);
826
SYMCRYPT_ASSERT(cbDataTail <= cbData);
827
SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
828
829
c0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
830
XTS_MUL_ALPHA( c0, c1 );
831
XTS_MUL_ALPHA( c1, c2 );
832
XTS_MUL_ALPHA( c2, c3 );
833
834
XTS_MUL_ALPHA4( c0, c4 );
835
XTS_MUL_ALPHA ( c4, c5 );
836
XTS_MUL_ALPHA ( c5, c6 );
837
XTS_MUL_ALPHA ( c6, c7 );
838
839
tweakBuffer[0].m128i = c0;
840
tweakBuffer[1].m128i = c1;
841
tweakBuffer[2].m128i = c2;
842
tweakBuffer[3].m128i = c3;
843
tweakBuffer[4].m128i = c4;
844
tweakBuffer[5].m128i = c5;
845
tweakBuffer[6].m128i = c6;
846
tweakBuffer[7].m128i = c7;
847
lastTweakLow = tweakBuffer[7].ull[0];
848
lastTweakHigh = tweakBuffer[7].ull[1];
849
850
firstRoundKey = _mm_loadu_si128( (__m128i *) &pExpandedKey->RoundKey[0] );
851
lastRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
852
853
while( cbDataMain > 0 )
854
{
855
// At loop entry, tweakBuffer[0-7] are tweakValues for the next 8 blocks
856
c0 = _mm_xor_si128( tweakBuffer[0].m128i, firstRoundKey );
857
c1 = _mm_xor_si128( tweakBuffer[1].m128i, firstRoundKey );
858
c2 = _mm_xor_si128( tweakBuffer[2].m128i, firstRoundKey );
859
c3 = _mm_xor_si128( tweakBuffer[3].m128i, firstRoundKey );
860
c4 = _mm_xor_si128( tweakBuffer[4].m128i, firstRoundKey );
861
c5 = _mm_xor_si128( tweakBuffer[5].m128i, firstRoundKey );
862
c6 = _mm_xor_si128( tweakBuffer[6].m128i, firstRoundKey );
863
c7 = _mm_xor_si128( tweakBuffer[7].m128i, firstRoundKey );
864
865
c0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) );
866
c1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) );
867
c2 = _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) );
868
c3 = _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) );
869
c4 = _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) );
870
c5 = _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) );
871
c6 = _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) );
872
c7 = _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc + 112) ) );
873
874
keyPtr = &pExpandedKey->RoundKey[1];
875
876
// Do 8 full rounds (AES-128|AES-192|AES-256) with stitched XTS (performed in scalar registers)
877
for( aesEncryptXtsLoop = 0; aesEncryptXtsLoop < 8; aesEncryptXtsLoop++ )
878
{
879
roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
880
keyPtr ++;
881
c0 = _mm_aesenc_si128( c0, roundkey );
882
c1 = _mm_aesenc_si128( c1, roundkey );
883
c2 = _mm_aesenc_si128( c2, roundkey );
884
c3 = _mm_aesenc_si128( c3, roundkey );
885
c4 = _mm_aesenc_si128( c4, roundkey );
886
c5 = _mm_aesenc_si128( c5, roundkey );
887
c6 = _mm_aesenc_si128( c6, roundkey );
888
c7 = _mm_aesenc_si128( c7, roundkey );
889
890
// Prepare tweakBuffer[8-15] with tweak^lastRoundKey
891
tweakBuffer[ 8+aesEncryptXtsLoop ].m128i = _mm_xor_si128( tweakBuffer[ aesEncryptXtsLoop ].m128i, lastRoundKey );
892
// Prepare tweakBuffer[0-7] with tweaks for next 8 blocks
893
XTS_MUL_ALPHA_Scalar( lastTweakLow, lastTweakHigh );
894
tweakBuffer[ aesEncryptXtsLoop ].ull[0] = lastTweakLow;
895
tweakBuffer[ aesEncryptXtsLoop ].ull[1] = lastTweakHigh;
896
}
897
898
do
899
{
900
roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
901
keyPtr ++;
902
c0 = _mm_aesenc_si128( c0, roundkey );
903
c1 = _mm_aesenc_si128( c1, roundkey );
904
c2 = _mm_aesenc_si128( c2, roundkey );
905
c3 = _mm_aesenc_si128( c3, roundkey );
906
c4 = _mm_aesenc_si128( c4, roundkey );
907
c5 = _mm_aesenc_si128( c5, roundkey );
908
c6 = _mm_aesenc_si128( c6, roundkey );
909
c7 = _mm_aesenc_si128( c7, roundkey );
910
} while( keyPtr < keyLimit );
911
912
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_aesenclast_si128( c0, tweakBuffer[ 8].m128i ) );
913
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_aesenclast_si128( c1, tweakBuffer[ 9].m128i ) );
914
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_aesenclast_si128( c2, tweakBuffer[10].m128i ) );
915
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_aesenclast_si128( c3, tweakBuffer[11].m128i ) );
916
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_aesenclast_si128( c4, tweakBuffer[12].m128i ) );
917
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_aesenclast_si128( c5, tweakBuffer[13].m128i ) );
918
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_aesenclast_si128( c6, tweakBuffer[14].m128i ) );
919
_mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_aesenclast_si128( c7, tweakBuffer[15].m128i ) );
920
921
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
922
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
923
cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
924
}
925
926
if( cbDataTail == 0 )
927
{
928
return; // <-- expected case; early return here
929
}
930
931
// Rare case, with data unit length not being multiple of 128 bytes, handle the tail one block at a time
932
t0 = tweakBuffer[0].m128i;
933
934
while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
935
{
936
c0 = _mm_xor_si128( _mm_loadu_si128( ( __m128i * ) pbSrc ), t0 );
937
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
938
AES_ENCRYPT_1( pExpandedKey, c0 );
939
_mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
940
pbDst += SYMCRYPT_AES_BLOCK_SIZE;
941
XTS_MUL_ALPHA( t0, t0 );
942
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
943
}
944
945
if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
946
{
947
// Ciphertext stealing encryption
948
//
949
// +--------------+
950
// | |
951
// | V
952
// +-----------------+ | +-----+-----------+
953
// | P_m-1 | | | P_m |++++CP+++++|
954
// +-----------------+ | +-----+-----------+
955
// | | |
956
// enc_m-1 | enc_m
957
// | | |
958
// V | V
959
// +-----+-----------+ | +-----------------+
960
// | C_m |++++CP+++++|--+ | C_m-1 |
961
// +-----+-----------+ +-----------------+
962
// | /
963
// +---------------- / --+
964
// / |
965
// | V
966
// +-----------------+ | +-----+
967
// | C_m-1 |<-+ | C_m |
968
// +-----------------+ +-----+
969
970
// Encrypt penultimate plaintext block into tweakBuffer[0]
971
c0 = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), t0 );
972
AES_ENCRYPT_1( pExpandedKey, c0 );
973
tweakBuffer[0].m128i = _mm_xor_si128( c0, t0 );
974
975
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
976
977
// Copy tweakBuffer[0] to tweakBuffer[1]
978
tweakBuffer[1].m128i = tweakBuffer[0].m128i;
979
// Copy final plaintext bytes to prefix of tweakBuffer[0] - we must read before writing to support in-place encryption
980
memcpy( &tweakBuffer[0].ul[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
981
// Copy prefix of tweakBuffer[1] to the right place in the destination buffer
982
memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tweakBuffer[1].ul[0], cbDataTail );
983
984
// Do final tweak update
985
XTS_MUL_ALPHA( t0, t0 );
986
987
// Load updated tweakBuffer[0] into c0
988
c0 = tweakBuffer[0].m128i;
989
} else {
990
// Just load final plaintext block into c0
991
c0 = _mm_loadu_si128( (__m128i*) pbSrc );
992
}
993
994
// Final full block encryption
995
c0 = _mm_xor_si128( c0, t0 );
996
AES_ENCRYPT_1( pExpandedKey, c0 );
997
_mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
998
}
999
1000
VOID
1001
SYMCRYPT_CALL
1002
SymCryptXtsAesDecryptDataUnitXmm(
1003
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1004
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbTweakBlock,
1005
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE*16 ) PBYTE pbScratch,
1006
_In_reads_( cbData ) PCBYTE pbSrc,
1007
_Out_writes_( cbData ) PBYTE pbDst,
1008
SIZE_T cbData )
1009
{
1010
__m128i t0;
1011
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
1012
__m128i roundkey, firstRoundKey, lastRoundKey;
1013
__m128i XTS_ALPHA_MASK = _mm_set_epi32( 1, 1, 1, 0x87 );
1014
SYMCRYPT_GF128_ELEMENT* tweakBuffer = (SYMCRYPT_GF128_ELEMENT*) pbScratch;
1015
1016
const BYTE (*keyPtr)[4][4];
1017
const BYTE (*keyLimit)[4][4] = pExpandedKey->lastDecRoundKey;
1018
UINT64 lastTweakLow, lastTweakHigh;
1019
int aesDecryptXtsLoop;
1020
1021
SIZE_T cbDataMain; // number of bytes to handle in the main loop
1022
SIZE_T cbDataTail; // number of bytes to handle in the tail loop
1023
1024
SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);
1025
1026
// To simplify logic and unusual size processing, we handle all
1027
// data not a multiple of 8 blocks in the tail loop
1028
cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
1029
// Additionally, so that ciphertext stealing logic does not rely on
1030
// reading back from the destination buffer, when we have a non-zero
1031
// tail, we ensure that we handle at least 1 whole block in the tail
1032
//
1033
// Note that our caller has ensured we have at least 1 whole block
1034
// to process, this is checked in debug build
1035
// This means that cbDataTail is in [1,15] at this point iff there are
1036
// at least 8 whole blocks to process; so the below does not cause
1037
// cbDataTail or cbDataMain to exceed cbData
1038
cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
1039
cbDataMain = cbData - cbDataTail;
1040
1041
SYMCRYPT_ASSERT(cbDataMain <= cbData);
1042
SYMCRYPT_ASSERT(cbDataTail <= cbData);
1043
SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
1044
1045
c0 = _mm_loadu_si128( (__m128i *) pbTweakBlock );
1046
XTS_MUL_ALPHA( c0, c1 );
1047
XTS_MUL_ALPHA( c1, c2 );
1048
XTS_MUL_ALPHA( c2, c3 );
1049
1050
XTS_MUL_ALPHA4( c0, c4 );
1051
XTS_MUL_ALPHA ( c4, c5 );
1052
XTS_MUL_ALPHA ( c5, c6 );
1053
XTS_MUL_ALPHA ( c6, c7 );
1054
1055
tweakBuffer[0].m128i = c0;
1056
tweakBuffer[1].m128i = c1;
1057
tweakBuffer[2].m128i = c2;
1058
tweakBuffer[3].m128i = c3;
1059
tweakBuffer[4].m128i = c4;
1060
tweakBuffer[5].m128i = c5;
1061
tweakBuffer[6].m128i = c6;
1062
tweakBuffer[7].m128i = c7;
1063
lastTweakLow = tweakBuffer[7].ull[0];
1064
lastTweakHigh = tweakBuffer[7].ull[1];
1065
1066
firstRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastEncRoundKey );
1067
lastRoundKey = _mm_loadu_si128( (__m128i *) pExpandedKey->lastDecRoundKey );
1068
1069
while( cbDataMain > 0 )
1070
{
1071
// At loop entry, tweakBuffer[0-7] are tweakValues for the next 8 blocks
1072
c0 = _mm_xor_si128( tweakBuffer[0].m128i, firstRoundKey );
1073
c1 = _mm_xor_si128( tweakBuffer[1].m128i, firstRoundKey );
1074
c2 = _mm_xor_si128( tweakBuffer[2].m128i, firstRoundKey );
1075
c3 = _mm_xor_si128( tweakBuffer[3].m128i, firstRoundKey );
1076
c4 = _mm_xor_si128( tweakBuffer[4].m128i, firstRoundKey );
1077
c5 = _mm_xor_si128( tweakBuffer[5].m128i, firstRoundKey );
1078
c6 = _mm_xor_si128( tweakBuffer[6].m128i, firstRoundKey );
1079
c7 = _mm_xor_si128( tweakBuffer[7].m128i, firstRoundKey );
1080
1081
c0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) );
1082
c1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) );
1083
c2 = _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) );
1084
c3 = _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) );
1085
c4 = _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) );
1086
c5 = _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) );
1087
c6 = _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) );
1088
c7 = _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc + 112) ) );
1089
1090
keyPtr = pExpandedKey->lastEncRoundKey + 1;
1091
1092
// Do 8 full rounds (AES-128|AES-192|AES-256) with stitched XTS (performed in scalar registers)
1093
for( aesDecryptXtsLoop = 0; aesDecryptXtsLoop < 8; aesDecryptXtsLoop++ )
1094
{
1095
roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
1096
keyPtr ++;
1097
c0 = _mm_aesdec_si128( c0, roundkey );
1098
c1 = _mm_aesdec_si128( c1, roundkey );
1099
c2 = _mm_aesdec_si128( c2, roundkey );
1100
c3 = _mm_aesdec_si128( c3, roundkey );
1101
c4 = _mm_aesdec_si128( c4, roundkey );
1102
c5 = _mm_aesdec_si128( c5, roundkey );
1103
c6 = _mm_aesdec_si128( c6, roundkey );
1104
c7 = _mm_aesdec_si128( c7, roundkey );
1105
1106
// Prepare tweakBuffer[8-15] with tweak^lastRoundKey
1107
tweakBuffer[ 8+aesDecryptXtsLoop ].m128i = _mm_xor_si128( tweakBuffer[ aesDecryptXtsLoop ].m128i, lastRoundKey );
1108
// Prepare tweakBuffer[0-7] with tweaks for next 8 blocks
1109
XTS_MUL_ALPHA_Scalar( lastTweakLow, lastTweakHigh );
1110
tweakBuffer[ aesDecryptXtsLoop ].ull[0] = lastTweakLow;
1111
tweakBuffer[ aesDecryptXtsLoop ].ull[1] = lastTweakHigh;
1112
}
1113
1114
do
1115
{
1116
roundkey = _mm_loadu_si128( (__m128i *) keyPtr );
1117
keyPtr ++;
1118
c0 = _mm_aesdec_si128( c0, roundkey );
1119
c1 = _mm_aesdec_si128( c1, roundkey );
1120
c2 = _mm_aesdec_si128( c2, roundkey );
1121
c3 = _mm_aesdec_si128( c3, roundkey );
1122
c4 = _mm_aesdec_si128( c4, roundkey );
1123
c5 = _mm_aesdec_si128( c5, roundkey );
1124
c6 = _mm_aesdec_si128( c6, roundkey );
1125
c7 = _mm_aesdec_si128( c7, roundkey );
1126
} while( keyPtr < keyLimit );
1127
1128
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_aesdeclast_si128( c0, tweakBuffer[ 8].m128i ) );
1129
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_aesdeclast_si128( c1, tweakBuffer[ 9].m128i ) );
1130
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_aesdeclast_si128( c2, tweakBuffer[10].m128i ) );
1131
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_aesdeclast_si128( c3, tweakBuffer[11].m128i ) );
1132
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_aesdeclast_si128( c4, tweakBuffer[12].m128i ) );
1133
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_aesdeclast_si128( c5, tweakBuffer[13].m128i ) );
1134
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_aesdeclast_si128( c6, tweakBuffer[14].m128i ) );
1135
_mm_storeu_si128( (__m128i *) (pbDst + 112), _mm_aesdeclast_si128( c7, tweakBuffer[15].m128i ) );
1136
1137
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1138
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1139
cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
1140
}
1141
1142
if( cbDataTail == 0 )
1143
{
1144
return; // <-- expected case; early return here
1145
}
1146
1147
// Rare case, with data unit length not being multiple of 128 bytes, handle the tail one block at a time
1148
t0 = tweakBuffer[0].m128i;
1149
1150
while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
1151
{
1152
c0 = _mm_xor_si128( _mm_loadu_si128( ( __m128i * ) pbSrc ), t0 );
1153
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
1154
AES_DECRYPT_1( pExpandedKey, c0 );
1155
_mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
1156
pbDst += SYMCRYPT_AES_BLOCK_SIZE;
1157
c7 = t0;
1158
XTS_MUL_ALPHA( t0, t0 );
1159
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1160
}
1161
1162
if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
1163
{
1164
// Ciphertext stealing decryption
1165
//
1166
// +--------------+
1167
// | |
1168
// | V
1169
// +-----------------+ | +-----+-----------+
1170
// | C_m-1 | | | C_m |++++CP+++++|
1171
// +-----------------+ | +-----+-----------+
1172
// | | |
1173
// dec_m | dec_m-1
1174
// | | |
1175
// V | V
1176
// +-----+-----------+ | +-----------------+
1177
// | P_m |++++CP+++++|--+ | P_m-1 |
1178
// +-----+-----------+ +-----------------+
1179
// | /
1180
// +---------------- / --+
1181
// / |
1182
// | V
1183
// +-----------------+ | +-----+
1184
// | P_m-1 |<-+ | P_m |
1185
// +-----------------+ +-----+
1186
1187
// Do final tweak update into c1
1188
// Penultimate tweak is in t0, ready for final decryption
1189
XTS_MUL_ALPHA( t0, c1 );
1190
1191
// Decrypt penultimate ciphertext block into tweakBuffer[0]
1192
c0 = _mm_xor_si128( _mm_loadu_si128( (__m128i *) pbSrc ), c1 );
1193
AES_DECRYPT_1( pExpandedKey, c0 );
1194
tweakBuffer[0].m128i = _mm_xor_si128( c0, c1 );
1195
1196
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1197
1198
// Copy tweakBuffer[0] to tweakBuffer[1]
1199
tweakBuffer[1].m128i = tweakBuffer[0].m128i;
1200
// Copy final ciphertext bytes to prefix of tweakBuffer[0] - we must read before writing to support in-place decryption
1201
memcpy( &tweakBuffer[0].ul[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
1202
// Copy prefix of tweakBuffer[1] to the right place in the destination buffer
1203
memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tweakBuffer[1].ul[0], cbDataTail );
1204
1205
// Load updated tweakBuffer[0] into c0
1206
c0 = tweakBuffer[0].m128i;
1207
} else {
1208
// Just load final ciphertext block into c0
1209
c0 = _mm_loadu_si128( (__m128i*) pbSrc );
1210
}
1211
1212
// Final full block decryption
1213
c0 = _mm_xor_si128( c0, t0 );
1214
AES_DECRYPT_1( pExpandedKey, c0 );
1215
_mm_storeu_si128( (__m128i *) pbDst, _mm_xor_si128( c0, t0 ) );
1216
}
1217
1218
//
// AES_FULLROUND_4_GHASH_1
//
// Performs one full AES encryption round on the 4 blocks c0..c3, stitched with the carry-less
// multiplications needed to GHASH one 16-byte block.
//
//  roundkey, keyPtr    The round key at keyPtr is loaded into roundkey and keyPtr is advanced by one.
//  c0..c3              AES states; each is updated with one aesenc round.
//  r0, t0, t1          Scratch __m128i variables; clobbered.
//  gHashPointer        Byte pointer to the next block to hash; advanced by 16.
//  byteReverseOrder    Shuffle mask applied to the loaded block before multiplication.
//  gHashExpandedKeyTable, todo
//                      The block is multiplied by the table entries selected with
//                      GHASH_H_POWER( table, todo ) and GHASH_Hx_POWER( table, todo ); todo is then
//                      decremented.
//  resl, resm, resh    Accumulators; the low*low product is XORed into resl, the high*high product
//                      into resh and the (low^high)*Hx product into resm.
//                      NOTE(review): this looks like the 3-multiplication (Karatsuba) form, assuming
//                      the Hx entries hold low^high of the H powers - confirm in ghash_definitions.h.
//
// All arguments are expanded several times and most are assigned to, so they must be plain variables.
//
#define AES_FULLROUND_4_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesenc_si128( c0, roundkey ); \
    c1 = _mm_aesenc_si128( c1, roundkey ); \
    c2 = _mm_aesenc_si128( c2, roundkey ); \
    c3 = _mm_aesenc_si128( c3, roundkey ); \
    \
    r0 = _mm_loadu_si128( (__m128i *) gHashPointer ); \
    r0 = _mm_shuffle_epi8( r0, byteReverseOrder ); \
    gHashPointer += 16; \
    \
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \
    t0 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
    t1 = _mm_clmulepi64_si128( r0, t1, 0x11 ); \
    \
    resl = _mm_xor_si128( resl, t0 ); \
    resh = _mm_xor_si128( resh, t1 ); \
    \
    t0 = _mm_srli_si128( r0, 8 ); \
    r0 = _mm_xor_si128( r0, t0 ); \
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \
    t1 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
    \
    resm = _mm_xor_si128( resm, t1 ); \
    todo --; \
};
1246
1247
//
// AES_GCM_ENCRYPT_4
//
// AES-encrypts the 4 blocks c0..c3 in place with pExpandedKey, while GHASHing ghashRounds
// 16-byte blocks read from gHashPointer (one block per stitched AES round).
//
//  pExpandedKey        Expanded AES key; round keys from RoundKey[0] to lastEncRoundKey are used.
//  c0..c3              In: blocks to encrypt. Out: the encrypted blocks.
//  gHashPointer        Byte pointer to the data to hash; advanced by 16 * ghashRounds.
//  ghashRounds         Number of blocks to hash. The trailing do-while always runs at least one
//                      non-stitched full round, so this must be smaller than the number of full
//                      rounds of the key (i.e. at most 8 for AES-128).
//  byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh
//                      Passed through to AES_FULLROUND_4_GHASH_1; todo is decremented once per
//                      hashed block and the accumulators resl/resm/resh are updated.
//
// The macro opens its own scope for its temporaries, so the names keyPtr, keyLimit, roundkey, t0,
// t1, r0 and aesEncryptGhashLoop must not be used as arguments.
//
#define AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, gHashPointer, ghashRounds, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    __m128i t0, t1; \
    __m128i r0; \
    SIZE_T aesEncryptGhashLoop; \
    \
    keyPtr = &pExpandedKey->RoundKey[0]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
    \
    /* Initial round key addition */ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    \
    /* Do ghashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < ghashRounds; aesEncryptGhashLoop++ ) \
    { \
        AES_FULLROUND_4_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
    } \
    \
    /* Remaining full rounds without GHASH, up to the last round key */ \
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* Final round with the last round key */ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    \
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
};
1289
1290
//
// AES_FULLROUND_8_GHASH_1
//
// Performs one full AES encryption round on the 8 blocks c0..c7, stitched with the carry-less
// multiplications needed to GHASH one 16-byte block.
//
//  roundkey, keyPtr    The round key at keyPtr is loaded into roundkey and keyPtr is advanced by one.
//  c0..c7              AES states; each is updated with one aesenc round.
//  r0, t0, t1          Scratch __m128i variables; clobbered.
//  gHashPointer        Byte pointer to the next block to hash; advanced by 16.
//  byteReverseOrder    Shuffle mask applied to the loaded block before multiplication.
//  gHashExpandedKeyTable, todo
//                      The block is multiplied by the table entries selected with
//                      GHASH_H_POWER( table, todo ) and GHASH_Hx_POWER( table, todo ); todo is then
//                      decremented.
//  resl, resm, resh    Accumulators; the low*low product is XORed into resl, the high*high product
//                      into resh and the (low^high)*Hx product into resm.
//                      NOTE(review): this looks like the 3-multiplication (Karatsuba) form, assuming
//                      the Hx entries hold low^high of the H powers - confirm in ghash_definitions.h.
//
// All arguments are expanded several times and most are assigned to, so they must be plain variables.
//
#define AES_FULLROUND_8_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_aesenc_si128( c0, roundkey ); \
    c1 = _mm_aesenc_si128( c1, roundkey ); \
    c2 = _mm_aesenc_si128( c2, roundkey ); \
    c3 = _mm_aesenc_si128( c3, roundkey ); \
    c4 = _mm_aesenc_si128( c4, roundkey ); \
    c5 = _mm_aesenc_si128( c5, roundkey ); \
    c6 = _mm_aesenc_si128( c6, roundkey ); \
    c7 = _mm_aesenc_si128( c7, roundkey ); \
    \
    r0 = _mm_loadu_si128( (__m128i *) gHashPointer ); \
    r0 = _mm_shuffle_epi8( r0, byteReverseOrder ); \
    gHashPointer += 16; \
    \
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_H_POWER(gHashExpandedKeyTable, todo) ); \
    t0 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
    t1 = _mm_clmulepi64_si128( r0, t1, 0x11 ); \
    \
    resl = _mm_xor_si128( resl, t0 ); \
    resh = _mm_xor_si128( resh, t1 ); \
    \
    t0 = _mm_srli_si128( r0, 8 ); \
    r0 = _mm_xor_si128( r0, t0 ); \
    t1 = _mm_loadu_si128( (__m128i *) &GHASH_Hx_POWER(gHashExpandedKeyTable, todo) ); \
    t1 = _mm_clmulepi64_si128( r0, t1, 0x00 ); \
    \
    resm = _mm_xor_si128( resm, t1 ); \
    todo --; \
};
1322
1323
//
// AES_GCM_ENCRYPT_8
//
// AES-encrypts the 8 blocks c0..c7 in place with pExpandedKey, while GHASHing ghashRounds
// 16-byte blocks read from gHashPointer (one block per stitched AES round).
//
//  pExpandedKey        Expanded AES key; round keys from RoundKey[0] to lastEncRoundKey are used.
//  c0..c7              In: blocks to encrypt. Out: the encrypted blocks.
//  gHashPointer        Byte pointer to the data to hash; advanced by 16 * ghashRounds.
//  ghashRounds         Number of blocks to hash. The trailing do-while always runs at least one
//                      non-stitched full round, so this must be smaller than the number of full
//                      rounds of the key (i.e. at most 8 for AES-128).
//  byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh
//                      Passed through to AES_FULLROUND_8_GHASH_1; todo is decremented once per
//                      hashed block and the accumulators resl/resm/resh are updated.
//
// The macro opens its own scope for its temporaries, so the names keyPtr, keyLimit, roundkey, t0,
// t1, r0 and aesEncryptGhashLoop must not be used as arguments.
//
#define AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, ghashRounds, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const BYTE (*keyPtr)[4][4]; \
    const BYTE (*keyLimit)[4][4]; \
    __m128i roundkey; \
    __m128i t0, t1; \
    __m128i r0; \
    SIZE_T aesEncryptGhashLoop; \
    \
    keyPtr = &pExpandedKey->RoundKey[0]; \
    keyLimit = pExpandedKey->lastEncRoundKey; \
    \
    /* Initial round key addition */ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    keyPtr ++; \
    c0 = _mm_xor_si128( c0, roundkey ); \
    c1 = _mm_xor_si128( c1, roundkey ); \
    c2 = _mm_xor_si128( c2, roundkey ); \
    c3 = _mm_xor_si128( c3, roundkey ); \
    c4 = _mm_xor_si128( c4, roundkey ); \
    c5 = _mm_xor_si128( c5, roundkey ); \
    c6 = _mm_xor_si128( c6, roundkey ); \
    c7 = _mm_xor_si128( c7, roundkey ); \
    \
    /* Do ghashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < ghashRounds; aesEncryptGhashLoop++ ) \
    { \
        AES_FULLROUND_8_GHASH_1( roundkey, keyPtr, c0, c1, c2, c3, c4, c5, c6, c7, r0, t0, t1, gHashPointer, byteReverseOrder, gHashExpandedKeyTable, todo, resl, resm, resh ); \
    } \
    \
    /* Remaining full rounds without GHASH, up to the last round key */ \
    do \
    { \
        roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
        keyPtr ++; \
        c0 = _mm_aesenc_si128( c0, roundkey ); \
        c1 = _mm_aesenc_si128( c1, roundkey ); \
        c2 = _mm_aesenc_si128( c2, roundkey ); \
        c3 = _mm_aesenc_si128( c3, roundkey ); \
        c4 = _mm_aesenc_si128( c4, roundkey ); \
        c5 = _mm_aesenc_si128( c5, roundkey ); \
        c6 = _mm_aesenc_si128( c6, roundkey ); \
        c7 = _mm_aesenc_si128( c7, roundkey ); \
    } while( keyPtr < keyLimit ); \
    \
    /* Final round with the last round key */ \
    roundkey = _mm_loadu_si128( (__m128i *) keyPtr ); \
    \
    c0 = _mm_aesenclast_si128( c0, roundkey ); \
    c1 = _mm_aesenclast_si128( c1, roundkey ); \
    c2 = _mm_aesenclast_si128( c2, roundkey ); \
    c3 = _mm_aesenclast_si128( c3, roundkey ); \
    c4 = _mm_aesenclast_si128( c4, roundkey ); \
    c5 = _mm_aesenclast_si128( c5, roundkey ); \
    c6 = _mm_aesenclast_si128( c6, roundkey ); \
    c7 = _mm_aesenclast_si128( c7, roundkey ); \
};
1377
1378
// This call is functionally identical to:
1379
// SymCryptAesCtrMsb64Xmm( pExpandedKey,
1380
// pbChainingValue,
1381
// pbSrc,
1382
// pbDst,
1383
// cbData );
1384
// SymCryptGHashAppendDataPclmulqdq( expandedKeyTable,
1385
// pState,
1386
// pbDst,
1387
// cbData );
1388
VOID
1389
SYMCRYPT_CALL
1390
SymCryptAesGcmEncryptStitchedXmm(
1391
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1392
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
1393
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
1394
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
1395
_In_reads_( cbData ) PCBYTE pbSrc,
1396
_Out_writes_( cbData ) PBYTE pbDst,
1397
SIZE_T cbData )
1398
{
1399
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
1400
1401
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
1402
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
1403
__m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
1404
1405
__m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
1406
__m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
1407
__m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 );
1408
1409
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
1410
__m128i r0, r1;
1411
1412
__m128i state;
1413
__m128i a0, a1, a2;
1414
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1415
SIZE_T todo;
1416
PCBYTE pbGhashSrc = pbDst;
1417
1418
SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1419
1420
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1421
state = _mm_loadu_si128( (__m128i *) pState );
1422
1423
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1424
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1425
1426
// Do 8 blocks of CTR either for tail (if total blocks <8) or for encryption of first 8 blocks
1427
c0 = chain;
1428
c1 = _mm_add_epi32( chain, chainIncrement1 );
1429
c2 = _mm_add_epi32( chain, chainIncrement2 );
1430
c3 = _mm_add_epi32( c1, chainIncrement2 );
1431
c4 = _mm_add_epi32( c2, chainIncrement2 );
1432
c5 = _mm_add_epi32( c3, chainIncrement2 );
1433
c6 = _mm_add_epi32( c4, chainIncrement2 );
1434
c7 = _mm_add_epi32( c5, chainIncrement2 );
1435
1436
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1437
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1438
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1439
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1440
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1441
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1442
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1443
c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
1444
1445
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
1446
1447
if( nBlocks >= 8 )
1448
{
1449
// Encrypt first 8 blocks - update chain
1450
chain = _mm_add_epi32( chain, chainIncrement8 );
1451
1452
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
1453
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1454
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) );
1455
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) );
1456
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) );
1457
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) );
1458
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) );
1459
_mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) );
1460
1461
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1462
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1463
1464
while( nBlocks >= 16 )
1465
{
1466
// In this loop we always have 8 blocks to encrypt and we have already encrypted the previous 8 blocks ready for GHASH
1467
c0 = chain;
1468
c1 = _mm_add_epi32( chain, chainIncrement1 );
1469
c2 = _mm_add_epi32( chain, chainIncrement2 );
1470
c3 = _mm_add_epi32( c1, chainIncrement2 );
1471
c4 = _mm_add_epi32( c2, chainIncrement2 );
1472
c5 = _mm_add_epi32( c3, chainIncrement2 );
1473
c6 = _mm_add_epi32( c4, chainIncrement2 );
1474
c7 = _mm_add_epi32( c5, chainIncrement2 );
1475
chain = _mm_add_epi32( c6, chainIncrement2 );
1476
1477
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1478
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1479
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1480
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1481
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1482
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1483
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1484
c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
1485
1486
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1487
1488
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
1489
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1490
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) );
1491
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) );
1492
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) );
1493
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) );
1494
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) );
1495
_mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) );
1496
1497
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1498
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1499
nBlocks -= 8;
1500
1501
if( todo == 0 )
1502
{
1503
CLMUL_3_POST( a0, a1, a2 );
1504
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1505
1506
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1507
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1508
}
1509
}
1510
1511
// We now have at least 8 blocks of encrypted data to GHASH and at most 7 blocks left to encrypt
1512
// Do 8 blocks of GHASH in parallel with generating 0, 4, or 8 AES-CTR blocks for tail encryption
1513
nBlocks -= 8;
1514
if (nBlocks > 0)
1515
{
1516
c0 = chain;
1517
c1 = _mm_add_epi32( chain, chainIncrement1 );
1518
c2 = _mm_add_epi32( chain, chainIncrement2 );
1519
c3 = _mm_add_epi32( c1, chainIncrement2 );
1520
c4 = _mm_add_epi32( c2, chainIncrement2 );
1521
1522
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1523
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1524
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1525
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1526
1527
if (nBlocks > 4)
1528
{
1529
// Do 8 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
1530
c5 = _mm_add_epi32( c4, chainIncrement1 );
1531
c6 = _mm_add_epi32( c4, chainIncrement2 );
1532
1533
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1534
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1535
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1536
1537
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1538
}
1539
else
1540
{
1541
// Do 4 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
1542
AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1543
}
1544
1545
if( todo == 0)
1546
{
1547
CLMUL_3_POST( a0, a1, a2 );
1548
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1549
1550
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1551
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1552
}
1553
}
1554
else
1555
{
1556
// Just do the final 8 rounds of GHASH
1557
for( todo=8; todo>0; todo-- )
1558
{
1559
r0 = _mm_shuffle_epi8( _mm_loadu_si128( (__m128i *) (pbGhashSrc + 0) ), BYTE_REVERSE_ORDER );
1560
pbGhashSrc += SYMCRYPT_AES_BLOCK_SIZE;
1561
1562
CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1563
}
1564
1565
CLMUL_3_POST( a0, a1, a2 );
1566
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1567
}
1568
}
1569
1570
if( nBlocks > 0 )
1571
{
1572
// Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results
1573
while( nBlocks >= 2 )
1574
{
1575
chain = _mm_add_epi32( chain, chainIncrement2 );
1576
1577
r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) );
1578
r1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) );
1579
1580
_mm_storeu_si128( (__m128i *) (pbDst + 0), r0 );
1581
_mm_storeu_si128( (__m128i *) (pbDst + 16), r1 );
1582
1583
r0 = _mm_shuffle_epi8( r0, BYTE_REVERSE_ORDER );
1584
r1 = _mm_shuffle_epi8( r1, BYTE_REVERSE_ORDER );
1585
1586
CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, todo - 0), GHASH_Hx_POWER(expandedKeyTable, todo - 0), a0, a1, a2 );
1587
CLMUL_ACC_3( r1, GHASH_H_POWER(expandedKeyTable, todo - 1), GHASH_Hx_POWER(expandedKeyTable, todo - 1), a0, a1, a2 );
1588
1589
pbDst += 2*SYMCRYPT_AES_BLOCK_SIZE;
1590
pbSrc += 2*SYMCRYPT_AES_BLOCK_SIZE;
1591
todo -= 2;
1592
nBlocks -= 2;
1593
c0 = c2;
1594
c1 = c3;
1595
c2 = c4;
1596
c3 = c5;
1597
c4 = c6;
1598
}
1599
1600
if( nBlocks > 0 )
1601
{
1602
chain = _mm_add_epi32( chain, chainIncrement1 );
1603
1604
r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) );
1605
1606
_mm_storeu_si128( (__m128i *) (pbDst + 0), r0 );
1607
1608
r0 = _mm_shuffle_epi8( r0, BYTE_REVERSE_ORDER );
1609
1610
CLMUL_ACC_3( r0, GHASH_H_POWER(expandedKeyTable, 1), GHASH_Hx_POWER(expandedKeyTable, 1), a0, a1, a2 );
1611
}
1612
1613
CLMUL_3_POST( a0, a1, a2 );
1614
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1615
}
1616
1617
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1618
_mm_storeu_si128( (__m128i *) pbChainingValue, chain );
1619
_mm_storeu_si128( (__m128i *) pState, state );
1620
}
1621
1622
#pragma warning(push)
1623
#pragma warning( disable:4701 )
1624
#pragma runtime_checks( "u", off )
1625
// This call is functionally identical to:
1626
// SymCryptGHashAppendDataPclmulqdq( expandedKeyTable,
1627
// pState,
1628
// pbSrc,
1629
// cbData );
1630
// SymCryptAesCtrMsb32Xmm( pExpandedKey,
1631
// pbChainingValue,
1632
// pbSrc,
1633
// pbDst,
1634
// cbData );
1635
VOID
1636
SYMCRYPT_CALL
1637
SymCryptAesGcmDecryptStitchedXmm(
1638
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1639
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
1640
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
1641
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
1642
_In_reads_( cbData ) PCBYTE pbSrc,
1643
_Out_writes_( cbData ) PBYTE pbDst,
1644
SIZE_T cbData )
1645
{
1646
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
1647
1648
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
1649
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
1650
__m128i vMultiplicationConstant = _mm_set_epi32( 0, 0, 0xc2000000, 0 );
1651
1652
__m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
1653
__m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
1654
1655
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
1656
1657
__m128i state;
1658
__m128i a0, a1, a2;
1659
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1660
SIZE_T todo = 0;
1661
PCBYTE pbGhashSrc = pbSrc;
1662
1663
SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1664
1665
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1666
state = _mm_loadu_si128( (__m128i *) pState );
1667
1668
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1669
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1670
1671
while( nBlocks >= 8 )
1672
{
1673
// In this loop we always have 8 blocks to decrypt and GHASH
1674
c0 = chain;
1675
c1 = _mm_add_epi32( chain, chainIncrement1 );
1676
c2 = _mm_add_epi32( chain, chainIncrement2 );
1677
c3 = _mm_add_epi32( c1, chainIncrement2 );
1678
c4 = _mm_add_epi32( c2, chainIncrement2 );
1679
c5 = _mm_add_epi32( c3, chainIncrement2 );
1680
c6 = _mm_add_epi32( c4, chainIncrement2 );
1681
c7 = _mm_add_epi32( c5, chainIncrement2 );
1682
chain = _mm_add_epi32( c6, chainIncrement2 );
1683
1684
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1685
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1686
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1687
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1688
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1689
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1690
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1691
c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
1692
1693
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, 8, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1694
1695
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
1696
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1697
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32) ) ) );
1698
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48) ) ) );
1699
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64) ) ) );
1700
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80) ) ) );
1701
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96) ) ) );
1702
_mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112) ) ) );
1703
1704
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1705
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1706
nBlocks -= 8;
1707
1708
if ( todo == 0 )
1709
{
1710
CLMUL_3_POST( a0, a1, a2 );
1711
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1712
1713
if ( nBlocks > 0 )
1714
{
1715
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PCLMULQDQ_HPOWERS );
1716
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1717
}
1718
}
1719
}
1720
1721
if( nBlocks > 0 )
1722
{
1723
// We have 1-7 blocks to GHASH and decrypt
1724
// Do the exact number of GHASH blocks we need in parallel with generating either 4 or 8 blocks of AES-CTR
1725
c0 = chain;
1726
c1 = _mm_add_epi32( chain, chainIncrement1 );
1727
c2 = _mm_add_epi32( chain, chainIncrement2 );
1728
c3 = _mm_add_epi32( c1, chainIncrement2 );
1729
c4 = _mm_add_epi32( c2, chainIncrement2 );
1730
1731
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
1732
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
1733
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
1734
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
1735
1736
if( nBlocks > 4 )
1737
{
1738
c5 = _mm_add_epi32( c4, chainIncrement1 );
1739
c6 = _mm_add_epi32( c4, chainIncrement2 );
1740
1741
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
1742
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
1743
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
1744
1745
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, nBlocks, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1746
} else {
1747
AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pbGhashSrc, nBlocks, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
1748
}
1749
1750
CLMUL_3_POST( a0, a1, a2 );
1751
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1752
1753
// Decrypt 1-7 blocks with pre-generated AES-CTR blocks
1754
while( nBlocks >= 2 )
1755
{
1756
chain = _mm_add_epi32( chain, chainIncrement2 );
1757
1758
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
1759
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
1760
1761
pbDst += 2*SYMCRYPT_AES_BLOCK_SIZE;
1762
pbSrc += 2*SYMCRYPT_AES_BLOCK_SIZE;
1763
nBlocks -= 2;
1764
c0 = c2;
1765
c1 = c3;
1766
c2 = c4;
1767
c3 = c5;
1768
c4 = c6;
1769
}
1770
1771
if( nBlocks > 0 )
1772
{
1773
chain = _mm_add_epi32( chain, chainIncrement1 );
1774
1775
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
1776
}
1777
}
1778
1779
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
1780
_mm_storeu_si128( (__m128i *) pbChainingValue, chain );
1781
_mm_storeu_si128((__m128i *)pState, state );
1782
}
1783
// MSVC-only: put the "u" (use of uninitialized variable) runtime check back to
// the setting chosen on the compiler command line.
// NOTE(review): presumably paired with a `#pragma runtime_checks( "u", off )`
// placed before the preceding function; that directive is outside this view - confirm.
#pragma runtime_checks( "u", restore )
1784
// MSVC-only: pop the warning state.
// NOTE(review): the matching `#pragma warning(push)` is not visible here - confirm.
#pragma warning(pop)
1785
1786
// Undo the per-function target("ssse3,aes,pclmul") setting that was pushed at
// the top of this file, using the form that matches the compiler in use.
#ifdef __clang__
1787
#pragma clang attribute pop
1788
#else
1789
#pragma GCC pop_options
1790
#endif
1791
1792
#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
1793
1794