// Source: GitHub repository wine-mirror/wine
// Path: blob/master/libs/symcrypt/lib/aes-neon.c
1
//
2
// aes-neon.c code for AES implementation
3
//
4
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
5
//
6
// All NEON-based code for AES operations
7
//
8
9
#include "precomp.h"
10
11
#if SYMCRYPT_CPU_ARM64
12
13
#pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
14
15
// All-zero 128-bit NEON vector (both 64-bit lanes are set to 0).
#define vzeroq() vdupq_n_u64(0)
16
17
18
VOID
19
SYMCRYPT_CALL
20
SymCryptAes4SboxNeon( _In_reads_(4) PCBYTE pIn, _Out_writes_(4) PBYTE pOut )
21
{
22
/*
23
__m128i x;
24
25
x = _mm_set1_epi32( *(int *) pIn );
26
27
x = _mm_aeskeygenassist_si128( x, 0 );
28
29
*(unsigned *) pOut = x.m128i_u32[0];
30
*/
31
__n128 x;
32
33
//
34
// There is no pure S-box lookup instruction, but the AESE instruction
35
// does a ShiftRow followed by a SubBytes.
36
// If we duplicate the input value to all 4 lanes, then the ShiftRow does nothing
37
// and the SubBytes will do the S-box lookup.
38
//
39
x = vdupq_n_u32( *(unsigned int *) pIn );
40
x = vaeseq_u8( x, vzeroq() );
41
vst1q_lane_s32( pOut, x, 0 );
42
//*(unsigned int *) pOut = x.n128_u32[0];
43
}
44
45
46
VOID
47
SYMCRYPT_CALL
48
SymCryptAesCreateDecryptionRoundKeyNeon(
49
_In_reads_(16) PCBYTE pEncryptionRoundKey,
50
_Out_writes_(16) PBYTE pDecryptionRoundKey )
51
{
52
*(__n128 *) pDecryptionRoundKey = vaesimcq_u8( *(__n128 *)pEncryptionRoundKey );
53
}
54
55
//
// One full round of AES encryption on state c with round key rk: AESE followed by AESMC.
//
// When doing a full round of AES encryption, make sure to give the compiler the opportunity
// to schedule dependent aese/aesmc pairs adjacently, to enable instruction fusion in many
// arm64 CPUs.
//
// c is both read and written (and evaluated more than once); the expansion is a brace block
// followed by ';', so the macro is invoked without a trailing semicolon inside other macros.
//
#define AESE_AESMC( c, rk ) \
{ \
    c = vaeseq_u8( c, rk ); \
    c = vaesmcq_u8( c ); \
};
64
65
//
// One full round of AES decryption on state c with round key rk: AESD followed by AESIMC.
//
// When doing a full round of AES decryption, make sure to give the compiler the opportunity
// to schedule dependent aesd/aesimc pairs adjacently, to enable instruction fusion in many
// arm64 CPUs.
//
// c is both read and written (and evaluated more than once); the expansion is a brace block
// followed by ';', so the macro is invoked without a trailing semicolon inside other macros.
//
#define AESD_AESIMC( c, rk ) \
{ \
    c = vaesdq_u8( c, rk ); \
    c = vaesimcq_u8( c ); \
};
74
75
//
// Using a loop with AESE_AESMC and AESD_AESIMC, the compiler can still prematurely rearrange
// the loop and lose the opportunity for scheduling adjacent pairs.
// Instead, explicitly unroll the AES rounds with this macro.
//
// Takes the names of first_round, full_round, and final_round macros, and uses them to
// construct a block that handles AES (128|192|256) for either encrypt or decrypt. For now
// assume at most 8 state variables (c0..c7) are needed in the macros.
//
// Assumes roundKey, keyPtr, and keyLimit are defined in the calling context:
//  - keyPtr   points at the first round key to use and is advanced as keys are consumed;
//  - keyLimit marks where the key schedule ends; after the 9 common rounds, two more rounds
//             are added each time keyPtr is still below keyLimit (at most twice);
//  - roundKey receives each loaded key and is what the round macros operate with.
//
// Sequence: 1 first_round + 8 full_round (9 rounds), then 0, 2 or 4 extra full_round, then
// final_round. On entry to final_round, roundKey holds the key for the last round and keyPtr
// points at the key after it.
//
// The round macros are invoked without trailing semicolons, so they must expand to complete
// statements or blocks.
//
#define UNROLL_AES_ROUNDS_FIRST( first_round, full_round, final_round, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    /* Do 9 rounds: first_round + 8 full_round (AES-128|AES-192|AES-256) */ \
    roundKey = *keyPtr++; \
    first_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    \
    if ( keyPtr < keyLimit ) \
    { \
        /* Do 2 more full rounds (AES-192|AES-256) */ \
        full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        \
        if ( keyPtr < keyLimit ) \
        { \
            /* Do 2 more full rounds (AES-256) */ \
            full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
            full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
        } \
    } \
    \
    /* Do final round (AES-128|AES-192|AES-256) */ \
    final_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
};
128
129
//
// Convenience form of UNROLL_AES_ROUNDS_FIRST for the common case where the first round is
// the same as every other full round: full_round is passed as both first_round and
// full_round. Only AES_ENCRYPT_1_CHAIN needs to specify the first round differently.
//
#define UNROLL_AES_ROUNDS( full_round, final_round, c0, c1, c2, c3, c4, c5, c6, c7 ) \
    UNROLL_AES_ROUNDS_FIRST( full_round, full_round, final_round, c0, c1, c2, c3, c4, c5, c6, c7 )
132
133
//
// Single-block AES encryption building blocks.
//
// The round macros take 8 state arguments to match the signature expected by
// UNROLL_AES_ROUNDS; only c0 is used here, c1..c7 are ignored. roundKey and keyPtr come
// from the enclosing AES_ENCRYPT_1 expansion.
//

// Full round: AESE + AESMC on c0 with the current roundKey
#define AES_ENCRYPT_ROUND_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, roundKey ) \
};

// Final round: AESE (no AESMC) with the current roundKey, then XOR with the key at *keyPtr
#define AES_ENCRYPT_FINAL_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
};

//
// Encrypt the single block held in c0 in place, using the key schedule in pExpandedKey.
// Keys are read from RoundKey[0] onward; lastEncRoundKey is the key limit that lets
// UNROLL_AES_ROUNDS choose the number of rounds.
//
#define AES_ENCRYPT_1( pExpandedKey, c0 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    \
    UNROLL_AES_ROUNDS( \
        AES_ENCRYPT_ROUND_1, \
        AES_ENCRYPT_FINAL_1, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
159
160
//
// Perform AES encryption without the last round key and with a specified first round key
//
// For algorithms where performance is dominated by a chain of dependent AES rounds
// (i.e. CBC encryption, CCM, CMAC) we can gain a reasonable performance uplift by computing
// (last round key ^ this plaintext block ^ first round key) off the critical path and using
// this computed value in place of the first round key in the first AESE instruction.
//

// First round: AESE + AESMC on c0 using mergedFirstRoundKey instead of roundKey.
// (UNROLL_AES_ROUNDS_FIRST still loads the first key into roundKey; it is not used here.)
#define AES_ENCRYPT_CHAIN_FIRST_1( c0, mergedFirstRoundKey, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, mergedFirstRoundKey ) \
};

// Final round: AESE with the current roundKey only; the last round key is NOT XORed in
#define AES_ENCRYPT_CHAIN_FINAL_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
};

//
// Chained single-block encryption of c0 in place. On exit c0 still lacks the XOR with the
// last round key; the caller has to account for that (by XORing it in, or by folding it
// into the next mergedFirstRoundKey).
//
#define AES_ENCRYPT_1_CHAIN( pExpandedKey, c0, mergedFirstRoundKey ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    \
    UNROLL_AES_ROUNDS_FIRST( \
        AES_ENCRYPT_CHAIN_FIRST_1, \
        AES_ENCRYPT_ROUND_1, \
        AES_ENCRYPT_CHAIN_FINAL_1, \
        c0, mergedFirstRoundKey, c2, c3, c4, c5, c6, c7 \
    ) \
};
190
191
//
// 4-block parallel AES encryption building blocks. Only c0..c3 are used; c4..c7 exist to
// match the 8-state signature expected by UNROLL_AES_ROUNDS. roundKey and keyPtr come from
// the enclosing AES_ENCRYPT_4 expansion.
//

// Full round: AESE + AESMC on each of c0..c3 with the current roundKey
#define AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
};

// Final round: AESE (no AESMC) with the current roundKey, then XOR with the key at *keyPtr
#define AES_ENCRYPT_FINAL_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
    c1 = vaeseq_u8( c1, roundKey ); \
    c2 = vaeseq_u8( c2, roundKey ); \
    c3 = vaeseq_u8( c3, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
};

// Encrypt the 4 blocks held in c0..c3 in place, using the key schedule in pExpandedKey
#define AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    \
    UNROLL_AES_ROUNDS( \
        AES_ENCRYPT_ROUND_4, \
        AES_ENCRYPT_FINAL_4, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
226
227
//
// 8-block parallel AES encryption building blocks. roundKey and keyPtr come from the
// enclosing AES_ENCRYPT_8 expansion.
//

// Full round: AESE + AESMC on each of c0..c7 with the current roundKey
#define AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
    AESE_AESMC( c4, roundKey ) \
    AESE_AESMC( c5, roundKey ) \
    AESE_AESMC( c6, roundKey ) \
    AESE_AESMC( c7, roundKey ) \
};

// Final round: AESE (no AESMC) with the current roundKey, then XOR with the key at *keyPtr
#define AES_ENCRYPT_FINAL_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
    c1 = vaeseq_u8( c1, roundKey ); \
    c2 = vaeseq_u8( c2, roundKey ); \
    c3 = vaeseq_u8( c3, roundKey ); \
    c4 = vaeseq_u8( c4, roundKey ); \
    c5 = vaeseq_u8( c5, roundKey ); \
    c6 = vaeseq_u8( c6, roundKey ); \
    c7 = vaeseq_u8( c7, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
    c4 = veorq_u8( c4, roundKey ); \
    c5 = veorq_u8( c5, roundKey ); \
    c6 = veorq_u8( c6, roundKey ); \
    c7 = veorq_u8( c7, roundKey ); \
};

// Encrypt the 8 blocks held in c0..c7 in place, using the key schedule in pExpandedKey
#define AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    \
    UNROLL_AES_ROUNDS( \
        AES_ENCRYPT_ROUND_8, \
        AES_ENCRYPT_FINAL_8, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
274
275
//
// Single-block AES decryption building blocks. Only c0 is used; c1..c7 exist to match the
// 8-state signature expected by UNROLL_AES_ROUNDS. roundKey and keyPtr come from the
// enclosing AES_DECRYPT_1 expansion.
//

// Full round: AESD + AESIMC on c0 with the current roundKey
#define AES_DECRYPT_ROUND_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESD_AESIMC( c0, roundKey ) \
};

// Final round: AESD (no AESIMC) with the current roundKey, then XOR with the key at *keyPtr
#define AES_DECRYPT_FINAL_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaesdq_u8( c0, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
};

//
// Decrypt the single block held in c0 in place. Keys are read starting at lastEncRoundKey;
// lastDecRoundKey is the key limit that lets UNROLL_AES_ROUNDS choose the number of rounds.
//
#define AES_DECRYPT_1( pExpandedKey, c0 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    keyLimit = (const __n128 *)pExpandedKey->lastDecRoundKey; \
    \
    UNROLL_AES_ROUNDS( \
        AES_DECRYPT_ROUND_1, \
        AES_DECRYPT_FINAL_1, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
301
302
//
// 4-block parallel AES decryption building blocks. Only c0..c3 are used; c4..c7 exist to
// match the 8-state signature expected by UNROLL_AES_ROUNDS. roundKey and keyPtr come from
// the enclosing AES_DECRYPT_4 expansion.
//

// Full round: AESD + AESIMC on each of c0..c3 with the current roundKey
#define AES_DECRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESD_AESIMC( c0, roundKey ) \
    AESD_AESIMC( c1, roundKey ) \
    AESD_AESIMC( c2, roundKey ) \
    AESD_AESIMC( c3, roundKey ) \
};

// Final round: AESD (no AESIMC) with the current roundKey, then XOR with the key at *keyPtr
#define AES_DECRYPT_FINAL_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaesdq_u8( c0, roundKey ); \
    c1 = vaesdq_u8( c1, roundKey ); \
    c2 = vaesdq_u8( c2, roundKey ); \
    c3 = vaesdq_u8( c3, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
};

// Decrypt the 4 blocks held in c0..c3 in place, using the key schedule in pExpandedKey
#define AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    keyLimit = (const __n128 *)pExpandedKey->lastDecRoundKey; \
    \
    UNROLL_AES_ROUNDS( \
        AES_DECRYPT_ROUND_4, \
        AES_DECRYPT_FINAL_4, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
337
338
//
// 8-block parallel AES decryption building blocks. roundKey and keyPtr come from the
// enclosing AES_DECRYPT_8 expansion.
//

// Full round: AESD + AESIMC on each of c0..c7 with the current roundKey
#define AES_DECRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESD_AESIMC( c0, roundKey ) \
    AESD_AESIMC( c1, roundKey ) \
    AESD_AESIMC( c2, roundKey ) \
    AESD_AESIMC( c3, roundKey ) \
    AESD_AESIMC( c4, roundKey ) \
    AESD_AESIMC( c5, roundKey ) \
    AESD_AESIMC( c6, roundKey ) \
    AESD_AESIMC( c7, roundKey ) \
};

// Final round: AESD (no AESIMC) with the current roundKey, then XOR with the key at *keyPtr
#define AES_DECRYPT_FINAL_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaesdq_u8( c0, roundKey ); \
    c1 = vaesdq_u8( c1, roundKey ); \
    c2 = vaesdq_u8( c2, roundKey ); \
    c3 = vaesdq_u8( c3, roundKey ); \
    c4 = vaesdq_u8( c4, roundKey ); \
    c5 = vaesdq_u8( c5, roundKey ); \
    c6 = vaesdq_u8( c6, roundKey ); \
    c7 = vaesdq_u8( c7, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
    c4 = veorq_u8( c4, roundKey ); \
    c5 = veorq_u8( c5, roundKey ); \
    c6 = veorq_u8( c6, roundKey ); \
    c7 = veorq_u8( c7, roundKey ); \
};

// Decrypt the 8 blocks held in c0..c7 in place, using the key schedule in pExpandedKey
#define AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    keyLimit = (const __n128 *)pExpandedKey->lastDecRoundKey; \
    \
    UNROLL_AES_ROUNDS( \
        AES_DECRYPT_ROUND_8, \
        AES_DECRYPT_FINAL_8, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
385
386
387
388
VOID
389
SYMCRYPT_CALL
390
SymCryptAesEncryptNeon(
391
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
392
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PCBYTE pbSrc,
393
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbDst )
394
{
395
__n128 c;
396
397
c = *( __n128 * ) pbSrc;
398
399
AES_ENCRYPT_1( pExpandedKey, c );
400
401
*(__n128 *) pbDst = c;
402
}
403
404
VOID
405
SYMCRYPT_CALL
406
SymCryptAesDecryptNeon(
407
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
408
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PCBYTE pbSrc,
409
_Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbDst )
410
{
411
__n128 c;
412
413
c = *( __n128 * ) pbSrc;
414
415
AES_DECRYPT_1( pExpandedKey, c );
416
417
*(__n128 *) pbDst = c;
418
}
419
420
421
VOID
422
SYMCRYPT_CALL
423
SymCryptAesCbcEncryptNeon(
424
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
425
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
426
_In_reads_( cbData ) PCBYTE pbSrc,
427
_Out_writes_( cbData ) PBYTE pbDst,
428
SIZE_T cbData )
429
{
430
__n128 c = *(__n128 *)pbChainingValue;
431
__n128 rk0 = *(__n128 *) &pExpandedKey->RoundKey[0];
432
__n128 rkLast = *(__n128 *) pExpandedKey->lastEncRoundKey;
433
__n128 d, rk0AndLast;
434
435
// This algorithm is dominated by chain of dependent AES rounds, so we want to avoid EOR
436
// instructions on the critical path where possible
437
// We can compute (last round key ^ this plaintext block ^ first round key) off the critical
438
// path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
439
// the main loop
440
rk0AndLast = veorq_u8( rk0, rkLast );
441
442
c = veorq_u8( c, rkLast );
443
444
while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
445
{
446
d = veorq_u8( *(__n128 *)pbSrc, rk0AndLast);
447
AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d );
448
*(__n128 *)pbDst = veorq_u8( c, rkLast );
449
450
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
451
pbDst += SYMCRYPT_AES_BLOCK_SIZE;
452
cbData -= SYMCRYPT_AES_BLOCK_SIZE;
453
}
454
*(__n128 *)pbChainingValue = veorq_u8( c, rkLast );
455
}
456
457
// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
458
#pragma warning(push)
459
#pragma warning( disable: 6001 4701 )
460
#pragma runtime_checks( "u", off )
461
VOID
462
SYMCRYPT_CALL
463
SymCryptAesCbcDecryptNeon(
464
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
465
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
466
_In_reads_( cbData ) PCBYTE pbSrc,
467
_Out_writes_( cbData ) PBYTE pbDst,
468
SIZE_T cbData )
469
{
470
__n128 chain;
471
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
472
__n128 d0, d1, d2, d3, d4, d5, d6, d7;
473
const __n128 * pSrc = (const __n128 *) pbSrc;
474
__n128 * pDst = (__n128 *) pbDst;
475
SIZE_T cData = cbData / SYMCRYPT_AES_BLOCK_SIZE;
476
477
if( cData < 1 )
478
{
479
return;
480
}
481
482
chain = *(__n128 *) pbChainingValue;
483
484
//
485
// First we do all multiples of 8 blocks
486
//
487
488
while( cData >= 8 )
489
{
490
d0 = c0 = pSrc[0];
491
d1 = c1 = pSrc[1];
492
d2 = c2 = pSrc[2];
493
d3 = c3 = pSrc[3];
494
d4 = c4 = pSrc[4];
495
d5 = c5 = pSrc[5];
496
d6 = c6 = pSrc[6];
497
d7 = c7 = pSrc[7];
498
499
AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
500
501
c0 = veorq_u8( c0, chain );
502
c1 = veorq_u8( c1, d0 );
503
c2 = veorq_u8( c2, d1 );
504
c3 = veorq_u8( c3, d2 );
505
c4 = veorq_u8( c4, d3 );
506
c5 = veorq_u8( c5, d4 );
507
c6 = veorq_u8( c6, d5 );
508
c7 = veorq_u8( c7, d6 );
509
chain = d7;
510
511
pDst[0] = c0;
512
pDst[1] = c1;
513
pDst[2] = c2;
514
pDst[3] = c3;
515
pDst[4] = c4;
516
pDst[5] = c5;
517
pDst[6] = c6;
518
pDst[7] = c7;
519
520
pSrc += 8;
521
pDst += 8;
522
cData -= 8;
523
}
524
525
if( cData >= 1 )
526
{
527
//
528
// There is remaining work to be done
529
//
530
d0 = c0 = pSrc[0];
531
if( cData >= 2 )
532
{
533
d1 = c1 = pSrc[1];
534
if( cData >= 3 )
535
{
536
d2 = c2 = pSrc[2];
537
if( cData >= 4 )
538
{
539
d3 = c3 = pSrc[3];
540
if( cData >= 5 )
541
{
542
d4 = c4 = pSrc[4];
543
if( cData >= 6 )
544
{
545
d5 = c5 = pSrc[5];
546
if( cData >= 7 )
547
{
548
d6 = c6 = pSrc[6];
549
}
550
}
551
}
552
}
553
}
554
}
555
556
//
557
// Decrypt 1, 4, or 8 blocks in AES-CBC mode. This might decrypt uninitialized registers,
558
// but those will not be used when we store the results.
559
//
560
if( cData > 4 )
561
{
562
AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
563
c0 = veorq_u8( c0, chain );
564
c1 = veorq_u8( c1, d0 );
565
c2 = veorq_u8( c2, d1 );
566
c3 = veorq_u8( c3, d2 );
567
c4 = veorq_u8( c4, d3 );
568
c5 = veorq_u8( c5, d4 );
569
c6 = veorq_u8( c6, d5 );
570
}
571
else if( cData > 1 )
572
{
573
AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 );
574
c0 = veorq_u8( c0, chain );
575
c1 = veorq_u8( c1, d0 );
576
c2 = veorq_u8( c2, d1 );
577
c3 = veorq_u8( c3, d2 );
578
} else
579
{
580
AES_DECRYPT_1( pExpandedKey, c0 );
581
c0 = veorq_u8( c0, chain );
582
}
583
584
chain = pSrc[ cData - 1];
585
pDst[0] = c0;
586
if( cData >= 2 )
587
{
588
pDst[1] = c1;
589
if( cData >= 3 )
590
{
591
pDst[2] = c2;
592
if( cData >= 4 )
593
{
594
pDst[3] = c3;
595
if( cData >= 5 )
596
{
597
pDst[4] = c4;
598
if( cData >= 6 )
599
{
600
pDst[5] = c5;
601
if( cData >= 7 )
602
{
603
pDst[6] = c6;
604
}
605
}
606
}
607
}
608
}
609
}
610
}
611
612
*(__n128 *)pbChainingValue = chain;
613
614
return;
615
}
616
#pragma runtime_checks( "u", restore )
617
#pragma warning( pop )
618
619
620
621
VOID
622
SYMCRYPT_CALL
623
SymCryptAesCbcMacNeon(
624
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
625
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
626
_In_reads_( cbData ) PCBYTE pbData,
627
SIZE_T cbData )
628
{
629
__n128 c = *(__n128 *)pbChainingValue;
630
__n128 rk0 = *(__n128 *) &pExpandedKey->RoundKey[0];
631
__n128 rkLast = *(__n128 *) pExpandedKey->lastEncRoundKey;
632
__n128 d, rk0AndLast;
633
634
// This algorithm is dominated by chain of dependent AES rounds, so we want to avoid EOR
635
// instructions on the critical path where possible
636
// We can compute (last round key ^ this plaintext block ^ first round key) off the critical
637
// path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
638
// the main loop
639
rk0AndLast = veorq_u8( rk0, rkLast );
640
641
c = veorq_u8( c, rkLast );
642
643
while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
644
{
645
d = veorq_u8( *(__n128 *)pbData, rk0AndLast);
646
AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d );
647
648
pbData += SYMCRYPT_AES_BLOCK_SIZE;
649
cbData -= SYMCRYPT_AES_BLOCK_SIZE;
650
}
651
*(__n128 *)pbChainingValue = veorq_u8( c, rkLast );
652
}
653
654
// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
655
#pragma warning(push)
656
#pragma warning( disable: 6001 4701 )
657
#pragma runtime_checks( "u", off )
658
VOID
659
SYMCRYPT_CALL
660
SymCryptAesEcbEncryptNeon(
661
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
662
_In_reads_( cbData ) PCBYTE pbSrc,
663
_Out_writes_( cbData ) PBYTE pbDst,
664
SIZE_T cbData )
665
{
666
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
667
const __n128 * pSrc = (const __n128 *) pbSrc;
668
__n128 * pDst = (__n128 *) pbDst;
669
670
while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
671
{
672
c0 = pSrc[0];
673
c1 = pSrc[1];
674
c2 = pSrc[2];
675
c3 = pSrc[3];
676
c4 = pSrc[4];
677
c5 = pSrc[5];
678
c6 = pSrc[6];
679
c7 = pSrc[7];
680
681
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
682
683
pDst[0] = c0;
684
pDst[1] = c1;
685
pDst[2] = c2;
686
pDst[3] = c3;
687
pDst[4] = c4;
688
pDst[5] = c5;
689
pDst[6] = c6;
690
pDst[7] = c7;
691
692
pSrc += 8;
693
pDst += 8;
694
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
695
}
696
697
if( cbData < 16 )
698
{
699
return;
700
}
701
702
c0 = pSrc[0];
703
if( cbData >= 32 )
704
{
705
c1 = pSrc[1];
706
if( cbData >= 48 )
707
{
708
c2 = pSrc[2];
709
if( cbData >= 64 )
710
{
711
c3 = pSrc[3];
712
if( cbData >= 80 )
713
{
714
c4 = pSrc[4];
715
if( cbData >= 96 )
716
{
717
c5 = pSrc[5];
718
if( cbData >= 112 )
719
{
720
c6 = pSrc[6];
721
}
722
}
723
}
724
}
725
}
726
}
727
728
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
729
{
730
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
731
}
732
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
733
{
734
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
735
}
736
else
737
{
738
AES_ENCRYPT_1( pExpandedKey, c0 );
739
}
740
741
pDst[0] = c0;
742
if( cbData >= 32 )
743
{
744
pDst[1] = c1;
745
if( cbData >= 48 )
746
{
747
pDst[2] = c2;
748
if( cbData >= 64 )
749
{
750
pDst[3] = c3;
751
if( cbData >= 80 )
752
{
753
pDst[4] = c4;
754
if( cbData >= 96 )
755
{
756
pDst[5] = c5;
757
if( cbData >= 112 )
758
{
759
pDst[6] = c6;
760
}
761
}
762
}
763
}
764
}
765
}
766
}
767
#pragma runtime_checks( "u", restore)
768
#pragma warning( pop )
769
770
//
// CTR mode: aes-pattern.c is included twice, each time with SYMCRYPT_AesCtrMsbXxNeon,
// VADDQ_UXX and VSUBQ_UXX bound to a different function name / lane width.
// NOTE(review): presumably the pattern file defines a function named SYMCRYPT_AesCtrMsbXxNeon
// and uses VADDQ_UXX / VSUBQ_UXX for the counter arithmetic - confirm against aes-pattern.c.
//
#pragma warning(push)
#pragma warning( disable:4701 ) // "Use of uninitialized variable"
#pragma runtime_checks( "u", off )

// Instantiation 1: SymCryptAesCtrMsb64Neon, 64-bit lane add/subtract
#define SYMCRYPT_AesCtrMsbXxNeon SymCryptAesCtrMsb64Neon
#define VADDQ_UXX vaddq_u64
#define VSUBQ_UXX vsubq_u64

#include "aes-pattern.c"

#undef VSUBQ_UXX
#undef VADDQ_UXX
#undef SYMCRYPT_AesCtrMsbXxNeon

// Instantiation 2: SymCryptAesCtrMsb32Neon, 32-bit lane add/subtract
#define SYMCRYPT_AesCtrMsbXxNeon SymCryptAesCtrMsb32Neon
#define VADDQ_UXX vaddq_u32
#define VSUBQ_UXX vsubq_u32

#include "aes-pattern.c"

#undef VSUBQ_UXX
#undef VADDQ_UXX
#undef SYMCRYPT_AesCtrMsbXxNeon

#pragma runtime_checks( "u", restore )
#pragma warning(pop)
796
797
798
//
// Multiply by alpha
//
// <</>> indicate shifts on 128-bit values
// <<<</>>>> indicate shifts on 32-bit values
//

// Multiply by ALPHA (older formulation working on 32-bit words)
// t1 = Input <<<< 1                            words shifted left by 1
// t2 = Input >>>> 31                           words shifted right by 31
// t1 = t1 ^ (t2 << 32)                         t1 = S << 1
// t2 = t2 >> 96                                t2 = highest bit of S
// t2 = (t2 <<<< 7) + (t2 <<<<3) - (t2)         multiply polynomially by 0x87, we can use - because we only have one bit input
// res = t1 ^ t2
//
// Requires vZero (an all-zero __n128) in the calling context. _in is read completely before
// _res is written.
//
#define XTS_MUL_ALPHA_old( _in, _res ) \
{\
    __n128 _t1, _t2;\
    \
    _t1 = vshlq_n_u32( _in, 1 ); \
    _t2 = vshrq_n_u32( _in, 31); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3); \
    _t2 = vsubq_u32( vaddq_u32( vshlq_n_u32( _t2, 7 ), vshlq_n_u32( _t2, 3 ) ), _t2 ); \
    _res = veorq_u32( _t1, _t2 ); \
}
824
825
//
// Multiply by ALPHA, byte-wise formulation.
//
// Another approach, use signed shift right to duplicate the top bit of each byte across the
// whole byte, rotate the result up by one byte so that each byte sees the carry of the byte
// below it (and byte 0 sees the top bit of byte 15), and use an AND to mask the modulo
// reduction and the extraneous bits in the other bytes at the same time.
// vAlphaMask = (1, 1, ..., 1, 0x87 )
//
// Requires vAlphaMask in the calling context. _in is read completely before _res is written,
// so the same variable may be passed for both.
//
#define XTS_MUL_ALPHA( _in, _res ) \
{\
    __n128 _t1, _t2;\
    \
    _t1 = vshlq_n_u8( _in, 1 ); \
    _t2 = vshrq_n_s8( _in, 7 ); \
    _t2 = vextq_u8( _t2, _t2, 15 ); \
    _t2 = vandq_u8( _t2, vAlphaMask ); \
    _res = veorq_u8( _t2, _t1 ); \
}
840
841
842
// Multiply by ALPHA^2
// t1 = Input <<<< 2
// t2 = Input >>>> 30
// t1 = t1 ^ (t2 << 32)
// t2 = t2 >> 96
// t2 = (t2 <<<< 7) ^ (t2 <<<< 2) ^ (t2 <<<< 1) ^ t2
// res = t1 ^ t2
//
// Requires vZero (an all-zero __n128) in the calling context. _in is read completely before
// _res is written.
#define XTS_MUL_ALPHA2( _in, _res ) \
{\
    __n128 _t1, _t2;\
    \
    _t1 = vshlq_n_u32( _in, 2 ); \
    _t2 = vshrq_n_u32( _in, 30); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3 ); \
    _t2 = veorq_u32( veorq_u32( veorq_u32( _t2, vshlq_n_u32( _t2, 7 )), vshlq_n_u32( _t2, 2 ) ), vshlq_n_u32( _t2, 1 ) ); \
    _res = veorq_u32( _t1, _t2 ); \
}
860
861
// Multiply by ALPHA^4
// t1 = Input <<<< 4
// t2 = Input >>>> 28
// t1 = t1 ^ (t2 << 32)
// t2 = t2 >> 96
// t2 = (t2 <<<< 7) ^ (t2 <<<< 2) ^ (t2 <<<< 1) ^ t2
// res = t1 ^ t2
//
// Requires vZero (an all-zero __n128) in the calling context. _in is read completely before
// _res is written.
#define XTS_MUL_ALPHA4( _in, _res ) \
{\
    __n128 _t1, _t2;\
    \
    _t1 = vshlq_n_u32( _in, 4 ); \
    _t2 = vshrq_n_u32( _in, 28); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3 ); \
    _t2 = veorq_u32( veorq_u32( veorq_u32( _t2, vshlq_n_u32( _t2, 7 )), vshlq_n_u32( _t2, 2 ) ), vshlq_n_u32( _t2, 1 ) ); \
    _res = veorq_u32( _t1, _t2 ); \
}
879
880
// Multiply by ALPHA^5
// t1 = Input <<<< 5
// t2 = Input >>>> 27
// t1 = t1 ^ (t2 << 32)
// t2 = t2 >> 96
// t2 = (t2 <<<< 7) ^ (t2 <<<< 2) ^ (t2 <<<< 1) ^ t2
// res = t1 ^ t2
//
// Requires vZero (an all-zero __n128) in the calling context. _in is read completely before
// _res is written.
#define XTS_MUL_ALPHA5( _in, _res ) \
{\
    __n128 _t1, _t2;\
    \
    _t1 = vshlq_n_u32( _in, 5 ); \
    _t2 = vshrq_n_u32( _in, 27); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3 ); \
    _t2 = veorq_u32( veorq_u32( veorq_u32( _t2, vshlq_n_u32( _t2, 7 )), vshlq_n_u32( _t2, 2 ) ), vshlq_n_u32( _t2, 1 ) ); \
    _res = veorq_u32( _t1, _t2 ); \
}
891
892
// Multiply by ALPHA^8
// res = (Input << 8) | (Input >> 120)          rotate the 128-bit value up by one byte
// t2 = (Input >> 120) * 0x86                   polynomial multiply of the wrapped-around byte
//      i.e. ((Input >> 120) <<<< 7) ^ ((Input >> 120) <<<< 2) ^ ((Input >> 120) <<<< 1)
//      the 0x01 component is already in res where we want it
// res = res ^ t2
//
// vAlphaMultiplier = (0, 0, ..., 0, 0x86 )
//
// Requires vAlphaMultiplier in the calling context. _in is only read in the first statement,
// so the same variable may be passed for _in and _res.

#define XTS_MUL_ALPHA8( _in, _res ) \
{\
    __n128 _t2;\
    \
    _res = vextq_u8( _in, _in, 15 ); \
    _t2 = vmull_p8( vget_low_p8(_res), vAlphaMultiplier ); \
    _res = veorq_u32( _res, _t2 ); \
}
909
910
911
VOID
912
SYMCRYPT_CALL
913
SymCryptXtsAesEncryptDataUnitNeon(
914
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
915
_Inout_updates_(SYMCRYPT_AES_BLOCK_SIZE)PBYTE pbTweakBlock,
916
_In_reads_( cbData ) PCBYTE pbSrc,
917
_Out_writes_( cbData ) PBYTE pbDst,
918
SIZE_T cbData )
919
{
920
__n128 t0, t1, t2, t3, t4, t5, t6, t7;
921
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
922
const __n128 vZero = vmovq_n_u8(0);
923
const __n128 vAlphaMask = SYMCRYPT_SET_N128_U8(0x87, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
924
const __n64 vAlphaMultiplier = SYMCRYPT_SET_N64_U64(0x0000000000000086);
925
926
SIZE_T cbDataMain; // number of bytes to handle in the main loop
927
SIZE_T cbDataTail; // number of bytes to handle in the tail loop
928
BYTE tailBuf[2*SYMCRYPT_AES_BLOCK_SIZE];
929
930
SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);
931
932
// To simplify logic and unusual size processing, we handle all
933
// data not a multiple of 8 blocks in the tail loop
934
cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
935
// Additionally, so that ciphertext stealing logic does not rely on
936
// reading back from the destination buffer, when we have a non-zero
937
// tail, we ensure that we handle at least 1 whole block in the tail
938
//
939
// Note that our caller has ensured we have at least 1 whole block
940
// to process, this is checked in debug build
941
// This means that cbDataTail is in [1,15] at this point iff there are
942
// at least 8 whole blocks to process; so the below does not cause
943
// cbDataTail or cbDataMain to exceed cbData
944
cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
945
cbDataMain = cbData - cbDataTail;
946
947
SYMCRYPT_ASSERT(cbDataMain <= cbData);
948
SYMCRYPT_ASSERT(cbDataTail <= cbData);
949
SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
950
951
t0 = *(__n128 *)pbTweakBlock;
952
953
if( cbDataMain > 0 )
954
{
955
// Set up for main loop entry
956
// NOTE: We load the first 8 blocks and store the last 8 blocks out of the loop to allow
957
// greater instruction interleaving in the main loop.
958
// This appears to give about 5-8% performance uplift on little (in-order) cores and has
959
// no effect on big cores.
960
XTS_MUL_ALPHA4( t0, t4 );
961
XTS_MUL_ALPHA ( t0, t1 );
962
XTS_MUL_ALPHA ( t4, t5 );
963
XTS_MUL_ALPHA ( t1, t2 );
964
XTS_MUL_ALPHA ( t5, t6 );
965
XTS_MUL_ALPHA ( t2, t3 );
966
XTS_MUL_ALPHA ( t6, t7 );
967
968
c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
969
c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
970
c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
971
c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
972
c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
973
c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
974
c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
975
c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
976
977
for(;;)
978
{
979
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
980
981
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
982
983
cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
984
if( cbDataMain < 8 * SYMCRYPT_AES_BLOCK_SIZE )
985
{
986
break;
987
}
988
989
// Interleave the final xor, write, and compute next tweak block, and load, and first xor.
990
// This reduces register pressure and is more efficient.
991
vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
992
vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
993
vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
994
vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
995
vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
996
vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
997
vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
998
vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
999
1000
XTS_MUL_ALPHA8( t0, t0 );
1001
XTS_MUL_ALPHA8( t1, t1 );
1002
XTS_MUL_ALPHA8( t2, t2 );
1003
XTS_MUL_ALPHA8( t3, t3 );
1004
XTS_MUL_ALPHA8( t4, t4 );
1005
XTS_MUL_ALPHA8( t5, t5 );
1006
XTS_MUL_ALPHA8( t6, t6 );
1007
XTS_MUL_ALPHA8( t7, t7 );
1008
1009
c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
1010
c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
1011
c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
1012
c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
1013
c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
1014
c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
1015
c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
1016
c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
1017
1018
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1019
}
1020
1021
vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
1022
vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
1023
vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
1024
vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
1025
vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
1026
vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
1027
vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
1028
vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
1029
1030
// We won't do another 8-block set
1031
// Update only the first tweak block in case it is needed for tail
1032
XTS_MUL_ALPHA8( t0, t0 );
1033
1034
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1035
}
1036
1037
if( cbDataTail == 0 )
1038
{
1039
return; // <-- expected case; early return here
1040
}
1041
1042
// Rare case, with data unit length not being multiple of 128 bytes, handle the tail one block at a time
1043
while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
1044
{
1045
c0 = veorq_u32( vld1q_u8(pbSrc), t0 );
1046
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
1047
AES_ENCRYPT_1( pExpandedKey, c0 );
1048
vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1049
pbDst += SYMCRYPT_AES_BLOCK_SIZE;
1050
XTS_MUL_ALPHA( t0, t0 );
1051
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1052
}
1053
1054
if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
1055
{
1056
// Ciphertext stealing encryption
1057
//
1058
// +--------------+
1059
// | |
1060
// | V
1061
// +-----------------+ | +-----+-----------+
1062
// | P_m-1 | | | P_m |++++CP+++++|
1063
// +-----------------+ | +-----+-----------+
1064
// | | |
1065
// enc_m-1 | enc_m
1066
// | | |
1067
// V | V
1068
// +-----+-----------+ | +-----------------+
1069
// | C_m |++++CP+++++|--+ | C_m-1 |
1070
// +-----+-----------+ +-----------------+
1071
// | /
1072
// +---------------- / --+
1073
// / |
1074
// | V
1075
// +-----------------+ | +-----+
1076
// | C_m-1 |<-+ | C_m |
1077
// +-----------------+ +-----+
1078
1079
// Encrypt penultimate plaintext block into tailBuf
1080
c0 = veorq_u32( vld1q_u8(pbSrc), t0 );
1081
AES_ENCRYPT_1( pExpandedKey, c0 );
1082
c0 = veorq_u32( c0, t0 );
1083
vst1q_u8( &tailBuf[0], c0 );
1084
vst1q_u8( &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], c0 );
1085
1086
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1087
1088
// Copy final plaintext bytes to prefix of tailBuf - we must read before writing to support in-place encryption
1089
memcpy( &tailBuf[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
1090
// Copy prefix of tailBuf[SYMCRYPT_AES_BLOCK_SIZE] to the right place in the destination buffer
1091
memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], cbDataTail );
1092
1093
// Do final tweak update
1094
XTS_MUL_ALPHA( t0, t0 );
1095
1096
// Load updated tailBuf into c0
1097
c0 = vld1q_u8( &tailBuf[0] );
1098
} else {
1099
// Just load final plaintext block into c0
1100
c0 = vld1q_u8( pbSrc );
1101
}
1102
1103
// Final full block encryption
1104
c0 = veorq_u32( c0, t0 );
1105
AES_ENCRYPT_1( pExpandedKey, c0 );
1106
vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1107
}
1108
1109
1110
VOID
1111
SYMCRYPT_CALL
1112
SymCryptXtsAesDecryptDataUnitNeon(
1113
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1114
_Inout_updates_(SYMCRYPT_AES_BLOCK_SIZE)PBYTE pbTweakBlock,
1115
_In_reads_( cbData ) PCBYTE pbSrc,
1116
_Out_writes_( cbData ) PBYTE pbDst,
1117
SIZE_T cbData )
1118
{
1119
__n128 t0, t1, t2, t3, t4, t5, t6, t7;
1120
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
1121
const __n128 vZero = vmovq_n_u8(0);
1122
const __n128 vAlphaMask = SYMCRYPT_SET_N128_U8(0x87, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
1123
const __n64 vAlphaMultiplier = SYMCRYPT_SET_N64_U64(0x0000000000000086);
1124
1125
SIZE_T cbDataMain; // number of bytes to handle in the main loop
1126
SIZE_T cbDataTail; // number of bytes to handle in the tail loop
1127
BYTE tailBuf[2*SYMCRYPT_AES_BLOCK_SIZE];
1128
1129
SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);
1130
1131
// To simplify logic and unusual size processing, we handle all
1132
// data not a multiple of 8 blocks in the tail loop
1133
cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
1134
// Additionally, so that ciphertext stealing logic does not rely on
1135
// reading back from the destination buffer, when we have a non-zero
1136
// tail, we ensure that we handle at least 1 whole block in the tail
1137
//
1138
// Note that our caller has ensured we have at least 1 whole block
1139
// to process, this is checked in debug build
1140
// This means that cbDataTail is in [1,15] at this point iff there are
1141
// at least 8 whole blocks to process; so the below does not cause
1142
// cbDataTail or cbDataMain to exceed cbData
1143
cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
1144
cbDataMain = cbData - cbDataTail;
1145
1146
SYMCRYPT_ASSERT(cbDataMain <= cbData);
1147
SYMCRYPT_ASSERT(cbDataTail <= cbData);
1148
SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
1149
1150
t0 = *(__n128 *)pbTweakBlock;
1151
t7 = t0;
1152
1153
if( cbDataMain > 0 )
1154
{
1155
// Set up for main loop entry
1156
// NOTE: We load the first 8 blocks and store the last 8 blocks out of the loop to allow
1157
// greater instruction interleaving in the main loop.
1158
// This appears to give about 5-8% performance uplift on little (in-order) cores and has
1159
// no effect on big cores.
1160
XTS_MUL_ALPHA4( t0, t4 );
1161
XTS_MUL_ALPHA ( t0, t1 );
1162
XTS_MUL_ALPHA ( t4, t5 );
1163
XTS_MUL_ALPHA ( t1, t2 );
1164
XTS_MUL_ALPHA ( t5, t6 );
1165
XTS_MUL_ALPHA ( t2, t3 );
1166
XTS_MUL_ALPHA ( t6, t7 );
1167
1168
c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
1169
c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
1170
c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
1171
c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
1172
c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
1173
c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
1174
c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
1175
c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
1176
1177
for(;;)
1178
{
1179
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1180
1181
AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
1182
1183
cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
1184
if( cbDataMain < 8 * SYMCRYPT_AES_BLOCK_SIZE )
1185
{
1186
break;
1187
}
1188
1189
// Interleave the final xor, write, and compute next tweak block, and load, and first xor.
1190
// This reduces register pressure and is more efficient.
1191
vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
1192
vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
1193
vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
1194
vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
1195
vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
1196
vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
1197
vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
1198
vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
1199
1200
XTS_MUL_ALPHA8( t0, t0 );
1201
XTS_MUL_ALPHA8( t1, t1 );
1202
XTS_MUL_ALPHA8( t2, t2 );
1203
XTS_MUL_ALPHA8( t3, t3 );
1204
XTS_MUL_ALPHA8( t4, t4 );
1205
XTS_MUL_ALPHA8( t5, t5 );
1206
XTS_MUL_ALPHA8( t6, t6 );
1207
XTS_MUL_ALPHA8( t7, t7 );
1208
1209
c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
1210
c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
1211
c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
1212
c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
1213
c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
1214
c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
1215
c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
1216
c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
1217
1218
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1219
}
1220
1221
vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
1222
vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
1223
vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
1224
vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
1225
vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
1226
vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
1227
vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
1228
vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
1229
1230
// We won't do another 8-block set
1231
// Update only the first tweak block in case it is needed for tail
1232
XTS_MUL_ALPHA8( t0, t0 );
1233
1234
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1235
}
1236
1237
if( cbDataTail == 0 )
1238
{
1239
return; // <-- expected case; early return here
1240
}
1241
1242
// Rare case, with data unit length not being multiple of 128 bytes, handle the tail one block at a time
1243
while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
1244
{
1245
c0 = veorq_u32( vld1q_u8( pbSrc ), t0 );
1246
pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
1247
AES_DECRYPT_1( pExpandedKey, c0 );
1248
vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1249
pbDst += SYMCRYPT_AES_BLOCK_SIZE;
1250
XTS_MUL_ALPHA( t0, t0 );
1251
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1252
}
1253
1254
if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
1255
{
1256
// Ciphertext stealing decryption
1257
//
1258
// +--------------+
1259
// | |
1260
// | V
1261
// +-----------------+ | +-----+-----------+
1262
// | C_m-1 | | | C_m |++++CP+++++|
1263
// +-----------------+ | +-----+-----------+
1264
// | | |
1265
// dec_m | dec_m-1
1266
// | | |
1267
// V | V
1268
// +-----+-----------+ | +-----------------+
1269
// | P_m |++++CP+++++|--+ | P_m-1 |
1270
// +-----+-----------+ +-----------------+
1271
// | /
1272
// +---------------- / --+
1273
// / |
1274
// | V
1275
// +-----------------+ | +-----+
1276
// | P_m-1 |<-+ | P_m |
1277
// +-----------------+ +-----+
1278
1279
// Do final tweak update into t1
1280
// Penultimate tweak is in t0, ready for final decryption
1281
XTS_MUL_ALPHA( t0, t1 );
1282
1283
// Decrypt penultimate ciphertext block into tailBuf
1284
c0 = veorq_u32( vld1q_u8( pbSrc ), t1 );
1285
AES_DECRYPT_1( pExpandedKey, c0 );
1286
c0 = veorq_u32( c0, t1 );
1287
vst1q_u8( &tailBuf[0], c0 );
1288
vst1q_u8( &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], c0 );
1289
1290
cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1291
1292
// Copy final ciphertext bytes to prefix of tailBuf - we must read before writing to support in-place decryption
1293
memcpy( &tailBuf[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
1294
// Copy prefix of tailBuf[SYMCRYPT_AES_BLOCK_SIZE] to the right place in the destination buffer
1295
memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], cbDataTail );
1296
1297
// Load updated tailBuf into c0
1298
c0 = vld1q_u8( &tailBuf[0] );
1299
} else {
1300
// Just load final ciphertext block into c0
1301
c0 = vld1q_u8( pbSrc );
1302
}
1303
1304
// Final full block decryption
1305
c0 = veorq_u32( c0, t0 );
1306
AES_DECRYPT_1( pExpandedKey, c0 );
1307
vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1308
}
1309
1310
#include "ghash_definitions.h"
1311
1312
//
// AES_ENCRYPT_ROUND_4_GHASH_1
//
// One AES encryption round (AESE + AESMC) on the four blocks c0..c3, stitched with the
// multiply-accumulate step of GHASH for a single 16-byte block.
//
//  c0..c3                  AES state blocks, updated in place.
//  r0, r0x, t0, t1         Scratch vectors supplied by the caller; overwritten.
//  gHashPointer            Pointer to the next block to hash; read once and post-incremented.
//  gHashExpandedKeyTable   Table handed to GHASH_H_POWER / GHASH_Hx_POWER.
//  todo                    Selects which table entry is used; decremented by one.
//  resl, resm, resh        Accumulators; one product is XORed into each. No reduction is done here.
//
// `roundKey` is NOT a parameter: it must be a variable in the scope where the macro is expanded.
// Use only block comments inside the body - a line comment would swallow the continuation lines.
//
#define AES_ENCRYPT_ROUND_4_GHASH_1( c0, c1, c2, c3, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    /* AES round on all four blocks with the same round key */ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
    \
    /* Load one block; r0 = block with bytes reversed per half and halves swapped, */ \
    /* r0x = r0 xor the un-swapped value, so both halves of r0x hold (half0 ^ half1) */ \
    r0x = *gHashPointer; \
    r0x = vrev64q_u8( r0x ); \
    r0 = vextq_u8( r0x, r0x, 8 ); \
    r0x = veorq_u8( r0, r0x ); \
    gHashPointer++; \
    \
    /* Two products of r0 with the H-power table entry selected by todo */ \
    t1 = GHASH_H_POWER(gHashExpandedKeyTable, todo); \
    t0 = vmullq_p64( r0, t1 ); \
    t1 = vmull_high_p64( r0, t1 ); \
    \
    resl = veorq_u8( resl, t0 ); \
    resh = veorq_u8( resh, t1 ); \
    \
    /* Third product, using r0x and the matching Hx table entry */ \
    t1 = GHASH_Hx_POWER(gHashExpandedKeyTable, todo); \
    t1 = vmullq_p64( r0x, t1 ); \
    \
    resm = veorq_u8( resm, t1 ); \
    todo--; \
};
1338
1339
//
// AES_GCM_ENCRYPT_4
//
// Full AES encryption of the four blocks c0..c3, with GHASH of gHashRounds blocks stitched
// into the first gHashRounds AES rounds.
//
// Using a loop with AESE_AESMC and AESD_AESIMC, the compiler can still prematurely rearrange the loop and
// lose opportunity for scheduling adjacent pairs.
// Instead, explicitly unroll the AES rounds with this macro.
//
//  pExpandedKey            Expanded AES key; RoundKey[0] and lastEncRoundKey are read.
//  c0..c3                  Blocks to encrypt, updated in place.
//  gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh
//                          Forwarded to AES_ENCRYPT_ROUND_4_GHASH_1 (pointer advanced, todo decremented,
//                          accumulators updated) once per stitched round.
//  gHashRounds             Number of rounds that carry a GHASH step. The second loop runs
//                          (9-gHashRounds) times, so this must not exceed 9; the callers in this
//                          file pass 8 or a tail block count.
//
// NOTE(review): AES_ENCRYPT_ROUND_4 and AES_ENCRYPT_FINAL_4 are invoked with eight arguments below.
// c4..c7 are not parameters of this macro, so they resolve to whatever carries those names at the
// expansion site. The definitions of those two macros are not in this part of the file - confirm
// that this argument list is what they expect.
//
#define AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, gHashPointer, gHashRounds, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    __n128 t0, t1, r0, r0x; \
    SIZE_T aesEncryptGhashLoop; \
    \
    /* Do gHashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    roundKey = *keyPtr++; \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < gHashRounds; aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_4_GHASH_1( c0, c1, c2, c3, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
        roundKey = *keyPtr++; \
    } \
    \
    /* Do 9-gHashRounds full rounds (AES-128|AES-192|AES-256) */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < (9-gHashRounds); aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
    } \
    \
    if ( keyPtr < keyLimit ) \
    { \
        /* Do 2 more full rounds (AES-192|AES-256) */ \
        AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        \
        if ( keyPtr < keyLimit ) \
        { \
            /* Do 2 more full rounds (AES-256) */ \
            AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
            AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
        } \
    } \
    \
    /* Do final round (AES-128|AES-192|AES-256) */ \
    AES_ENCRYPT_FINAL_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
};
1391
1392
//
// AES_ENCRYPT_ROUND_8_GHASH_1
//
// One AES encryption round (AESE + AESMC) on the eight blocks c0..c7, stitched with the
// multiply-accumulate step of GHASH for a single 16-byte block.
//
//  c0..c7                  AES state blocks, updated in place.
//  r0, r0x, t0, t1         Scratch vectors supplied by the caller; overwritten.
//  gHashPointer            Pointer to the next block to hash; read once and post-incremented.
//  gHashExpandedKeyTable   Table handed to GHASH_H_POWER / GHASH_Hx_POWER.
//  todo                    Selects which table entry is used; decremented by one.
//  resl, resm, resh        Accumulators; one product is XORed into each. No reduction is done here.
//
// `roundKey` is NOT a parameter: it must be a variable in the scope where the macro is expanded.
// Use only block comments inside the body - a line comment would swallow the continuation lines.
//
#define AES_ENCRYPT_ROUND_8_GHASH_1( c0, c1, c2, c3, c4, c5, c6, c7, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    /* AES round on all eight blocks with the same round key */ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
    AESE_AESMC( c4, roundKey ) \
    AESE_AESMC( c5, roundKey ) \
    AESE_AESMC( c6, roundKey ) \
    AESE_AESMC( c7, roundKey ) \
    \
    /* Load one block; r0 = block with bytes reversed per half and halves swapped, */ \
    /* r0x = r0 xor the un-swapped value, so both halves of r0x hold (half0 ^ half1) */ \
    r0x = *gHashPointer; \
    r0x = vrev64q_u8( r0x ); \
    r0 = vextq_u8( r0x, r0x, 8 ); \
    r0x = veorq_u8( r0, r0x ); \
    gHashPointer++; \
    \
    /* Two products of r0 with the H-power table entry selected by todo */ \
    t1 = GHASH_H_POWER(gHashExpandedKeyTable, todo); \
    t0 = vmullq_p64( r0, t1 ); \
    t1 = vmull_high_p64( r0, t1 ); \
    \
    resl = veorq_u8( resl, t0 ); \
    resh = veorq_u8( resh, t1 ); \
    \
    /* Third product, using r0x and the matching Hx table entry */ \
    t1 = GHASH_Hx_POWER(gHashExpandedKeyTable, todo); \
    t1 = vmullq_p64( r0x, t1 ); \
    \
    resm = veorq_u8( resm, t1 ); \
    todo--; \
};
1422
1423
//
// AES_GCM_ENCRYPT_8
//
// Full AES encryption of the eight blocks c0..c7, with GHASH of gHashRounds blocks stitched
// into the first gHashRounds AES rounds.
//
// Using a loop with AESE_AESMC and AESD_AESIMC, the compiler can still prematurely rearrange the loop and
// lose opportunity for scheduling adjacent pairs.
// Instead, explicitly unroll the AES rounds with this macro.
//
//  pExpandedKey            Expanded AES key; RoundKey[0] and lastEncRoundKey are read.
//  c0..c7                  Blocks to encrypt, updated in place.
//  gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh
//                          Forwarded to AES_ENCRYPT_ROUND_8_GHASH_1 (pointer advanced, todo decremented,
//                          accumulators updated) once per stitched round.
//  gHashRounds             Number of rounds that carry a GHASH step. The second loop runs
//                          (9-gHashRounds) times, so this must not exceed 9; the callers in this
//                          file pass 8 or a tail block count.
//
// The key walk: nine rounds are always executed; while keyPtr has not reached lastEncRoundKey two
// more rounds are added (at most twice) before the final round.
//
#define AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, gHashRounds, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
    \
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    __n128 t0, t1, r0, r0x; \
    SIZE_T aesEncryptGhashLoop; \
    \
    /* Do gHashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    roundKey = *keyPtr++; \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < gHashRounds; aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_8_GHASH_1( c0, c1, c2, c3, c4, c5, c6, c7, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
        roundKey = *keyPtr++; \
    } \
    \
    /* Do 9-gHashRounds full rounds (AES-128|AES-192|AES-256) */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < (9-gHashRounds); aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
    } \
    \
    if ( keyPtr < keyLimit ) \
    { \
        /* Do 2 more full rounds (AES-192|AES-256) */ \
        AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        \
        if ( keyPtr < keyLimit ) \
        { \
            /* Do 2 more full rounds (AES-256) */ \
            AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
            AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
        } \
    } \
    \
    /* Do final round (AES-128|AES-192|AES-256) */ \
    AES_ENCRYPT_FINAL_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
};
1475
1476
// This call is functionally identical to:
1477
// SymCryptAesCtrMsb64Neon( pExpandedKey,
1478
// pbChainingValue,
1479
// pbSrc,
1480
// pbDst,
1481
// cbData );
1482
// SymCryptGHashAppendDataPmull( expandedKeyTable,
1483
// pState,
1484
// pbDstOrig,
1485
// cbDataOrig );
1486
VOID
1487
SYMCRYPT_CALL
1488
SymCryptAesGcmEncryptStitchedNeon(
1489
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1490
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
1491
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
1492
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
1493
_In_reads_( cbData ) PCBYTE pbSrc,
1494
_Out_writes_( cbData ) PBYTE pbDst,
1495
SIZE_T cbData )
1496
{
1497
__n128 chain = *(__n128 *)pbChainingValue;
1498
const __n128 * pSrc = (const __n128 *) pbSrc;
1499
const __n128 * pGhashSrc = (const __n128 *) pbDst;
1500
__n128 * pDst = (__n128 *) pbDst;
1501
1502
const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
1503
const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
1504
const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
1505
1506
__n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
1507
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
1508
__n128 r0, r1;
1509
__n128 r0x, r1x;
1510
1511
__n128 state;
1512
__n128 a0, a1, a2;
1513
const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
1514
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1515
SIZE_T todo;
1516
1517
SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1518
1519
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
1520
ctr0 = vrev64q_u8( chain );
1521
ctr1 = vaddq_u32( ctr0, chainIncrement1 );
1522
ctr2 = vaddq_u32( ctr0, chainIncrement2 );
1523
ctr3 = vaddq_u32( ctr1, chainIncrement2 );
1524
ctr4 = vaddq_u32( ctr2, chainIncrement2 );
1525
ctr5 = vaddq_u32( ctr3, chainIncrement2 );
1526
ctr6 = vaddq_u32( ctr4, chainIncrement2 );
1527
ctr7 = vaddq_u32( ctr5, chainIncrement2 );
1528
1529
state = *(__n128 *) pState;
1530
1531
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1532
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1533
1534
// Do 8 blocks of CTR either for tail (if total blocks <8) or for encryption of first 8 blocks
1535
c0 = vrev64q_u8( ctr0 );
1536
c1 = vrev64q_u8( ctr1 );
1537
c2 = vrev64q_u8( ctr2 );
1538
c3 = vrev64q_u8( ctr3 );
1539
c4 = vrev64q_u8( ctr4 );
1540
c5 = vrev64q_u8( ctr5 );
1541
c6 = vrev64q_u8( ctr6 );
1542
c7 = vrev64q_u8( ctr7 );
1543
1544
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
1545
1546
if ( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
1547
{
1548
ctr0 = vaddq_u32( ctr0, chainIncrement8 );
1549
ctr1 = vaddq_u32( ctr1, chainIncrement8 );
1550
ctr2 = vaddq_u32( ctr2, chainIncrement8 );
1551
ctr3 = vaddq_u32( ctr3, chainIncrement8 );
1552
ctr4 = vaddq_u32( ctr4, chainIncrement8 );
1553
ctr5 = vaddq_u32( ctr5, chainIncrement8 );
1554
ctr6 = vaddq_u32( ctr6, chainIncrement8 );
1555
ctr7 = vaddq_u32( ctr7, chainIncrement8 );
1556
1557
// Encrypt first 8 blocks
1558
pDst[0] = veorq_u64( pSrc[0], c0 );
1559
pDst[1] = veorq_u64( pSrc[1], c1 );
1560
pDst[2] = veorq_u64( pSrc[2], c2 );
1561
pDst[3] = veorq_u64( pSrc[3], c3 );
1562
pDst[4] = veorq_u64( pSrc[4], c4 );
1563
pDst[5] = veorq_u64( pSrc[5], c5 );
1564
pDst[6] = veorq_u64( pSrc[6], c6 );
1565
pDst[7] = veorq_u64( pSrc[7], c7 );
1566
1567
pDst += 8;
1568
pSrc += 8;
1569
1570
while( nBlocks >= 16 )
1571
{
1572
// In this loop we always have 8 blocks to encrypt and we have already encrypted the previous 8 blocks ready for GHASH
1573
c0 = vrev64q_u8( ctr0 );
1574
c1 = vrev64q_u8( ctr1 );
1575
c2 = vrev64q_u8( ctr2 );
1576
c3 = vrev64q_u8( ctr3 );
1577
c4 = vrev64q_u8( ctr4 );
1578
c5 = vrev64q_u8( ctr5 );
1579
c6 = vrev64q_u8( ctr6 );
1580
c7 = vrev64q_u8( ctr7 );
1581
1582
ctr0 = vaddq_u32( ctr0, chainIncrement8 );
1583
ctr1 = vaddq_u32( ctr1, chainIncrement8 );
1584
ctr2 = vaddq_u32( ctr2, chainIncrement8 );
1585
ctr3 = vaddq_u32( ctr3, chainIncrement8 );
1586
ctr4 = vaddq_u32( ctr4, chainIncrement8 );
1587
ctr5 = vaddq_u32( ctr5, chainIncrement8 );
1588
ctr6 = vaddq_u32( ctr6, chainIncrement8 );
1589
ctr7 = vaddq_u32( ctr7, chainIncrement8 );
1590
1591
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1592
1593
pDst[0] = veorq_u64( pSrc[0], c0 );
1594
pDst[1] = veorq_u64( pSrc[1], c1 );
1595
pDst[2] = veorq_u64( pSrc[2], c2 );
1596
pDst[3] = veorq_u64( pSrc[3], c3 );
1597
pDst[4] = veorq_u64( pSrc[4], c4 );
1598
pDst[5] = veorq_u64( pSrc[5], c5 );
1599
pDst[6] = veorq_u64( pSrc[6], c6 );
1600
pDst[7] = veorq_u64( pSrc[7], c7 );
1601
1602
pDst += 8;
1603
pSrc += 8;
1604
nBlocks -= 8;
1605
1606
if (todo == 0)
1607
{
1608
CLMUL_3_POST( a0, a1, a2 );
1609
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1610
1611
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1612
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1613
}
1614
}
1615
1616
// We now have at least 8 blocks of encrypted data to GHASH and at most 7 blocks left to encrypt
1617
// Do 8 blocks of GHASH in parallel with generating 0, 4, or 8 AES-CTR blocks for tail encryption
1618
nBlocks -= 8;
1619
if (nBlocks > 0)
1620
{
1621
c0 = vrev64q_u8( ctr0 );
1622
c1 = vrev64q_u8( ctr1 );
1623
c2 = vrev64q_u8( ctr2 );
1624
c3 = vrev64q_u8( ctr3 );
1625
1626
if (nBlocks > 4)
1627
{
1628
// Do 8 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
1629
c4 = vrev64q_u8( ctr4 );
1630
c5 = vrev64q_u8( ctr5 );
1631
c6 = vrev64q_u8( ctr6 );
1632
1633
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1634
}
1635
else
1636
{
1637
// Do 4 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
1638
AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1639
}
1640
1641
if( todo == 0)
1642
{
1643
CLMUL_3_POST( a0, a1, a2 );
1644
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1645
1646
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1647
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1648
}
1649
}
1650
else
1651
{
1652
// Just do the final 8 rounds of GHASH
1653
for( todo=8; todo>0; todo-- )
1654
{
1655
r0x = vrev64q_u8( pGhashSrc[0] );
1656
r0 = vextq_u8( r0x, r0x, 8 );
1657
r0x = veorq_u8( r0, r0x );
1658
pGhashSrc++;
1659
1660
CLMUL_ACCX_3( r0, r0x, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1661
}
1662
1663
CLMUL_3_POST( a0, a1, a2 );
1664
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1665
}
1666
}
1667
1668
if( nBlocks > 0 )
1669
{
1670
// Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results
1671
while( nBlocks >= 2 )
1672
{
1673
ctr0 = vaddq_u32( ctr0, chainIncrement2 );
1674
1675
r0 = veorq_u64( pSrc[0], c0 );
1676
r1 = veorq_u64( pSrc[1], c1 );
1677
1678
pDst[0] = r0;
1679
pDst[1] = r1;
1680
1681
r0x = vrev64q_u8( r0 );
1682
r1x = vrev64q_u8( r1 );
1683
r0 = vextq_u8( r0x, r0x, 8 );
1684
r1 = vextq_u8( r1x, r1x, 8 );
1685
r0x = veorq_u8( r0, r0x );
1686
r1x = veorq_u8( r1, r1x );
1687
1688
CLMUL_ACCX_3( r0, r0x, GHASH_H_POWER(expandedKeyTable, todo - 0), GHASH_Hx_POWER(expandedKeyTable, todo - 0), a0, a1, a2 );
1689
CLMUL_ACCX_3( r1, r1x, GHASH_H_POWER(expandedKeyTable, todo - 1), GHASH_Hx_POWER(expandedKeyTable, todo - 1), a0, a1, a2 );
1690
1691
pDst += 2;
1692
pSrc += 2;
1693
todo -= 2;
1694
nBlocks -= 2;
1695
c0 = c2;
1696
c1 = c3;
1697
c2 = c4;
1698
c3 = c5;
1699
c4 = c6;
1700
}
1701
1702
if( nBlocks > 0 )
1703
{
1704
ctr0 = vaddq_u32( ctr0, chainIncrement1 );
1705
1706
r0 = veorq_u64( pSrc[0], c0 );
1707
pDst[0] = r0;
1708
r0x = vrev64q_u8( r0 );
1709
r0 = vextq_u8( r0x, r0x, 8 );
1710
r0x = veorq_u8( r0, r0x );
1711
1712
CLMUL_ACCX_3( r0, r0x, GHASH_H_POWER(expandedKeyTable, 1), GHASH_Hx_POWER(expandedKeyTable, 1), a0, a1, a2 );
1713
}
1714
1715
CLMUL_3_POST( a0, a1, a2 );
1716
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1717
}
1718
1719
chain = vrev64q_u8( ctr0 );
1720
*(__n128 *)pbChainingValue = chain;
1721
*(__n128 *)pState = state;
1722
}
1723
1724
#pragma warning(push)
1725
#pragma warning( disable:4701 ) // "Use of uninitialized variable" -
1726
#pragma runtime_checks( "u", off )
1727
// This call is functionally identical to:
1728
// SymCryptGHashAppendDataPmull(expandedKeyTable,
1729
// pState,
1730
// pbSrc,
1731
// cbData );
1732
// SymCryptAesCtrMsb64Neon( pExpandedKey,
1733
// pbChainingValue,
1734
// pbSrc,
1735
// pbDst,
1736
// cbData );
1737
VOID
1738
SYMCRYPT_CALL
1739
SymCryptAesGcmDecryptStitchedNeon(
1740
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1741
_In_reads_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
1742
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
1743
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
1744
_In_reads_( cbData ) PCBYTE pbSrc,
1745
_Out_writes_( cbData ) PBYTE pbDst,
1746
SIZE_T cbData )
1747
{
1748
__n128 chain = *(__n128 *)pbChainingValue;
1749
const __n128 * pSrc = (const __n128 *) pbSrc;
1750
const __n128 * pGhashSrc = (const __n128 *) pbSrc;
1751
__n128 * pDst = (__n128 *) pbDst;
1752
1753
const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
1754
const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
1755
const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
1756
1757
__n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
1758
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
1759
1760
__n128 state;
1761
__n128 a0, a1, a2;
1762
const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
1763
SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1764
SIZE_T todo;
1765
1766
SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1767
1768
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
1769
ctr0 = vrev64q_u8( chain );
1770
ctr1 = vaddq_u32( ctr0, chainIncrement1 );
1771
ctr2 = vaddq_u32( ctr0, chainIncrement2 );
1772
ctr3 = vaddq_u32( ctr1, chainIncrement2 );
1773
ctr4 = vaddq_u32( ctr2, chainIncrement2 );
1774
ctr5 = vaddq_u32( ctr3, chainIncrement2 );
1775
ctr6 = vaddq_u32( ctr4, chainIncrement2 );
1776
ctr7 = vaddq_u32( ctr5, chainIncrement2 );
1777
1778
state = *(__n128 *) pState;
1779
1780
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1781
1782
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1783
1784
while( nBlocks >= 8 )
1785
{
1786
// In this loop we always have 8 blocks to decrypt and GHASH
1787
c0 = vrev64q_u8( ctr0 );
1788
c1 = vrev64q_u8( ctr1 );
1789
c2 = vrev64q_u8( ctr2 );
1790
c3 = vrev64q_u8( ctr3 );
1791
c4 = vrev64q_u8( ctr4 );
1792
c5 = vrev64q_u8( ctr5 );
1793
c6 = vrev64q_u8( ctr6 );
1794
c7 = vrev64q_u8( ctr7 );
1795
1796
ctr0 = vaddq_u32( ctr0, chainIncrement8 );
1797
ctr1 = vaddq_u32( ctr1, chainIncrement8 );
1798
ctr2 = vaddq_u32( ctr2, chainIncrement8 );
1799
ctr3 = vaddq_u32( ctr3, chainIncrement8 );
1800
ctr4 = vaddq_u32( ctr4, chainIncrement8 );
1801
ctr5 = vaddq_u32( ctr5, chainIncrement8 );
1802
ctr6 = vaddq_u32( ctr6, chainIncrement8 );
1803
ctr7 = vaddq_u32( ctr7, chainIncrement8 );
1804
1805
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1806
1807
pDst[0] = veorq_u64( pSrc[0], c0 );
1808
pDst[1] = veorq_u64( pSrc[1], c1 );
1809
pDst[2] = veorq_u64( pSrc[2], c2 );
1810
pDst[3] = veorq_u64( pSrc[3], c3 );
1811
pDst[4] = veorq_u64( pSrc[4], c4 );
1812
pDst[5] = veorq_u64( pSrc[5], c5 );
1813
pDst[6] = veorq_u64( pSrc[6], c6 );
1814
pDst[7] = veorq_u64( pSrc[7], c7 );
1815
1816
pDst += 8;
1817
pSrc += 8;
1818
nBlocks -= 8;
1819
1820
if (todo == 0)
1821
{
1822
CLMUL_3_POST( a0, a1, a2 );
1823
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1824
1825
if ( nBlocks > 0 )
1826
{
1827
todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1828
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1829
}
1830
}
1831
}
1832
1833
if( nBlocks > 0 )
1834
{
1835
// We have 1-7 blocks to GHASH and decrypt
1836
// Do the exact number of GHASH blocks we need in parallel with generating either 4 or 8 blocks of AES-CTR
1837
c0 = vrev64q_u8( ctr0 );
1838
c1 = vrev64q_u8( ctr1 );
1839
c2 = vrev64q_u8( ctr2 );
1840
c3 = vrev64q_u8( ctr3 );
1841
1842
if( nBlocks > 4 )
1843
{
1844
c4 = vrev64q_u8( ctr4 );
1845
c5 = vrev64q_u8( ctr5 );
1846
c6 = vrev64q_u8( ctr6 );
1847
1848
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, nBlocks, expandedKeyTable, todo, a0, a1, a2 );
1849
} else {
1850
AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pGhashSrc, nBlocks, expandedKeyTable, todo, a0, a1, a2 );
1851
}
1852
CLMUL_3_POST( a0, a1, a2 );
1853
MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1854
1855
// Decrypt 1-7 blocks with pre-generated AES-CTR blocks
1856
while( nBlocks >= 2 )
1857
{
1858
ctr0 = vaddq_u32( ctr0, chainIncrement2 );
1859
1860
pDst[0] = veorq_u64( pSrc[0], c0 );
1861
pDst[1] = veorq_u64( pSrc[1], c1 );
1862
1863
pDst += 2;
1864
pSrc += 2;
1865
nBlocks -= 2;
1866
c0 = c2;
1867
c1 = c3;
1868
c2 = c4;
1869
c3 = c5;
1870
c4 = c6;
1871
}
1872
1873
if( nBlocks > 0 )
1874
{
1875
ctr0 = vaddq_u32( ctr0, chainIncrement1 );
1876
1877
pDst[0] = veorq_u64( pSrc[0], c0 );
1878
}
1879
}
1880
1881
chain = vrev64q_u8( ctr0 );
1882
*(__n128 *)pbChainingValue = chain;
1883
*(__n128 *)pState = state;
1884
}
1885
#pragma runtime_checks( "u", restore )
1886
#pragma warning(pop)
1887
#pragma clang attribute pop
1888
1889
#endif
1890
1891