CoCalc -- aes-neon.c

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/symcrypt/lib/aes-neon.c
15010 views
//
// aes-neon.c   code for AES implementation
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// All NEON-based code for AES operations
//
8

9
#include "precomp.h"
10

11
#if SYMCRYPT_CPU_ARM64
12

13
#pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
14

15
// All-zero 128-bit vector (0 duplicated into every 64-bit lane); used below as a null AES round key.
#define vzeroq()    vdupq_n_u64(0)
16

17

18
VOID
19
SYMCRYPT_CALL
20
SymCryptAes4SboxNeon( _In_reads_(4) PCBYTE pIn, _Out_writes_(4) PBYTE pOut )
21
{
22
    /*
23
    __m128i x;
24

25
    x = _mm_set1_epi32( *(int *) pIn );
26

27
    x = _mm_aeskeygenassist_si128( x, 0 );
28

29
    *(unsigned *) pOut = x.m128i_u32[0];
30
    */
31
    __n128 x;
32

33
    //
34
    // There is no pure S-box lookup instruction, but the AESE instruction
35
    // does a ShiftRow followed by a SubBytes.
36
    // If we duplicate the input value to all 4 lanes, then the ShiftRow does nothing
37
    // and the SubBytes will do the S-box lookup.
38
    //
39
    x = vdupq_n_u32( *(unsigned int *) pIn );
40
    x = vaeseq_u8( x, vzeroq() );
41
    vst1q_lane_s32( pOut, x, 0 );
42
    //*(unsigned int *) pOut = x.n128_u32[0];
43
}
44

45

46
VOID
47
SYMCRYPT_CALL
48
SymCryptAesCreateDecryptionRoundKeyNeon(
49
    _In_reads_(16)      PCBYTE  pEncryptionRoundKey,
50
    _Out_writes_(16)    PBYTE   pDecryptionRoundKey )
51
{
52
    *(__n128 *) pDecryptionRoundKey = vaesimcq_u8( *(__n128 *)pEncryptionRoundKey );
53
}
54

55
//
// One full AES encryption round on state c with round key rk: AESE followed by AESMC.
//
// When doing a full round of AES encryption, make sure to give the compiler the opportunity
// to schedule dependent aese/aesmc pairs back to back, to enable instruction fusion in many
// arm64 CPUs.
//
// The expansion is a braced block followed by ';', so it is invoked WITHOUT a trailing
// semicolon inside the other macros in this file.
//
#define AESE_AESMC( c, rk ) \
{ \
    c = vaeseq_u8( c, rk ); \
    c = vaesmcq_u8( c ); \
};
64

65
//
// One full AES decryption round on state c with round key rk: AESD followed by AESIMC.
//
// When doing a full round of AES decryption, make sure to give the compiler the opportunity
// to schedule dependent aesd/aesimc pairs back to back, to enable instruction fusion in many
// arm64 CPUs.
//
// The expansion is a braced block followed by ';', so it is invoked WITHOUT a trailing
// semicolon inside the other macros in this file.
//
#define AESD_AESIMC( c, rk ) \
{ \
    c = vaesdq_u8( c, rk ); \
    c = vaesimcq_u8( c ); \
};
74

75
//
// Using a loop with AESE_AESMC and AESD_AESIMC, the compiler can still prematurely rearrange
// the loop and lose the opportunity for scheduling adjacent pairs.
// Instead, explicitly unroll the AES rounds with this macro.
//
// Takes the names of a first_round, a full_round, and a final_round macro, and uses them to
// construct a block that handles AES-(128|192|256) for either encrypt or decrypt.
// Each round macro receives the eight state names c0..c7; a round macro that works on fewer
// states simply does not expand the names it does not need.
//
// Requires roundKey, keyPtr, and keyLimit to be defined in the calling context:
//  - keyPtr    points at the first round key to use and is advanced one key per round;
//  - keyLimit  marks where the unrolled rounds stop: keyPtr is compared against it after
//              10 and after 12 key loads to select the 2 or 4 extra rounds;
//  - roundKey  holds the key for the round about to be executed.
// On entry to final_round, roundKey is the key for the last round and keyPtr == keyLimit
// addresses the key after it (which final_round may load itself).
//
#define UNROLL_AES_ROUNDS_FIRST( first_round, full_round, final_round, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    /* Do 9 full rounds (AES-128|AES-192|AES-256) */ \
    roundKey = *keyPtr++; \
    first_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
    full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
    roundKey = *keyPtr++; \
\
    if ( keyPtr < keyLimit ) \
    { \
        /* Do 2 more full rounds (AES-192|AES-256) */ \
        full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
\
        if ( keyPtr < keyLimit ) \
        { \
            /* Do 2 more full rounds (AES-256) */ \
            full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
            full_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
        } \
    } \
\
    /* Do final round (AES-128|AES-192|AES-256) */ \
    final_round( c0, c1, c2, c3, c4, c5, c6, c7 ) \
};
128

129
// Convenience form of UNROLL_AES_ROUNDS_FIRST for the common case where the first round is
// an ordinary full round. Only AES_ENCRYPT_1_CHAIN needs to specify the first round
// differently from the full round.
#define UNROLL_AES_ROUNDS( full_round, final_round, c0, c1, c2, c3, c4, c5, c6, c7 ) \
    UNROLL_AES_ROUNDS_FIRST( full_round, full_round, final_round, c0, c1, c2, c3, c4, c5, c6, c7 )
132

133
//
// Single-block AES encryption.
//
// AES_ENCRYPT_ROUND_1 is one full round on c0; AES_ENCRYPT_FINAL_1 is the last round
// (AESE without MixColumns) followed by an XOR with the key that keyPtr addresses.
// Both accept the eight-name parameter list required by UNROLL_AES_ROUNDS but only expand c0.
//
#define AES_ENCRYPT_ROUND_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, roundKey ) \
};
#define AES_ENCRYPT_FINAL_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
};

//
// Encrypt the block held in c0 in place, walking the round keys from RoundKey[0] up to
// lastEncRoundKey. The names c1..c7 passed on below are placeholders only: the _1 round
// macros never expand them, so they need not exist in the calling context.
//
#define AES_ENCRYPT_1( pExpandedKey, c0 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
\
    UNROLL_AES_ROUNDS( \
        AES_ENCRYPT_ROUND_1, \
        AES_ENCRYPT_FINAL_1, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
159

160
//
// Perform AES encryption without the last round key and with a specified first round key.
//
// For algorithms where performance is dominated by a chain of dependent AES rounds
// (i.e. CBC encryption, CCM, CMAC) we can gain a reasonable performance uplift by computing
// (last round key ^ this plaintext block ^ first round key) off the critical path and using
// this computed value in place of the first round key in the first AESE instruction.
//
// AES_ENCRYPT_CHAIN_FIRST_1 receives that merged value in the second parameter slot and
// ignores roundKey; AES_ENCRYPT_CHAIN_FINAL_1 does the last AESE and deliberately omits the
// final key XOR, so on exit c0 holds (ciphertext ^ last round key).
//
#define AES_ENCRYPT_CHAIN_FIRST_1( c0, mergedFirstRoundKey, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, mergedFirstRoundKey ) \
};
#define AES_ENCRYPT_CHAIN_FINAL_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
};

//
// Chained single-block encryption of c0 using mergedFirstRoundKey for the first round.
// Rounds after the first are ordinary AES_ENCRYPT_ROUND_1 rounds. The names c2..c7 are
// placeholders that are never expanded.
//
#define AES_ENCRYPT_1_CHAIN( pExpandedKey, c0, mergedFirstRoundKey ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
\
    UNROLL_AES_ROUNDS_FIRST( \
        AES_ENCRYPT_CHAIN_FIRST_1, \
        AES_ENCRYPT_ROUND_1, \
        AES_ENCRYPT_CHAIN_FINAL_1, \
        c0, mergedFirstRoundKey, c2, c3, c4, c5, c6, c7 \
    ) \
};
190

191
//
// Four-block parallel AES encryption.
//
// AES_ENCRYPT_ROUND_4 runs one full round on c0..c3 with the shared roundKey;
// AES_ENCRYPT_FINAL_4 runs the last AESE on each state, then loads the key that keyPtr
// addresses and XORs it into each state. c4..c7 are accepted but never expanded.
//
#define AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
};
#define AES_ENCRYPT_FINAL_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
    c1 = vaeseq_u8( c1, roundKey ); \
    c2 = vaeseq_u8( c2, roundKey ); \
    c3 = vaeseq_u8( c3, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
};

//
// Encrypt the four blocks c0..c3 in place, walking the round keys from RoundKey[0] up to
// lastEncRoundKey. The names c4..c7 passed on below are placeholders only.
//
#define AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
\
    UNROLL_AES_ROUNDS( \
        AES_ENCRYPT_ROUND_4, \
        AES_ENCRYPT_FINAL_4, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
226

227
//
// Eight-block parallel AES encryption.
//
// AES_ENCRYPT_ROUND_8 runs one full round on c0..c7 with the shared roundKey;
// AES_ENCRYPT_FINAL_8 runs the last AESE on each state, then loads the key that keyPtr
// addresses and XORs it into each state.
//
#define AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
    AESE_AESMC( c4, roundKey ) \
    AESE_AESMC( c5, roundKey ) \
    AESE_AESMC( c6, roundKey ) \
    AESE_AESMC( c7, roundKey ) \
};
#define AES_ENCRYPT_FINAL_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaeseq_u8( c0, roundKey ); \
    c1 = vaeseq_u8( c1, roundKey ); \
    c2 = vaeseq_u8( c2, roundKey ); \
    c3 = vaeseq_u8( c3, roundKey ); \
    c4 = vaeseq_u8( c4, roundKey ); \
    c5 = vaeseq_u8( c5, roundKey ); \
    c6 = vaeseq_u8( c6, roundKey ); \
    c7 = vaeseq_u8( c7, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
    c4 = veorq_u8( c4, roundKey ); \
    c5 = veorq_u8( c5, roundKey ); \
    c6 = veorq_u8( c6, roundKey ); \
    c7 = veorq_u8( c7, roundKey ); \
};

//
// Encrypt the eight blocks c0..c7 in place, walking the round keys from RoundKey[0] up to
// lastEncRoundKey.
//
#define AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)&pExpandedKey->RoundKey[0]; \
    keyLimit = (const __n128 *)pExpandedKey->lastEncRoundKey; \
\
    UNROLL_AES_ROUNDS( \
        AES_ENCRYPT_ROUND_8, \
        AES_ENCRYPT_FINAL_8, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
274

275
//
// Single-block AES decryption.
//
// AES_DECRYPT_ROUND_1 is one full inverse round on c0; AES_DECRYPT_FINAL_1 is the last
// round (AESD without inverse MixColumns) followed by an XOR with the key that keyPtr
// addresses. Both accept the eight-name parameter list required by UNROLL_AES_ROUNDS but
// only expand c0.
//
#define AES_DECRYPT_ROUND_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESD_AESIMC( c0, roundKey ) \
};
#define AES_DECRYPT_FINAL_1( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaesdq_u8( c0, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
};

//
// Decrypt the block held in c0 in place. The decryption key schedule is walked from
// lastEncRoundKey up to lastDecRoundKey. The names c1..c7 passed on below are placeholders
// only: the _1 round macros never expand them.
//
#define AES_DECRYPT_1( pExpandedKey, c0 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    keyLimit = (const __n128 *)pExpandedKey->lastDecRoundKey; \
\
    UNROLL_AES_ROUNDS( \
        AES_DECRYPT_ROUND_1, \
        AES_DECRYPT_FINAL_1, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
301

302
//
// Four-block parallel AES decryption.
//
// AES_DECRYPT_ROUND_4 runs one full inverse round on c0..c3 with the shared roundKey;
// AES_DECRYPT_FINAL_4 runs the last AESD on each state, then loads the key that keyPtr
// addresses and XORs it into each state. c4..c7 are accepted but never expanded.
//
#define AES_DECRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESD_AESIMC( c0, roundKey ) \
    AESD_AESIMC( c1, roundKey ) \
    AESD_AESIMC( c2, roundKey ) \
    AESD_AESIMC( c3, roundKey ) \
};
#define AES_DECRYPT_FINAL_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaesdq_u8( c0, roundKey ); \
    c1 = vaesdq_u8( c1, roundKey ); \
    c2 = vaesdq_u8( c2, roundKey ); \
    c3 = vaesdq_u8( c3, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
};

//
// Decrypt the four blocks c0..c3 in place, walking the decryption key schedule from
// lastEncRoundKey up to lastDecRoundKey. The names c4..c7 passed on below are placeholders.
//
#define AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    keyLimit = (const __n128 *)pExpandedKey->lastDecRoundKey; \
\
    UNROLL_AES_ROUNDS( \
        AES_DECRYPT_ROUND_4, \
        AES_DECRYPT_FINAL_4, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
337

338
//
// Eight-block parallel AES decryption.
//
// AES_DECRYPT_ROUND_8 runs one full inverse round on c0..c7 with the shared roundKey;
// AES_DECRYPT_FINAL_8 runs the last AESD on each state, then loads the key that keyPtr
// addresses and XORs it into each state.
//
#define AES_DECRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    AESD_AESIMC( c0, roundKey ) \
    AESD_AESIMC( c1, roundKey ) \
    AESD_AESIMC( c2, roundKey ) \
    AESD_AESIMC( c3, roundKey ) \
    AESD_AESIMC( c4, roundKey ) \
    AESD_AESIMC( c5, roundKey ) \
    AESD_AESIMC( c6, roundKey ) \
    AESD_AESIMC( c7, roundKey ) \
};
#define AES_DECRYPT_FINAL_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    c0 = vaesdq_u8( c0, roundKey ); \
    c1 = vaesdq_u8( c1, roundKey ); \
    c2 = vaesdq_u8( c2, roundKey ); \
    c3 = vaesdq_u8( c3, roundKey ); \
    c4 = vaesdq_u8( c4, roundKey ); \
    c5 = vaesdq_u8( c5, roundKey ); \
    c6 = vaesdq_u8( c6, roundKey ); \
    c7 = vaesdq_u8( c7, roundKey ); \
    roundKey = *keyPtr; \
    c0 = veorq_u8( c0, roundKey ); \
    c1 = veorq_u8( c1, roundKey ); \
    c2 = veorq_u8( c2, roundKey ); \
    c3 = veorq_u8( c3, roundKey ); \
    c4 = veorq_u8( c4, roundKey ); \
    c5 = veorq_u8( c5, roundKey ); \
    c6 = veorq_u8( c6, roundKey ); \
    c7 = veorq_u8( c7, roundKey ); \
};

//
// Decrypt the eight blocks c0..c7 in place, walking the decryption key schedule from
// lastEncRoundKey up to lastDecRoundKey.
//
#define AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)pExpandedKey->lastEncRoundKey; \
    keyLimit = (const __n128 *)pExpandedKey->lastDecRoundKey; \
\
    UNROLL_AES_ROUNDS( \
        AES_DECRYPT_ROUND_8, \
        AES_DECRYPT_FINAL_8, \
        c0, c1, c2, c3, c4, c5, c6, c7 \
    ) \
};
385

386

387

388
VOID
389
SYMCRYPT_CALL
390
SymCryptAesEncryptNeon(
391
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
392
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PCBYTE                      pbSrc,
393
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbDst )
394
{
395
    __n128 c;
396

397
    c = *( __n128 * ) pbSrc;
398

399
    AES_ENCRYPT_1( pExpandedKey, c );
400

401
    *(__n128 *) pbDst = c;
402
}
403

404
VOID
405
SYMCRYPT_CALL
406
SymCryptAesDecryptNeon(
407
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
408
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PCBYTE                      pbSrc,
409
    _Out_writes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbDst )
410
{
411
    __n128 c;
412

413
    c = *( __n128 * ) pbSrc;
414

415
    AES_DECRYPT_1( pExpandedKey, c );
416

417
    *(__n128 *) pbDst = c;
418
}
419

420

421
VOID
422
SYMCRYPT_CALL
423
SymCryptAesCbcEncryptNeon(
424
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
425
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
426
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
427
    _Out_writes_( cbData )                      PBYTE                       pbDst,
428
                                                SIZE_T                      cbData )
429
{
430
    __n128 c = *(__n128 *)pbChainingValue;
431
    __n128 rk0 = *(__n128 *) &pExpandedKey->RoundKey[0];
432
    __n128 rkLast = *(__n128 *) pExpandedKey->lastEncRoundKey;
433
    __n128 d, rk0AndLast;
434

435
    // This algorithm is dominated by chain of dependent AES rounds, so we want to avoid EOR
436
    // instructions on the critical path where possible
437
    // We can compute (last round key ^ this plaintext block ^ first round key) off the critical
438
    // path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
439
    // the main loop
440
    rk0AndLast = veorq_u8( rk0, rkLast );
441

442
    c = veorq_u8( c, rkLast );
443

444
    while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
445
    {
446
        d = veorq_u8( *(__n128 *)pbSrc, rk0AndLast);
447
        AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d );
448
        *(__n128 *)pbDst = veorq_u8( c, rkLast );
449

450
        pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
451
        pbDst += SYMCRYPT_AES_BLOCK_SIZE;
452
        cbData -= SYMCRYPT_AES_BLOCK_SIZE;
453
    }
454
    *(__n128 *)pbChainingValue = veorq_u8( c, rkLast );
455
}
456

457
// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
458
#pragma warning(push)
459
#pragma warning( disable: 6001 4701 )
460
#pragma runtime_checks( "u", off )
461
VOID
462
SYMCRYPT_CALL
463
SymCryptAesCbcDecryptNeon(
464
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
465
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
466
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
467
    _Out_writes_( cbData )                      PBYTE                       pbDst,
468
                                                SIZE_T                      cbData )
469
{
470
    __n128 chain;
471
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;
472
    __n128 d0, d1, d2, d3, d4, d5, d6, d7;
473
    const __n128 * pSrc = (const __n128 *) pbSrc;
474
    __n128 * pDst = (__n128 *) pbDst;
475
    SIZE_T  cData = cbData / SYMCRYPT_AES_BLOCK_SIZE;
476

477
    if( cData < 1 )
478
    {
479
        return;
480
    }
481

482
    chain = *(__n128 *) pbChainingValue;
483

484
    //
485
    // First we do all multiples of 8 blocks
486
    //
487

488
    while( cData >= 8 )
489
    {
490
        d0 = c0 = pSrc[0];
491
        d1 = c1 = pSrc[1];
492
        d2 = c2 = pSrc[2];
493
        d3 = c3 = pSrc[3];
494
        d4 = c4 = pSrc[4];
495
        d5 = c5 = pSrc[5];
496
        d6 = c6 = pSrc[6];
497
        d7 = c7 = pSrc[7];
498

499
        AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
500

501
        c0 = veorq_u8( c0, chain );
502
        c1 = veorq_u8( c1, d0 );
503
        c2 = veorq_u8( c2, d1 );
504
        c3 = veorq_u8( c3, d2 );
505
        c4 = veorq_u8( c4, d3 );
506
        c5 = veorq_u8( c5, d4 );
507
        c6 = veorq_u8( c6, d5 );
508
        c7 = veorq_u8( c7, d6 );
509
        chain = d7;
510

511
        pDst[0] = c0;
512
        pDst[1] = c1;
513
        pDst[2] = c2;
514
        pDst[3] = c3;
515
        pDst[4] = c4;
516
        pDst[5] = c5;
517
        pDst[6] = c6;
518
        pDst[7] = c7;
519

520
        pSrc   += 8;
521
        pDst   += 8;
522
        cData  -= 8;
523
    }
524

525
    if( cData >= 1 )
526
    {
527
        //
528
        // There is remaining work to be done
529
        //
530
        d0 = c0 = pSrc[0];
531
        if( cData >= 2 )
532
        {
533
        d1 = c1 = pSrc[1];
534
            if( cData >= 3 )
535
            {
536
        d2 = c2 = pSrc[2];
537
                if( cData >= 4 )
538
                {
539
        d3 = c3 = pSrc[3];
540
                    if( cData >= 5 )
541
                    {
542
        d4 = c4 = pSrc[4];
543
                        if( cData >= 6 )
544
                        {
545
        d5 = c5 = pSrc[5];
546
                            if( cData >= 7 )
547
                            {
548
        d6 = c6 = pSrc[6];
549
                            }
550
                        }
551
                    }
552
                }
553
            }
554
        }
555

556
        //
557
        // Decrypt 1, 4, or 8 blocks in AES-CBC mode. This might decrypt uninitialized registers,
558
        // but those will not be used when we store the results.
559
        //
560
        if( cData > 4 )
561
        {
562
            AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
563
            c0 = veorq_u8( c0, chain );
564
            c1 = veorq_u8( c1, d0 );
565
            c2 = veorq_u8( c2, d1 );
566
            c3 = veorq_u8( c3, d2 );
567
            c4 = veorq_u8( c4, d3 );
568
            c5 = veorq_u8( c5, d4 );
569
            c6 = veorq_u8( c6, d5 );
570
        }
571
        else if( cData > 1 )
572
        {
573
            AES_DECRYPT_4( pExpandedKey, c0, c1, c2, c3 );
574
            c0 = veorq_u8( c0, chain );
575
            c1 = veorq_u8( c1, d0 );
576
            c2 = veorq_u8( c2, d1 );
577
            c3 = veorq_u8( c3, d2 );
578
        } else
579
        {
580
            AES_DECRYPT_1( pExpandedKey, c0 );
581
            c0 = veorq_u8( c0, chain );
582
        }
583

584
        chain = pSrc[ cData - 1];
585
        pDst[0] = c0;
586
        if( cData >= 2 )
587
        {
588
        pDst[1] = c1;
589
            if( cData >= 3 )
590
            {
591
        pDst[2] = c2;
592
                if( cData >= 4 )
593
                {
594
        pDst[3] = c3;
595
                    if( cData >= 5 )
596
                    {
597
        pDst[4] = c4;
598
                        if( cData >= 6 )
599
                        {
600
        pDst[5] = c5;
601
                            if( cData >= 7 )
602
                            {
603
        pDst[6] = c6;
604
                            }
605
                        }
606
                    }
607
                }
608
            }
609
        }
610
    }
611

612
    *(__n128 *)pbChainingValue = chain;
613

614
    return;
615
}
616
#pragma runtime_checks( "u", restore )
617
#pragma warning( pop )
618

619

620

621
VOID
622
SYMCRYPT_CALL
623
SymCryptAesCbcMacNeon(
624
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
625
    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE )  PBYTE                       pbChainingValue,
626
    _In_reads_( cbData )                        PCBYTE                      pbData,
627
                                                SIZE_T                      cbData )
628
{
629
    __n128 c = *(__n128 *)pbChainingValue;
630
    __n128 rk0 = *(__n128 *) &pExpandedKey->RoundKey[0];
631
    __n128 rkLast = *(__n128 *) pExpandedKey->lastEncRoundKey;
632
    __n128 d, rk0AndLast;
633

634
    // This algorithm is dominated by chain of dependent AES rounds, so we want to avoid EOR
635
    // instructions on the critical path where possible
636
    // We can compute (last round key ^ this plaintext block ^ first round key) off the critical
637
    // path and use this with AES_ENCRYPT_1_CHAIN so that only AES instructions write to c in
638
    // the main loop
639
    rk0AndLast = veorq_u8( rk0, rkLast );
640

641
    c = veorq_u8( c, rkLast );
642

643
    while( cbData >= SYMCRYPT_AES_BLOCK_SIZE )
644
    {
645
        d = veorq_u8( *(__n128 *)pbData, rk0AndLast);
646
        AES_ENCRYPT_1_CHAIN( pExpandedKey, c, d );
647

648
        pbData += SYMCRYPT_AES_BLOCK_SIZE;
649
        cbData -= SYMCRYPT_AES_BLOCK_SIZE;
650
    }
651
    *(__n128 *)pbChainingValue = veorq_u8( c, rkLast );
652
}
653

654
// Disable warnings and VC++ runtime checks for use of uninitialized values (by design)
655
#pragma warning(push)
656
#pragma warning( disable: 6001 4701 )
657
#pragma runtime_checks( "u", off )
658
VOID
659
SYMCRYPT_CALL
660
SymCryptAesEcbEncryptNeon(
661
    _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
662
    _In_reads_( cbData )                        PCBYTE                      pbSrc,
663
    _Out_writes_( cbData )                      PBYTE                       pbDst,
664
                                                SIZE_T                      cbData )
665
{
666
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;
667
    const __n128 * pSrc = (const __n128 *) pbSrc;
668
    __n128 * pDst = (__n128 *) pbDst;
669

670
    while( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
671
    {
672
        c0 = pSrc[0];
673
        c1 = pSrc[1];
674
        c2 = pSrc[2];
675
        c3 = pSrc[3];
676
        c4 = pSrc[4];
677
        c5 = pSrc[5];
678
        c6 = pSrc[6];
679
        c7 = pSrc[7];
680

681
        AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
682

683
        pDst[0] = c0;
684
        pDst[1] = c1;
685
        pDst[2] = c2;
686
        pDst[3] = c3;
687
        pDst[4] = c4;
688
        pDst[5] = c5;
689
        pDst[6] = c6;
690
        pDst[7] = c7;
691

692
        pSrc  += 8;
693
        pDst  += 8;
694
        cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
695
    }
696

697
    if( cbData < 16 )
698
    {
699
        return;
700
    }
701

702
    c0 = pSrc[0];
703
    if( cbData >= 32 )
704
    {
705
    c1 = pSrc[1];
706
        if( cbData >= 48 )
707
        {
708
    c2 = pSrc[2];
709
            if( cbData >= 64 )
710
            {
711
    c3 = pSrc[3];
712
                if( cbData >= 80 )
713
                {
714
    c4 = pSrc[4];
715
                    if( cbData >= 96 )
716
                    {
717
    c5 = pSrc[5];
718
                        if( cbData >= 112 )
719
                        {
720
    c6 = pSrc[6];
721
                        }
722
                    }
723
                }
724
            }
725
        }
726
    }
727

728
    if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
729
    {
730
        AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
731
    }
732
    else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
733
    {
734
        AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
735
    }
736
    else
737
    {
738
        AES_ENCRYPT_1( pExpandedKey, c0 );
739
    }
740

741
    pDst[0] = c0;
742
    if( cbData >= 32 )
743
    {
744
    pDst[1] = c1;
745
        if( cbData >= 48 )
746
        {
747
    pDst[2] = c2;
748
            if( cbData >= 64 )
749
            {
750
    pDst[3] = c3;
751
                if( cbData >= 80 )
752
                {
753
    pDst[4] = c4;
754
                    if( cbData >= 96 )
755
                    {
756
    pDst[5] = c5;
757
                        if( cbData >= 112 )
758
                        {
759
    pDst[6] = c6;
760
                        }
761
                    }
762
                }
763
            }
764
        }
765
    }
766
}
767
#pragma runtime_checks( "u", restore)
768
#pragma warning( pop )
769

770
#pragma warning(push)
#pragma warning( disable:4701 ) // "Use of uninitialized variable"
#pragma runtime_checks( "u", off )

//
// aes-pattern.c is included twice as a template, once per counter width. Before each
// inclusion the three macros below select the generated name and the lane width of the
// vector add/subtract intrinsics; they are undefined again afterwards so the second
// inclusion can redefine them.
// NOTE(review): aes-pattern.c is not visible here; presumably it defines a function named
// SYMCRYPT_AesCtrMsbXxNeon that uses VADDQ_UXX / VSUBQ_UXX for counter arithmetic - confirm.
//

// 64-bit counter variant
#define SYMCRYPT_AesCtrMsbXxNeon    SymCryptAesCtrMsb64Neon
#define VADDQ_UXX                   vaddq_u64
#define VSUBQ_UXX                   vsubq_u64

#include "aes-pattern.c"

#undef VSUBQ_UXX
#undef VADDQ_UXX
#undef SYMCRYPT_AesCtrMsbXxNeon

// 32-bit counter variant
#define SYMCRYPT_AesCtrMsbXxNeon    SymCryptAesCtrMsb32Neon
#define VADDQ_UXX                   vaddq_u32
#define VSUBQ_UXX                   vsubq_u32

#include "aes-pattern.c"

#undef VSUBQ_UXX
#undef VADDQ_UXX
#undef SYMCRYPT_AesCtrMsbXxNeon

#pragma runtime_checks( "u", restore )
#pragma warning(pop)
796

797

798
//
// Multiply by alpha (XTS tweak update macros)
//
// Notation used in the comments below:
//   << / >>     indicate shifts on 128-bit values
//   <<<< / >>>> indicate shifts on 32-bit values
//
// These macros reference vZero (and, for some, vAlphaMask / vAlphaMultiplier) from the
// calling context.
//

// Multiply by ALPHA (older formulation using 32-bit lane shifts)
// t1 = Input <<<< 1                        words shifted left by 1
// t2 = Input >>>> 31                       words shifted right by 31
// t1 = t1 ^ (t2 << 32)                     t1 = S << 1
// t2 = t2 >> 96                            t2 = highest bit of S
// t2 = (t2 <<<< 7) + (t2 <<<< 3) - (t2)    multiply polynomially by 0x87; we can use - because we only have one bit of input
// res = t1 ^ t2
//
#define XTS_MUL_ALPHA_old( _in, _res ) \
{\
    __n128 _t1, _t2;\
\
    _t1 = vshlq_n_u32( _in, 1 ); \
    _t2 = vshrq_n_u32( _in, 31); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3); \
    _t2 = vsubq_u32( vaddq_u32( vshlq_n_u32( _t2, 7 ), vshlq_n_u32( _t2, 3 ) ), _t2 ); \
    _res = veorq_u32( _t1, _t2 ); \
}
824

825
//
// Multiply by ALPHA, byte-wise approach: use a signed shift right to replicate the top bit
// of every byte across that byte, rotate the result up by one byte position so each carry
// lands on the next byte (and the carry out of the last byte lands on the first byte),
// then use an AND to mask the modulo reduction and the extraneous bits in the other bytes
// at the same time.
// vAlphaMask = (1, 1, ..., 1, 0x87 ), taken from the calling context.
//
#define XTS_MUL_ALPHA( _in, _res ) \
{\
    __n128 _t1, _t2;\
\
    _t1 = vshlq_n_u8( _in, 1 ); \
    _t2 = vshrq_n_s8( _in, 7 ); \
    _t2 = vextq_u8( _t2, _t2, 15 ); \
    _t2 = vandq_u8( _t2, vAlphaMask ); \
    _res = veorq_u8( _t2, _t1 ); \
}
840

841

842
// Multiply by ALPHA^2
// t1 = Input <<<< 2
// t2 = Input >>>> 30
// t1 = t1 ^ (t2 << 32)
// t2 = t2 >> 96                                        the 2 bits shifted out of the top
// t2 = (t2 <<<< 7) ^ (t2 <<<< 2) ^ (t2 <<<< 1) ^ t2    polynomial multiply by 0x87
// res = t1 ^ t2
// Uses vZero from the calling context.
#define XTS_MUL_ALPHA2( _in, _res ) \
{\
    __n128 _t1, _t2;\
\
    _t1 = vshlq_n_u32( _in, 2 ); \
    _t2 = vshrq_n_u32( _in, 30); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3 ); \
    _t2 = veorq_u32( veorq_u32( veorq_u32( _t2, vshlq_n_u32( _t2, 7 )), vshlq_n_u32( _t2, 2 ) ), vshlq_n_u32( _t2, 1 ) ); \
    _res = veorq_u32( _t1, _t2 ); \
}
860

861
// Multiply by ALPHA^4
// t1 = Input <<<< 4
// t2 = Input >>>> 28
// t1 = t1 ^ (t2 << 32)
// t2 = t2 >> 96                                        the 4 bits shifted out of the top
// t2 = (t2 <<<< 7) ^ (t2 <<<< 2) ^ (t2 <<<< 1) ^ t2    polynomial multiply by 0x87
// res = t1 ^ t2
// Uses vZero from the calling context.
#define XTS_MUL_ALPHA4( _in, _res ) \
{\
    __n128 _t1, _t2;\
\
    _t1 = vshlq_n_u32( _in, 4 ); \
    _t2 = vshrq_n_u32( _in, 28); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3 ); \
    _t2 = veorq_u32( veorq_u32( veorq_u32( _t2, vshlq_n_u32( _t2, 7 )), vshlq_n_u32( _t2, 2 ) ), vshlq_n_u32( _t2, 1 ) ); \
    _res = veorq_u32( _t1, _t2 ); \
}
879

880
// Multiply by ALPHA^5 (same scheme as ALPHA^2 / ALPHA^4 with shift counts 5 and 27)
// t1 = Input <<<< 5
// t2 = Input >>>> 27
// t1 = t1 ^ (t2 << 32)
// t2 = t2 >> 96                                        the 5 bits shifted out of the top
// t2 = (t2 <<<< 7) ^ (t2 <<<< 2) ^ (t2 <<<< 1) ^ t2    polynomial multiply by 0x87
// res = t1 ^ t2
// Uses vZero from the calling context.
#define XTS_MUL_ALPHA5( _in, _res ) \
{\
    __n128 _t1, _t2;\
\
    _t1 = vshlq_n_u32( _in, 5 ); \
    _t2 = vshrq_n_u32( _in, 27); \
    _t1 = veorq_u32( _t1, vextq_u32( vZero, _t2, 3 )); \
    _t2 = vextq_u32( _t2, vZero, 3 ); \
    _t2 = veorq_u32( veorq_u32( veorq_u32( _t2, vshlq_n_u32( _t2, 7 )), vshlq_n_u32( _t2, 2 ) ), vshlq_n_u32( _t2, 1 ) ); \
    _res = veorq_u32( _t1, _t2 ); \
}
891

892
// Multiply by ALPHA^8
// res = (Input << 8) | (Input >> 120)      rotate the 128-bit value by one byte
// t2 = (Input >> 120) * 0x86               polynomial (carry-less) multiply
//      i.e. ((Input >> 120) <<<< 7) ^ ((Input >> 120) <<<< 2) ^ ((Input >> 120) <<<< 1)
//           the 0x01 component is already in res where we want it
// res = res ^ t2
//
// vAlphaMultiplier = (0, 0, ..., 0, 0x86 ), taken from the calling context.
// Unlike the other XTS_MUL_ALPHA* macros, _res is written before the last read of _in's
// rotated value is complete only via _res itself, so _res is used as a working register.

#define XTS_MUL_ALPHA8( _in, _res ) \
{\
    __n128 _t2;\
\
    _res = vextq_u8( _in, _in, 15 ); \
    _t2 = vmull_p8( vget_low_p8(_res), vAlphaMultiplier ); \
    _res = veorq_u32( _res, _t2 ); \
}
909

910

911
VOID
912
SYMCRYPT_CALL
913
SymCryptXtsAesEncryptDataUnitNeon(
914
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
915
    _Inout_updates_(SYMCRYPT_AES_BLOCK_SIZE)PBYTE                       pbTweakBlock,
916
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
917
    _Out_writes_( cbData )                  PBYTE                       pbDst,
918
                                            SIZE_T                      cbData )
919
{
920
    __n128 t0, t1, t2, t3, t4, t5, t6, t7;
921
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;
922
    const __n128 vZero = vmovq_n_u8(0);
923
    const __n128 vAlphaMask = SYMCRYPT_SET_N128_U8(0x87, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
924
    const __n64 vAlphaMultiplier = SYMCRYPT_SET_N64_U64(0x0000000000000086);
925

926
    SIZE_T cbDataMain;  // number of bytes to handle in the main loop
927
    SIZE_T cbDataTail;  // number of bytes to handle in the tail loop
928
    BYTE tailBuf[2*SYMCRYPT_AES_BLOCK_SIZE];
929

930
    SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);
931

932
    // To simplify logic and unusual size processing, we handle all
933
    // data not a multiple of 8 blocks in the tail loop
934
    cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
935
    // Additionally, so that ciphertext stealing logic does not rely on
936
    // reading back from the destination buffer, when we have a non-zero
937
    // tail, we ensure that we handle at least 1 whole block in the tail
938
    //
939
    // Note that our caller has ensured we have at least 1 whole block
940
    // to process, this is checked in debug build
941
    // This means that cbDataTail is in [1,15] at this point iff there are
942
    // at least 8 whole blocks to process; so the below does not cause
943
    // cbDataTail or cbDataMain to exceed cbData
944
    cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
945
    cbDataMain = cbData - cbDataTail;
946

947
    SYMCRYPT_ASSERT(cbDataMain <= cbData);
948
    SYMCRYPT_ASSERT(cbDataTail <= cbData);
949
    SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
950

951
    t0 = *(__n128 *)pbTweakBlock;
952

953
    if( cbDataMain > 0 )
954
    {
955
        // Set up for main loop entry
956
        // NOTE: We load the first 8 blocks and store the last 8 blocks out of the loop to allow
957
        // greater instruction interleaving in the main loop.
958
        // This appears to give about 5-8% performance uplift on little (in-order) cores and has
959
        // no effect on big cores.
960
        XTS_MUL_ALPHA4( t0, t4 );
961
        XTS_MUL_ALPHA ( t0, t1 );
962
        XTS_MUL_ALPHA ( t4, t5 );
963
        XTS_MUL_ALPHA ( t1, t2 );
964
        XTS_MUL_ALPHA ( t5, t6 );
965
        XTS_MUL_ALPHA ( t2, t3 );
966
        XTS_MUL_ALPHA ( t6, t7 );
967

968
        c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
969
        c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
970
        c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
971
        c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
972
        c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
973
        c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
974
        c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
975
        c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
976

977
        for(;;)
978
        {
979
            pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
980

981
            AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
982

983
            cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
984
            if( cbDataMain < 8 * SYMCRYPT_AES_BLOCK_SIZE )
985
            {
986
                break;
987
            }
988

989
            // Interleave the final xor, write, and compute next tweak block, and load, and first xor.
990
            // This reduces register pressure and is more efficient.
991
            vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
992
            vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
993
            vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
994
            vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
995
            vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
996
            vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
997
            vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
998
            vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
999

1000
            XTS_MUL_ALPHA8( t0, t0 );
1001
            XTS_MUL_ALPHA8( t1, t1 );
1002
            XTS_MUL_ALPHA8( t2, t2 );
1003
            XTS_MUL_ALPHA8( t3, t3 );
1004
            XTS_MUL_ALPHA8( t4, t4 );
1005
            XTS_MUL_ALPHA8( t5, t5 );
1006
            XTS_MUL_ALPHA8( t6, t6 );
1007
            XTS_MUL_ALPHA8( t7, t7 );
1008

1009
            c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
1010
            c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
1011
            c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
1012
            c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
1013
            c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
1014
            c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
1015
            c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
1016
            c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
1017

1018
            pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1019
        }
1020

1021
        vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
1022
        vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
1023
        vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
1024
        vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
1025
        vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
1026
        vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
1027
        vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
1028
        vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
1029

1030
        // We won't do another 8-block set
1031
        // Update only the first tweak block in case it is needed for tail
1032
        XTS_MUL_ALPHA8( t0, t0 );
1033

1034
        pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1035
    }
1036

1037
    if( cbDataTail == 0 )
1038
    {
1039
        return; // <-- expected case; early return here
1040
    }
1041

1042
    // Rare case, with data unit length not being multiple of 128 bytes, handle the tail one block at a time
1043
    while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
1044
    {
1045
        c0 = veorq_u32( vld1q_u8(pbSrc), t0 );
1046
        pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
1047
        AES_ENCRYPT_1( pExpandedKey, c0 );
1048
        vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1049
        pbDst += SYMCRYPT_AES_BLOCK_SIZE;
1050
        XTS_MUL_ALPHA( t0, t0 );
1051
        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1052
    }
1053

1054
    if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
1055
    {
1056
        // Ciphertext stealing encryption
1057
        //
1058
        //                      +--------------+
1059
        //                      |              |
1060
        //                      |              V
1061
        // +-----------------+  |  +-----+-----------+
1062
        // |      P_m-1      |  |  | P_m |++++CP+++++|
1063
        // +-----------------+  |  +-----+-----------+
1064
        //          |           |           |
1065
        //       enc_m-1        |         enc_m
1066
        //          |           |           |
1067
        //          V           |           V
1068
        // +-----+-----------+  |  +-----------------+
1069
        // | C_m |++++CP+++++|--+  |      C_m-1      |
1070
        // +-----+-----------+     +-----------------+
1071
        //    |                   /
1072
        //    +----------------  /  --+
1073
        //                      /     |
1074
        //                      |     V
1075
        // +-----------------+  |  +-----+
1076
        // |      C_m-1      |<-+  | C_m |
1077
        // +-----------------+     +-----+
1078

1079
        // Encrypt penultimate plaintext block into tailBuf
1080
        c0 = veorq_u32( vld1q_u8(pbSrc), t0 );
1081
        AES_ENCRYPT_1( pExpandedKey, c0 );
1082
        c0 = veorq_u32( c0, t0 );
1083
        vst1q_u8( &tailBuf[0], c0 );
1084
        vst1q_u8( &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], c0 );
1085

1086
        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1087

1088
        // Copy final plaintext bytes to prefix of tailBuf - we must read before writing to support in-place encryption
1089
        memcpy( &tailBuf[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
1090
        // Copy prefix of tailBuf[SYMCRYPT_AES_BLOCK_SIZE] to the right place in the destination buffer
1091
        memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], cbDataTail );
1092

1093
        // Do final tweak update
1094
        XTS_MUL_ALPHA( t0, t0 );
1095

1096
        // Load updated tailBuf into c0
1097
        c0 = vld1q_u8( &tailBuf[0] );
1098
    } else {
1099
        // Just load final plaintext block into c0
1100
        c0 = vld1q_u8( pbSrc );
1101
    }
1102

1103
    // Final full block encryption
1104
    c0 = veorq_u32( c0, t0 );
1105
    AES_ENCRYPT_1( pExpandedKey, c0 );
1106
    vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1107
}
1108

1109

1110
VOID
1111
SYMCRYPT_CALL
1112
SymCryptXtsAesDecryptDataUnitNeon(
1113
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1114
    _Inout_updates_(SYMCRYPT_AES_BLOCK_SIZE)PBYTE                       pbTweakBlock,
1115
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
1116
    _Out_writes_( cbData )                  PBYTE                       pbDst,
1117
                                            SIZE_T                      cbData )
1118
{
1119
    __n128 t0, t1, t2, t3, t4, t5, t6, t7;
1120
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;
1121
    const __n128 vZero = vmovq_n_u8(0);
1122
    const __n128 vAlphaMask = SYMCRYPT_SET_N128_U8(0x87, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
1123
    const __n64 vAlphaMultiplier = SYMCRYPT_SET_N64_U64(0x0000000000000086);
1124

1125
    SIZE_T cbDataMain;  // number of bytes to handle in the main loop
1126
    SIZE_T cbDataTail;  // number of bytes to handle in the tail loop
1127
    BYTE tailBuf[2*SYMCRYPT_AES_BLOCK_SIZE];
1128

1129
    SYMCRYPT_ASSERT(cbData >= SYMCRYPT_AES_BLOCK_SIZE);
1130

1131
    // To simplify logic and unusual size processing, we handle all
1132
    // data not a multiple of 8 blocks in the tail loop
1133
    cbDataTail = cbData & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1);
1134
    // Additionally, so that ciphertext stealing logic does not rely on
1135
    // reading back from the destination buffer, when we have a non-zero
1136
    // tail, we ensure that we handle at least 1 whole block in the tail
1137
    //
1138
    // Note that our caller has ensured we have at least 1 whole block
1139
    // to process, this is checked in debug build
1140
    // This means that cbDataTail is in [1,15] at this point iff there are
1141
    // at least 8 whole blocks to process; so the below does not cause
1142
    // cbDataTail or cbDataMain to exceed cbData
1143
    cbDataTail += ((cbDataTail > 0) && (cbDataTail < SYMCRYPT_AES_BLOCK_SIZE)) ? (8*SYMCRYPT_AES_BLOCK_SIZE) : 0;
1144
    cbDataMain = cbData - cbDataTail;
1145

1146
    SYMCRYPT_ASSERT(cbDataMain <= cbData);
1147
    SYMCRYPT_ASSERT(cbDataTail <= cbData);
1148
    SYMCRYPT_ASSERT((cbDataMain & ((8*SYMCRYPT_AES_BLOCK_SIZE)-1)) == 0);
1149

1150
    t0 = *(__n128 *)pbTweakBlock;
1151
    t7 = t0;
1152

1153
    if( cbDataMain > 0 )
1154
    {
1155
        // Set up for main loop entry
1156
        // NOTE: We load the first 8 blocks and store the last 8 blocks out of the loop to allow
1157
        // greater instruction interleaving in the main loop.
1158
        // This appears to give about 5-8% performance uplift on little (in-order) cores and has
1159
        // no effect on big cores.
1160
        XTS_MUL_ALPHA4( t0, t4 );
1161
        XTS_MUL_ALPHA ( t0, t1 );
1162
        XTS_MUL_ALPHA ( t4, t5 );
1163
        XTS_MUL_ALPHA ( t1, t2 );
1164
        XTS_MUL_ALPHA ( t5, t6 );
1165
        XTS_MUL_ALPHA ( t2, t3 );
1166
        XTS_MUL_ALPHA ( t6, t7 );
1167

1168
        c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
1169
        c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
1170
        c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
1171
        c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
1172
        c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
1173
        c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
1174
        c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
1175
        c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
1176

1177
        for(;;)
1178
        {
1179
            pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1180

1181
            AES_DECRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
1182

1183
            cbDataMain -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
1184
            if( cbDataMain < 8 * SYMCRYPT_AES_BLOCK_SIZE )
1185
            {
1186
                break;
1187
            }
1188

1189
            // Interleave the final xor, write, and compute next tweak block, and load, and first xor.
1190
            // This reduces register pressure and is more efficient.
1191
            vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
1192
            vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
1193
            vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
1194
            vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
1195
            vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
1196
            vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
1197
            vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
1198
            vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
1199

1200
            XTS_MUL_ALPHA8( t0, t0 );
1201
            XTS_MUL_ALPHA8( t1, t1 );
1202
            XTS_MUL_ALPHA8( t2, t2 );
1203
            XTS_MUL_ALPHA8( t3, t3 );
1204
            XTS_MUL_ALPHA8( t4, t4 );
1205
            XTS_MUL_ALPHA8( t5, t5 );
1206
            XTS_MUL_ALPHA8( t6, t6 );
1207
            XTS_MUL_ALPHA8( t7, t7 );
1208

1209
            c0 = veorq_u32( vld1q_u8( pbSrc + (0*16) ), t0 );
1210
            c1 = veorq_u32( vld1q_u8( pbSrc + (1*16) ), t1 );
1211
            c2 = veorq_u32( vld1q_u8( pbSrc + (2*16) ), t2 );
1212
            c3 = veorq_u32( vld1q_u8( pbSrc + (3*16) ), t3 );
1213
            c4 = veorq_u32( vld1q_u8( pbSrc + (4*16) ), t4 );
1214
            c5 = veorq_u32( vld1q_u8( pbSrc + (5*16) ), t5 );
1215
            c6 = veorq_u32( vld1q_u8( pbSrc + (6*16) ), t6 );
1216
            c7 = veorq_u32( vld1q_u8( pbSrc + (7*16) ), t7 );
1217

1218
            pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1219
        }
1220

1221
        vst1q_u8( pbDst + (0*16), veorq_u32( c0, t0 ) );
1222
        vst1q_u8( pbDst + (1*16), veorq_u32( c1, t1 ) );
1223
        vst1q_u8( pbDst + (2*16), veorq_u32( c2, t2 ) );
1224
        vst1q_u8( pbDst + (3*16), veorq_u32( c3, t3 ) );
1225
        vst1q_u8( pbDst + (4*16), veorq_u32( c4, t4 ) );
1226
        vst1q_u8( pbDst + (5*16), veorq_u32( c5, t5 ) );
1227
        vst1q_u8( pbDst + (6*16), veorq_u32( c6, t6 ) );
1228
        vst1q_u8( pbDst + (7*16), veorq_u32( c7, t7 ) );
1229

1230
        // We won't do another 8-block set
1231
        // Update only the first tweak block in case it is needed for tail
1232
        XTS_MUL_ALPHA8( t0, t0 );
1233

1234
        pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
1235
    }
1236

1237
    if( cbDataTail == 0 )
1238
    {
1239
        return; // <-- expected case; early return here
1240
    }
1241

1242
    // Rare case, with data unit length not being multiple of 128 bytes, handle the tail one block at a time
1243
    while( cbDataTail >= 2*SYMCRYPT_AES_BLOCK_SIZE )
1244
    {
1245
        c0 = veorq_u32( vld1q_u8( pbSrc ), t0 );
1246
        pbSrc += SYMCRYPT_AES_BLOCK_SIZE;
1247
        AES_DECRYPT_1( pExpandedKey, c0 );
1248
        vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1249
        pbDst += SYMCRYPT_AES_BLOCK_SIZE;
1250
        XTS_MUL_ALPHA( t0, t0 );
1251
        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1252
    }
1253

1254
    if( cbDataTail > SYMCRYPT_AES_BLOCK_SIZE )
1255
    {
1256
        // Ciphertext stealing decryption
1257
        //
1258
        //                      +--------------+
1259
        //                      |              |
1260
        //                      |              V
1261
        // +-----------------+  |  +-----+-----------+
1262
        // |      C_m-1      |  |  | C_m |++++CP+++++|
1263
        // +-----------------+  |  +-----+-----------+
1264
        //          |           |           |
1265
        //        dec_m         |        dec_m-1
1266
        //          |           |           |
1267
        //          V           |           V
1268
        // +-----+-----------+  |  +-----------------+
1269
        // | P_m |++++CP+++++|--+  |      P_m-1      |
1270
        // +-----+-----------+     +-----------------+
1271
        //    |                   /
1272
        //    +----------------  /  --+
1273
        //                      /     |
1274
        //                      |     V
1275
        // +-----------------+  |  +-----+
1276
        // |      P_m-1      |<-+  | P_m |
1277
        // +-----------------+     +-----+
1278

1279
        // Do final tweak update into t1
1280
        // Penultimate tweak is in t0, ready for final decryption
1281
        XTS_MUL_ALPHA( t0, t1 );
1282

1283
        // Decrypt penultimate ciphertext block into tailBuf
1284
        c0 = veorq_u32( vld1q_u8( pbSrc ), t1 );
1285
        AES_DECRYPT_1( pExpandedKey, c0 );
1286
        c0 = veorq_u32( c0, t1 );
1287
        vst1q_u8( &tailBuf[0], c0 );
1288
        vst1q_u8( &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], c0 );
1289

1290
        cbDataTail -= SYMCRYPT_AES_BLOCK_SIZE;
1291

1292
        // Copy final ciphertext bytes to prefix of tailBuf - we must read before writing to support in-place decryption
1293
        memcpy( &tailBuf[0], pbSrc + SYMCRYPT_AES_BLOCK_SIZE, cbDataTail );
1294
        // Copy prefix of tailBuf[SYMCRYPT_AES_BLOCK_SIZE] to the right place in the destination buffer
1295
        memcpy( pbDst + SYMCRYPT_AES_BLOCK_SIZE, &tailBuf[SYMCRYPT_AES_BLOCK_SIZE], cbDataTail );
1296

1297
        // Load updated tailBuf into c0
1298
        c0 = vld1q_u8( &tailBuf[0] );
1299
    } else {
1300
        // Just load final ciphertext block into c0
1301
        c0 = vld1q_u8( pbSrc );
1302
    }
1303

1304
    // Final full block decryption
1305
    c0 = veorq_u32( c0, t0 );
1306
    AES_DECRYPT_1( pExpandedKey, c0 );
1307
    vst1q_u8( pbDst, veorq_u32( c0, t0 ) );
1308
}
1309

1310
#include "ghash_definitions.h"

//
// One full AES encryption round (AESE + AESMC) on 4 blocks, stitched with the
// multiplication step of GHASH for one 16-byte block.
//
//  c0..c3                  AES states, updated in place using roundKey, which must be
//                          a variable in scope at the expansion site
//  r0, r0x, t0, t1         scratch vectors supplied by the caller
//  gHashPointer            pointer to the next 16-byte block to hash; advanced by one block
//  gHashExpandedKeyTable   table passed to GHASH_H_POWER / GHASH_Hx_POWER
//  todo                    index passed to GHASH_H_POWER / GHASH_Hx_POWER; decremented by one
//  resl, resm, resh        accumulators; one product is XORed into each
//
// NOTE(review): GHASH_H_POWER, GHASH_Hx_POWER and vmullq_p64 are defined outside this part of
// the file; the low/middle/high naming suggests a 3-multiplication (Karatsuba-style) product
// that is combined and reduced later - confirm against ghash_definitions.h.
//
#define AES_ENCRYPT_ROUND_4_GHASH_1( c0, c1, c2, c3, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
\
    /* Load one block; r0 = all 16 bytes reversed, r0x = r0 XOR r0 with its 64-bit halves swapped */ \
    r0x = *gHashPointer; \
    r0x = vrev64q_u8( r0x ); \
    r0 = vextq_u8( r0x, r0x, 8 ); \
    r0x = veorq_u8( r0, r0x ); \
    gHashPointer++; \
\
    /* Two products of r0 with the H power selected by todo */ \
    t1 = GHASH_H_POWER(gHashExpandedKeyTable, todo); \
    t0 = vmullq_p64( r0, t1 ); \
    t1 = vmull_high_p64( r0, t1 ); \
\
    resl = veorq_u8( resl, t0 ); \
    resh = veorq_u8( resh, t1 ); \
\
    /* Third product, from the folded value r0x and the matching Hx table entry */ \
    t1 = GHASH_Hx_POWER(gHashExpandedKeyTable, todo); \
    t1 = vmullq_p64( r0x, t1 ); \
\
    resm = veorq_u8( resm, t1 ); \
    todo--; \
};

//
// Using a loop with AESE_AESMC and AESD_AESIMC, the compiler can still prematurely rearrange the loop and
// lose opportunity for scheduling adjacent pairs.
// Instead, explicitly unroll the AES rounds with this macro.
//
// AES-encrypts the 4 blocks c0..c3 in place with pExpandedKey while performing GHASH
// multiplication steps during the first gHashRounds AES rounds.
//
//  pExpandedKey            pointer to the expanded AES key (RoundKey / lastEncRoundKey are read)
//  c0..c3                  AES states, encrypted in place
//  gHashPointer            pointer to the data to hash; advanced by gHashRounds blocks
//  gHashRounds             number of stitched rounds; must not exceed 9, because the second
//                          loop runs (9 - gHashRounds) times with an unsigned counter
//  gHashExpandedKeyTable, todo, resl, resm, resh
//                          forwarded to AES_ENCRYPT_ROUND_4_GHASH_1; todo is decremented once
//                          per stitched round
//
// The macro declares locals named keyPtr, keyLimit, roundKey, t0, t1, r0, r0x and
// aesEncryptGhashLoop, so arguments must not use those names.
//
// NOTE(review): AES_ENCRYPT_ROUND_4 and AES_ENCRYPT_FINAL_4 are invoked with c4..c7, which are
// not parameters of this macro and therefore resolve to whatever those names mean at the
// expansion site. Their definitions are outside this part of the file - confirm this is intended.
//
#define AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, gHashPointer, gHashRounds, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)&(pExpandedKey)->RoundKey[0]; \
    keyLimit = (const __n128 *)(pExpandedKey)->lastEncRoundKey; \
    __n128 t0, t1, r0, r0x; \
    SIZE_T aesEncryptGhashLoop; \
\
    /* Do gHashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    /* roundKey always holds the key for the next round; keyPtr points just past it */ \
    roundKey = *keyPtr++; \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < (gHashRounds); aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_4_GHASH_1( c0, c1, c2, c3, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
        roundKey = *keyPtr++; \
    } \
\
    /* Do 9-gHashRounds full rounds (AES-128|AES-192|AES-256) */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < (9-(gHashRounds)); aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
    } \
\
    /* keyPtr is compared against lastEncRoundKey to tell how many more rounds the key size needs */ \
    if ( keyPtr < keyLimit ) \
    { \
        /* Do 2 more full rounds (AES-192|AES-256) */ \
        AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
\
        if ( keyPtr < keyLimit ) \
        { \
            /* Do 2 more full rounds (AES-256) */ \
            AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
            AES_ENCRYPT_ROUND_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
        } \
    } \
\
    /* Do final round (AES-128|AES-192|AES-256) */ \
    AES_ENCRYPT_FINAL_4( c0, c1, c2, c3, c4, c5, c6, c7 ) \
};

//
// One full AES encryption round (AESE + AESMC) on 8 blocks, stitched with the
// multiplication step of GHASH for one 16-byte block.
//
//  c0..c7                  AES states, updated in place using roundKey, which must be
//                          a variable in scope at the expansion site
//  r0, r0x, t0, t1         scratch vectors supplied by the caller
//  gHashPointer            pointer to the next 16-byte block to hash; advanced by one block
//  gHashExpandedKeyTable   table passed to GHASH_H_POWER / GHASH_Hx_POWER
//  todo                    index passed to GHASH_H_POWER / GHASH_Hx_POWER; decremented by one
//  resl, resm, resh        accumulators; one product is XORed into each
//
// NOTE(review): GHASH_H_POWER, GHASH_Hx_POWER and vmullq_p64 are defined outside this part of
// the file; the low/middle/high naming suggests a 3-multiplication (Karatsuba-style) product
// that is combined and reduced later - confirm against ghash_definitions.h.
//
#define AES_ENCRYPT_ROUND_8_GHASH_1( c0, c1, c2, c3, c4, c5, c6, c7, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    AESE_AESMC( c0, roundKey ) \
    AESE_AESMC( c1, roundKey ) \
    AESE_AESMC( c2, roundKey ) \
    AESE_AESMC( c3, roundKey ) \
    AESE_AESMC( c4, roundKey ) \
    AESE_AESMC( c5, roundKey ) \
    AESE_AESMC( c6, roundKey ) \
    AESE_AESMC( c7, roundKey ) \
\
    /* Load one block; r0 = all 16 bytes reversed, r0x = r0 XOR r0 with its 64-bit halves swapped */ \
    r0x = *gHashPointer; \
    r0x = vrev64q_u8( r0x ); \
    r0 = vextq_u8( r0x, r0x, 8 ); \
    r0x = veorq_u8( r0, r0x ); \
    gHashPointer++; \
\
    /* Two products of r0 with the H power selected by todo */ \
    t1 = GHASH_H_POWER(gHashExpandedKeyTable, todo); \
    t0 = vmullq_p64( r0, t1 ); \
    t1 = vmull_high_p64( r0, t1 ); \
\
    resl = veorq_u8( resl, t0 ); \
    resh = veorq_u8( resh, t1 ); \
\
    /* Third product, from the folded value r0x and the matching Hx table entry */ \
    t1 = GHASH_Hx_POWER(gHashExpandedKeyTable, todo); \
    t1 = vmullq_p64( r0x, t1 ); \
\
    resm = veorq_u8( resm, t1 ); \
    todo--; \
};

//
// Using a loop with AESE_AESMC and AESD_AESIMC, the compiler can still prematurely rearrange the loop and
// lose opportunity for scheduling adjacent pairs.
// Instead, explicitly unroll the AES rounds with this macro.
//
// AES-encrypts the 8 blocks c0..c7 in place with pExpandedKey while performing GHASH
// multiplication steps during the first gHashRounds AES rounds.
//
//  pExpandedKey            pointer to the expanded AES key (RoundKey / lastEncRoundKey are read)
//  c0..c7                  AES states, encrypted in place
//  gHashPointer            pointer to the data to hash; advanced by gHashRounds blocks
//  gHashRounds             number of stitched rounds; must not exceed 9, because the second
//                          loop runs (9 - gHashRounds) times with an unsigned counter
//  gHashExpandedKeyTable, todo, resl, resm, resh
//                          forwarded to AES_ENCRYPT_ROUND_8_GHASH_1; todo is decremented once
//                          per stitched round
//
// The macro declares locals named keyPtr, keyLimit, roundKey, t0, t1, r0, r0x and
// aesEncryptGhashLoop, so arguments must not use those names.
//
#define AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, gHashPointer, gHashRounds, gHashExpandedKeyTable, todo, resl, resm, resh ) \
{ \
    const __n128 *keyPtr; \
    const __n128 *keyLimit; \
    __n128 roundKey; \
\
    keyPtr = (const __n128 *)&(pExpandedKey)->RoundKey[0]; \
    keyLimit = (const __n128 *)(pExpandedKey)->lastEncRoundKey; \
    __n128 t0, t1, r0, r0x; \
    SIZE_T aesEncryptGhashLoop; \
\
    /* Do gHashRounds full rounds (AES-128|AES-192|AES-256) with stitched GHASH */ \
    /* roundKey always holds the key for the next round; keyPtr points just past it */ \
    roundKey = *keyPtr++; \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < (gHashRounds); aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_8_GHASH_1( c0, c1, c2, c3, c4, c5, c6, c7, r0, r0x, t0, t1, gHashPointer, gHashExpandedKeyTable, todo, resl, resm, resh ) \
        roundKey = *keyPtr++; \
    } \
\
    /* Do 9-gHashRounds full rounds (AES-128|AES-192|AES-256) */ \
    for( aesEncryptGhashLoop = 0; aesEncryptGhashLoop < (9-(gHashRounds)); aesEncryptGhashLoop++) \
    { \
        AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
    } \
\
    /* keyPtr is compared against lastEncRoundKey to tell how many more rounds the key size needs */ \
    if ( keyPtr < keyLimit ) \
    { \
        /* Do 2 more full rounds (AES-192|AES-256) */ \
        AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
        AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
        roundKey = *keyPtr++; \
\
        if ( keyPtr < keyLimit ) \
        { \
            /* Do 2 more full rounds (AES-256) */ \
            AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
            AES_ENCRYPT_ROUND_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
            roundKey = *keyPtr++; \
        } \
    } \
\
    /* Do final round (AES-128|AES-192|AES-256) */ \
    AES_ENCRYPT_FINAL_8( c0, c1, c2, c3, c4, c5, c6, c7 ) \
};

// This call is functionally identical to:
1477
// SymCryptAesCtrMsb64Neon( pExpandedKey,
1478
//                          pbChainingValue,
1479
//                          pbSrc,
1480
//                          pbDst,
1481
//                          cbData );
1482
// SymCryptGHashAppendDataPmull(    expandedKeyTable,
1483
//                                  pState,
1484
//                                  pbDstOrig,
1485
//                                  cbDataOrig );
1486
VOID
1487
SYMCRYPT_CALL
1488
SymCryptAesGcmEncryptStitchedNeon(
1489
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1490
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PBYTE                       pbChainingValue,
1491
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
1492
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
1493
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
1494
    _Out_writes_( cbData )                  PBYTE                       pbDst,
1495
                                            SIZE_T                      cbData )
1496
{
1497
    __n128          chain = *(__n128 *)pbChainingValue;
1498
    const __n128 *  pSrc = (const __n128 *) pbSrc;
1499
    const __n128 *  pGhashSrc = (const __n128 *) pbDst;
1500
    __n128 *        pDst = (__n128 *) pbDst;
1501

1502
    const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
1503
    const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
1504
    const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
1505

1506
    __n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
1507
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;
1508
    __n128 r0, r1;
1509
    __n128 r0x, r1x;
1510

1511
    __n128 state;
1512
    __n128 a0, a1, a2;
1513
    const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
1514
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1515
    SIZE_T todo;
1516

1517
    SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1518

1519
    // Our chain variable is in integer format, not the MSBfirst format loaded from memory.
1520
    ctr0 = vrev64q_u8( chain );
1521
    ctr1 = vaddq_u32( ctr0, chainIncrement1 );
1522
    ctr2 = vaddq_u32( ctr0, chainIncrement2 );
1523
    ctr3 = vaddq_u32( ctr1, chainIncrement2 );
1524
    ctr4 = vaddq_u32( ctr2, chainIncrement2 );
1525
    ctr5 = vaddq_u32( ctr3, chainIncrement2 );
1526
    ctr6 = vaddq_u32( ctr4, chainIncrement2 );
1527
    ctr7 = vaddq_u32( ctr5, chainIncrement2 );
1528

1529
    state = *(__n128 *) pState;
1530

1531
    todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1532
    CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1533

1534
    // Do 8 blocks of CTR either for tail (if total blocks <8) or for encryption of first 8 blocks
1535
    c0 = vrev64q_u8( ctr0 );
1536
    c1 = vrev64q_u8( ctr1 );
1537
    c2 = vrev64q_u8( ctr2 );
1538
    c3 = vrev64q_u8( ctr3 );
1539
    c4 = vrev64q_u8( ctr4 );
1540
    c5 = vrev64q_u8( ctr5 );
1541
    c6 = vrev64q_u8( ctr6 );
1542
    c7 = vrev64q_u8( ctr7 );
1543

1544
    AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
1545

1546
    if ( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
1547
    {
1548
        ctr0 = vaddq_u32( ctr0, chainIncrement8 );
1549
        ctr1 = vaddq_u32( ctr1, chainIncrement8 );
1550
        ctr2 = vaddq_u32( ctr2, chainIncrement8 );
1551
        ctr3 = vaddq_u32( ctr3, chainIncrement8 );
1552
        ctr4 = vaddq_u32( ctr4, chainIncrement8 );
1553
        ctr5 = vaddq_u32( ctr5, chainIncrement8 );
1554
        ctr6 = vaddq_u32( ctr6, chainIncrement8 );
1555
        ctr7 = vaddq_u32( ctr7, chainIncrement8 );
1556

1557
        // Encrypt first 8 blocks
1558
        pDst[0] = veorq_u64( pSrc[0], c0 );
1559
        pDst[1] = veorq_u64( pSrc[1], c1 );
1560
        pDst[2] = veorq_u64( pSrc[2], c2 );
1561
        pDst[3] = veorq_u64( pSrc[3], c3 );
1562
        pDst[4] = veorq_u64( pSrc[4], c4 );
1563
        pDst[5] = veorq_u64( pSrc[5], c5 );
1564
        pDst[6] = veorq_u64( pSrc[6], c6 );
1565
        pDst[7] = veorq_u64( pSrc[7], c7 );
1566

1567
        pDst  += 8;
1568
        pSrc  += 8;
1569

1570
        while( nBlocks >= 16 )
1571
        {
1572
            // In this loop we always have 8 blocks to encrypt and we have already encrypted the previous 8 blocks ready for GHASH
1573
            c0 = vrev64q_u8( ctr0 );
1574
            c1 = vrev64q_u8( ctr1 );
1575
            c2 = vrev64q_u8( ctr2 );
1576
            c3 = vrev64q_u8( ctr3 );
1577
            c4 = vrev64q_u8( ctr4 );
1578
            c5 = vrev64q_u8( ctr5 );
1579
            c6 = vrev64q_u8( ctr6 );
1580
            c7 = vrev64q_u8( ctr7 );
1581

1582
            ctr0 = vaddq_u32( ctr0, chainIncrement8 );
1583
            ctr1 = vaddq_u32( ctr1, chainIncrement8 );
1584
            ctr2 = vaddq_u32( ctr2, chainIncrement8 );
1585
            ctr3 = vaddq_u32( ctr3, chainIncrement8 );
1586
            ctr4 = vaddq_u32( ctr4, chainIncrement8 );
1587
            ctr5 = vaddq_u32( ctr5, chainIncrement8 );
1588
            ctr6 = vaddq_u32( ctr6, chainIncrement8 );
1589
            ctr7 = vaddq_u32( ctr7, chainIncrement8 );
1590

1591
            AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1592

1593
            pDst[0] = veorq_u64( pSrc[0], c0 );
1594
            pDst[1] = veorq_u64( pSrc[1], c1 );
1595
            pDst[2] = veorq_u64( pSrc[2], c2 );
1596
            pDst[3] = veorq_u64( pSrc[3], c3 );
1597
            pDst[4] = veorq_u64( pSrc[4], c4 );
1598
            pDst[5] = veorq_u64( pSrc[5], c5 );
1599
            pDst[6] = veorq_u64( pSrc[6], c6 );
1600
            pDst[7] = veorq_u64( pSrc[7], c7 );
1601

1602
            pDst  += 8;
1603
            pSrc  += 8;
1604
            nBlocks -= 8;
1605

1606
            if (todo == 0)
1607
            {
1608
                CLMUL_3_POST( a0, a1, a2 );
1609
                MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1610

1611
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1612
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1613
            }
1614
        }
1615

1616
        // We now have at least 8 blocks of encrypted data to GHASH and at most 7 blocks left to encrypt
1617
        // Do 8 blocks of GHASH in parallel with generating 0, 4, or 8 AES-CTR blocks for tail encryption
1618
        nBlocks -= 8;
1619
        if (nBlocks > 0)
1620
        {
1621
            c0 = vrev64q_u8( ctr0 );
1622
            c1 = vrev64q_u8( ctr1 );
1623
            c2 = vrev64q_u8( ctr2 );
1624
            c3 = vrev64q_u8( ctr3 );
1625

1626
            if (nBlocks > 4)
1627
            {
1628
                // Do 8 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
1629
                c4 = vrev64q_u8( ctr4 );
1630
                c5 = vrev64q_u8( ctr5 );
1631
                c6 = vrev64q_u8( ctr6 );
1632

1633
                AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1634
            }
1635
            else
1636
            {
1637
                // Do 4 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
1638
                AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1639
            }
1640

1641
            if( todo == 0)
1642
            {
1643
                CLMUL_3_POST( a0, a1, a2 );
1644
                MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1645

1646
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1647
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1648
            }
1649
        }
1650
        else
1651
        {
1652
            // Just do the final 8 rounds of GHASH
1653
            for( todo=8; todo>0; todo-- )
1654
            {
1655
                r0x = vrev64q_u8( pGhashSrc[0] );
1656
                r0 = vextq_u8( r0x, r0x, 8 );
1657
                r0x = veorq_u8( r0, r0x );
1658
                pGhashSrc++;
1659

1660
                CLMUL_ACCX_3( r0, r0x, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1661
            }
1662

1663
            CLMUL_3_POST( a0, a1, a2 );
1664
            MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1665
        }
1666
    }
1667

1668
    if( nBlocks > 0 )
1669
    {
1670
        // Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results
1671
        while( nBlocks >= 2 )
1672
        {
1673
            ctr0 = vaddq_u32( ctr0, chainIncrement2 );
1674

1675
            r0 = veorq_u64( pSrc[0], c0 );
1676
            r1 = veorq_u64( pSrc[1], c1 );
1677

1678
            pDst[0] = r0;
1679
            pDst[1] = r1;
1680

1681
            r0x = vrev64q_u8( r0 );
1682
            r1x = vrev64q_u8( r1 );
1683
            r0 = vextq_u8( r0x, r0x, 8 );
1684
            r1 = vextq_u8( r1x, r1x, 8 );
1685
            r0x = veorq_u8( r0, r0x );
1686
            r1x = veorq_u8( r1, r1x );
1687

1688
            CLMUL_ACCX_3( r0, r0x, GHASH_H_POWER(expandedKeyTable, todo - 0), GHASH_Hx_POWER(expandedKeyTable, todo - 0), a0, a1, a2 );
1689
            CLMUL_ACCX_3( r1, r1x, GHASH_H_POWER(expandedKeyTable, todo - 1), GHASH_Hx_POWER(expandedKeyTable, todo - 1), a0, a1, a2 );
1690

1691
            pDst    += 2;
1692
            pSrc    += 2;
1693
            todo    -= 2;
1694
            nBlocks -= 2;
1695
            c0 = c2;
1696
            c1 = c3;
1697
            c2 = c4;
1698
            c3 = c5;
1699
            c4 = c6;
1700
        }
1701

1702
        if( nBlocks > 0 )
1703
        {
1704
            ctr0 = vaddq_u32( ctr0, chainIncrement1 );
1705

1706
            r0 = veorq_u64( pSrc[0], c0 );
1707
            pDst[0] = r0;
1708
            r0x = vrev64q_u8( r0 );
1709
            r0 = vextq_u8( r0x, r0x, 8 );
1710
            r0x = veorq_u8( r0, r0x );
1711

1712
            CLMUL_ACCX_3( r0, r0x, GHASH_H_POWER(expandedKeyTable, 1), GHASH_Hx_POWER(expandedKeyTable, 1), a0, a1, a2 );
1713
        }
1714

1715
        CLMUL_3_POST( a0, a1, a2 );
1716
        MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1717
    }
1718

1719
    chain = vrev64q_u8( ctr0 );
1720
    *(__n128 *)pbChainingValue = chain;
1721
    *(__n128 *)pState = state;
1722
}
1723

1724
// The warning / run-time-check suppressions below wrap SymCryptAesGcmDecryptStitchedNeon only;
// both are restored immediately after the function body.
#pragma warning(push)
1725
#pragma warning( disable:4701 ) // "Use of uninitialized variable" - the tail handling in the function below intentionally passes/copies keystream registers (c4..c7) that may never have been assigned; those values never reach the output
1726
// MSVC run-time check "u" (use of a variable before it is defined) is switched off for the same reason.
#pragma runtime_checks( "u", off )
1727
// This call is functionally identical to:
1728
// SymCryptGHashAppendDataPmull(expandedKeyTable,
1729
//                              pState,
1730
//                              pbSrc,
1731
//                              cbData );
1732
// SymCryptAesCtrMsb64Neon( pExpandedKey,
1733
//                          pbChainingValue,
1734
//                          pbSrc,
1735
//                          pbDst,
1736
//                          cbData );
//
// In other words: the source (ciphertext) blocks are folded into the GHASH state while the
// same blocks are XORed with AES-CTR keystream to produce pbDst. The two computations are
// interleaved ("stitched") inside the AES_GCM_ENCRYPT_8 / AES_GCM_ENCRYPT_4 macros.
//
// Parameters:
//  pExpandedKey      AES key schedule; only forwarded to the AES_GCM_ENCRYPT_* macros.
//  pbChainingValue   16-byte counter block. Read on entry and overwritten on exit with the
//                    counter advanced by the number of blocks processed.
//                    NOTE(review): annotated _In_reads_ although the function writes to it - confirm annotation.
//  expandedKeyTable  GHASH key table; only accessed through GHASH_H_POWER / GHASH_Hx_POWER.
//  pState            GHASH state; read on entry, written back on exit.
//  pbSrc             Input data; used both as GHASH input and as the CTR input.
//  pbDst             Output buffer receiving pbSrc XOR keystream.
//  cbData            Byte count; asserted to be a multiple of the block size.
//
// NOTE(review): the counters below are advanced with vaddq_u32 (32-bit lane adds) while the
// comment above names the Msb64 CTR routine - confirm which counter width is intended.
1737
VOID
1738
SYMCRYPT_CALL
1739
SymCryptAesGcmDecryptStitchedNeon(
1740
    _In_                                    PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
1741
    _In_reads_( SYMCRYPT_AES_BLOCK_SIZE )   PBYTE                       pbChainingValue,
1742
    _In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT    expandedKeyTable,
1743
    _Inout_                                 PSYMCRYPT_GF128_ELEMENT     pState,
1744
    _In_reads_( cbData )                    PCBYTE                      pbSrc,
1745
    _Out_writes_( cbData )                  PBYTE                       pbDst,
1746
                                            SIZE_T                      cbData )
1747
{
1748
    __n128          chain = *(__n128 *)pbChainingValue;
1749
    const __n128 *  pSrc = (const __n128 *) pbSrc;
1750
    // Separate cursor over the same input for GHASH. This function never advances it directly;
    // it is handed to the AES_GCM_ENCRYPT_* macros (presumably advanced there - confirm against the macro).
    const __n128 *  pGhashSrc = (const __n128 *) pbSrc;
1751
    __n128 *        pDst = (__n128 *) pbDst;
1752

1753
    // Counter increments of 1, 2 and 8 blocks (lane layout is determined by SYMCRYPT_SET_N128_U64).
    const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
1754
    const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
1755
    const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
1756

1757
    // ctrN: counter values in integer format; cN: byte-reversed counter blocks that the
    // AES_GCM_ENCRYPT_* macros turn into keystream blocks in place.
    __n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
1758
    __n128 c0, c1, c2, c3, c4, c5, c6, c7;
1759

1760
    __n128 state;
1761
    // Accumulators threaded through CLMUL_3 / AES_GCM_ENCRYPT_* / CLMUL_3_POST / MODREDUCE.
    __n128 a0, a1, a2;
1762
    // Constant handed to MODREDUCE; presumably encodes the GHASH reduction polynomial - TODO confirm.
    const __n64 vMultiplicationConstant = SYMCRYPT_SET_N64_U64(0xc200000000000000);
1763
    SIZE_T nBlocks = cbData / SYMCRYPT_GF128_BLOCK_SIZE;
1764
    // Selects the table power used for the next GHASH block. This function only ever assigns
    // it; the check for todo == 0 after AES_GCM_ENCRYPT_8 relies on the macro counting it down.
    SIZE_T todo;
1765

1766
    SYMCRYPT_ASSERT( (cbData & SYMCRYPT_GCM_BLOCK_MOD_MASK) == 0 ); // cbData is multiple of block size
1767

1768
    // Our chain variable is in integer format, not the MSBfirst format loaded from memory.
    // Build eight consecutive counters: ctr1 = ctr0+1, ctr2 = ctr0+2, ctr3 = ctr0+3, ... ctr7 = ctr0+7.
1769
    ctr0 = vrev64q_u8( chain );
1770
    ctr1 = vaddq_u32( ctr0, chainIncrement1 );
1771
    ctr2 = vaddq_u32( ctr0, chainIncrement2 );
1772
    ctr3 = vaddq_u32( ctr1, chainIncrement2 );
1773
    ctr4 = vaddq_u32( ctr2, chainIncrement2 );
1774
    ctr5 = vaddq_u32( ctr3, chainIncrement2 );
1775
    ctr6 = vaddq_u32( ctr4, chainIncrement2 );
1776
    ctr7 = vaddq_u32( ctr5, chainIncrement2 );
1777

1778
    state = *(__n128 *) pState;
1779

1780
    todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1781

1782
    // Seed the accumulators from the current state and the table entries selected by todo.
    // NOTE(review): when cbData == 0 this runs with todo == 0 and the result is discarded - confirm
    // that power 0 is a valid table index for GHASH_H_POWER / GHASH_Hx_POWER.
    CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1783

1784
    while( nBlocks >= 8 )
1785
    {
1786
        // In this loop we always have 8 blocks to decrypt and GHASH
1787
        c0 = vrev64q_u8( ctr0 );
1788
        c1 = vrev64q_u8( ctr1 );
1789
        c2 = vrev64q_u8( ctr2 );
1790
        c3 = vrev64q_u8( ctr3 );
1791
        c4 = vrev64q_u8( ctr4 );
1792
        c5 = vrev64q_u8( ctr5 );
1793
        c6 = vrev64q_u8( ctr6 );
1794
        c7 = vrev64q_u8( ctr7 );
1795

1796
        // Advance all eight counters past the blocks being processed in this iteration.
        ctr0 = vaddq_u32( ctr0, chainIncrement8 );
1797
        ctr1 = vaddq_u32( ctr1, chainIncrement8 );
1798
        ctr2 = vaddq_u32( ctr2, chainIncrement8 );
1799
        ctr3 = vaddq_u32( ctr3, chainIncrement8 );
1800
        ctr4 = vaddq_u32( ctr4, chainIncrement8 );
1801
        ctr5 = vaddq_u32( ctr5, chainIncrement8 );
1802
        ctr6 = vaddq_u32( ctr6, chainIncrement8 );
1803
        ctr7 = vaddq_u32( ctr7, chainIncrement8 );
1804

1805
        // Stitched step: c0..c7 become keystream while 8 source blocks are GHASHed.
        AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
1806

1807
        // CTR decryption: output = input XOR keystream.
        pDst[0] = veorq_u64( pSrc[0], c0 );
1808
        pDst[1] = veorq_u64( pSrc[1], c1 );
1809
        pDst[2] = veorq_u64( pSrc[2], c2 );
1810
        pDst[3] = veorq_u64( pSrc[3], c3 );
1811
        pDst[4] = veorq_u64( pSrc[4], c4 );
1812
        pDst[5] = veorq_u64( pSrc[5], c5 );
1813
        pDst[6] = veorq_u64( pSrc[6], c6 );
1814
        pDst[7] = veorq_u64( pSrc[7], c7 );
1815

1816
        pDst  += 8;
1817
        pSrc  += 8;
1818
        nBlocks -= 8;
1819

1820
        // Once the current batch is exhausted, fold the accumulators back into state. If todo
        // is still non-zero the accumulators are carried, unreduced, into the next iteration.
        if (todo == 0)
1821
        {
1822
            CLMUL_3_POST( a0, a1, a2 );
1823
            MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1824

1825
            // Start a new batch only if there is more data to hash.
            if ( nBlocks > 0 )
1826
            {
1827
                todo = SYMCRYPT_MIN( nBlocks, SYMCRYPT_GHASH_PMULL_HPOWERS );
1828
                CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0, a1, a2 );
1829
            }
1830
        }
1831
    }
1832

1833
    if( nBlocks > 0 )
1834
    {
1835
        // We have 1-7 blocks to GHASH and decrypt
1836
        // Do the exact number of GHASH blocks we need in parallel with generating either 4 or 8 blocks of AES-CTR
1837
        c0 = vrev64q_u8( ctr0 );
1838
        c1 = vrev64q_u8( ctr1 );
1839
        c2 = vrev64q_u8( ctr2 );
1840
        c3 = vrev64q_u8( ctr3 );
1841

1842
        if( nBlocks > 4 )
1843
        {
1844
            c4 = vrev64q_u8( ctr4 );
1845
            c5 = vrev64q_u8( ctr5 );
1846
            c6 = vrev64q_u8( ctr6 );
1847

1848
            // c7 is intentionally not refreshed: at most 7 blocks remain, so the eighth keystream
            // block is never consumed. c7 holds a stale value, or is unassigned if the 8-block loop
            // never ran - this is what the C4701 / runtime_checks suppression is for.
            AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, nBlocks, expandedKeyTable, todo, a0, a1, a2 );
1849
        } else {
1850
            AES_GCM_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3, pGhashSrc, nBlocks, expandedKeyTable, todo, a0, a1, a2 );
1851
        }
1852
        // All remaining blocks were handed to GHASH above, so finish the reduction unconditionally.
        CLMUL_3_POST( a0, a1, a2 );
1853
        MODREDUCE( vMultiplicationConstant, a0, a1, a2, state );
1854

1855
        // Decrypt 1-7 blocks with pre-generated AES-CTR blocks
1856
        while( nBlocks >= 2 )
1857
        {
1858
            // Only ctr0 is kept up to date from here on; it is the value written back as the chaining value.
            ctr0 = vaddq_u32( ctr0, chainIncrement2 );
1859

1860
            pDst[0] = veorq_u64( pSrc[0], c0 );
1861
            pDst[1] = veorq_u64( pSrc[1], c1 );
1862

1863
            pDst    += 2;
1864
            pSrc    += 2;
1865
            nBlocks -= 2;
1866
            // Shift the keystream registers down by two so c0/c1 always hold the next blocks.
            // On the 4-block path c4..c6 may be unassigned; the copies made from them are never used.
            c0 = c2;
1867
            c1 = c3;
1868
            c2 = c4;
1869
            c3 = c5;
1870
            c4 = c6;
1871
        }
1872

1873
        // Odd block left over.
        if( nBlocks > 0 )
1874
        {
1875
            ctr0 = vaddq_u32( ctr0, chainIncrement1 );
1876

1877
            pDst[0] = veorq_u64( pSrc[0], c0 );
1878
        }
1879
    }
1880

1881
    // Convert the next counter back to its in-memory byte order and publish the counter and GHASH state.
    chain = vrev64q_u8( ctr0 );
1882
    *(__n128 *)pbChainingValue = chain;
1883
    *(__n128 *)pState = state;
1884
}
1885
#pragma runtime_checks( "u", restore )
1886
#pragma warning(pop)
1887
#pragma clang attribute pop
1888

1889
#endif
1890

1891
Product

Resources

Company