GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/mbedtls/library/aesce.c
/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */
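
/*
 * Explanatory note (not part of the upstream source): the same intrinsic
 * declarations become visible without the defines above if the translation
 * unit is built with an explicit architecture flag, for example
 *
 *     clang -c aesce.c -march=armv8-a+crypto
 *
 * in which case __ARM_FEATURE_CRYPTO / __ARM_FEATURE_AES are predefined by
 * the compiler. MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG records
 * that no such flag was given, so the target("aes") attribute/pragma block
 * further down is used instead.
 */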

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_AESCE_HAVE_CODE)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies that,
 * please update this check and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies that, please update this check and the documentation
 * of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
#if !defined(HWCAP_NEON)
#define HWCAP_NEON (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be subject to non-atomic updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
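
/*
 * Illustrative sketch (not part of the upstream source): the kind of runtime
 * dispatch the detection above enables. The software fallback helper name
 * below is hypothetical.
 */
#if 0
static int crypt_one_block_example(mbedtls_aes_context *ctx,
                                   const unsigned char in[16],
                                   unsigned char out[16])
{
    if (mbedtls_aesce_has_support_impl()) {
        /* CPU advertises the AES instructions: take the hardware path. */
        return mbedtls_aesce_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, in, out);
    }
    /* Otherwise fall back to a plain C implementation (hypothetical helper). */
    return software_aes_encrypt_block_example(ctx, in, out);
}
#endif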

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
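
/*
 * For reference (an illustrative sketch, not part of the upstream source):
 * the unrolled goto ladder above is equivalent to the rolled loop below,
 * i.e. (rounds - 1) full AESE+AESMC rounds followed by a final round with
 * no MixColumns. The unrolled form avoids per-round loop overhead.
 */
#if 0
static uint8x16_t aesce_encrypt_block_rolled(uint8x16_t block,
                                             unsigned char *keys,
                                             int rounds)
{
    for (int i = 0; i < rounds - 1; i++) {
        block = vaesmcq_u8(vaeseq_u8(block, vld1q_u8(keys)));
        keys += 16;
    }
    /* AddRoundKey of the last full round + SubBytes/ShiftRows of the final round */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;
    /* Final AddRoundKey */
    return veorq_u8(block, vld1q_u8(keys));
}
#endif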

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in a single instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately; this has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
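
/*
 * Spelling out the linearity argument above (explanatory note, not upstream
 * text): for any state s and round key rk,
 *
 *     InvMixColumns(s ^ rk) == InvMixColumns(s) ^ InvMixColumns(rk)
 *
 * so the data path may apply InvMixColumns before the round-key XOR,
 * provided the stored decryption round keys have had InvMixColumns applied
 * to them once up front, which is what mbedtls_aesce_inverse_key() below
 * does with vaesimcq_u8().
 */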

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    } else
#else
    (void) mode;
#endif
    {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));

}
#endif

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}
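
/*
 * Explanatory note (not upstream text): quick sanity values for the helpers
 * above are aes_rot_word(0x000000ffu) == 0xff000000u and
 * aes_sub_word(0x00000000u) == 0x63636363u, since the AES S-box maps 0x00 to
 * 0x63. Duplicating the input word across all four lanes means every column
 * holds the same bytes, so after SubBytes and ShiftRows lane 0 is exactly
 * SubWord() of the input.
 */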

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const size_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;              /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;    /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);     /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
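
/*
 * Explanatory note (not upstream text): worked numbers for the FIPS-197
 * relations referenced above.
 *
 *     key bits   Nk   Nr = Nk+6   round-key words Nb*(Nr+1)   bytes
 *     128         4   10          44                          176
 *     192         6   12          52                          208
 *     256         8   14          60                          240
 */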

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which
 * are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */

#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;

}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible-type error without the cast: GCC treats
 * poly64_t and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, and Clang accepts
 * the call with or without it (it treats poly64_t and poly64x1_t as the same
 * type), so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{

    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128-bit polynomial multiplication of blocks in GF(2^128), defined
 * by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64-bit -> 128-bit polynomial multipliers, so we need four
 * 64-bit multiplies to multiply two 128-bit values.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs 256 bits,
 * represented by three 128-bit vectors to keep the code size down.
 *
 * Output layout:
 * | field      | 64-bit words | meaning     |
 * |------------|--------------|-------------|
 * | ret.val[0] | h3:h2:00:00  | high   128b |
 * | ret.val[1] |   :m2:m1:00  | middle 128b |
 * | ret.val[2] |   :  :l1:l0  | low    128b |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
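
/*
 * Explanatory note (not upstream text): writing a = a1*x^64 + a0 and
 * b = b1*x^64 + b0 over GF(2)[x], the product is
 *
 *     a*b = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + (a0*b0)
 *
 * which is exactly h (high), m (middle) and l (low) above. The vextq_u8()
 * swap of b's halves lets both cross terms be computed with the same
 * pmull_high()/pmull_low() helpers.
 */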

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach is to
 * consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the
 * 256-bit operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add
 * it to l(z). If the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. This is for GNU C compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                       */
    m = input.val[1];          /*   :m2:m1:00                       */
    l = input.val[2];          /*   :  :l1:l0                       */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2     */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00               */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0       */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00       */
}
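
/*
 * Explanatory note (not upstream text): MODULO holds 0x87 in the low byte of
 * each 64-bit lane, i.e. the bit pattern of r(z) = z^7 + z^2 + z + 1, so the
 * pmull_* calls above implement the substitution z^128 = r(z) described in
 * the comment. The reduction runs in two passes: h3:h2 is folded down first
 * (c and d), then the remaining overflow word e2 is folded once more (f).
 */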

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
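
/*
 * Explanatory note (not upstream text): the vrbitq_u8() calls are needed
 * because GCM stores GF(2^128) elements with the lowest-order polynomial
 * coefficient in the most significant bit of the first byte. Reversing the
 * bits of each byte lets the carry-less multiply and reduction above work in
 * conventional bit order, and the result is reversed back before storing.
 */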

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_AESCE_HAVE_CODE */

#endif /* MBEDTLS_AESCE_C */