GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/mbedtls/library/aesce.c
/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */
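
/*
 * Explanatory note (not part of the upstream source): the same intrinsic
 * declarations become visible without the defines above if the translation
 * unit is built with an explicit architecture flag, for example
 *
 *     clang -c aesce.c -march=armv8-a+crypto
 *
 * in which case __ARM_FEATURE_CRYPTO / __ARM_FEATURE_AES are predefined by
 * the compiler. MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG records
 * that no such flag was given, so the target("aes") attribute/pragma block
 * further down is used instead.
 */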

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_AESCE_HAVE_CODE)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies that,
 * please update this check and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies that, please update this check and the documentation
 * of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
#if !defined(HWCAP_NEON)
#define HWCAP_NEON (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be subject to non-atomic updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
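
/*
 * Illustrative sketch (not part of the upstream source): the kind of runtime
 * dispatch the detection above enables. The software fallback helper name
 * below is hypothetical.
 */
#if 0
static int crypt_one_block_example(mbedtls_aes_context *ctx,
                                   const unsigned char in[16],
                                   unsigned char out[16])
{
    if (mbedtls_aesce_has_support_impl()) {
        /* CPU advertises the AES instructions: take the hardware path. */
        return mbedtls_aesce_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, in, out);
    }
    /* Otherwise fall back to a plain C implementation (hypothetical helper). */
    return software_aes_encrypt_block_example(ctx, in, out);
}
#endif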

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
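
/*
 * For reference (an illustrative sketch, not part of the upstream source):
 * the unrolled goto ladder above is equivalent to the rolled loop below,
 * i.e. (rounds - 1) full AESE+AESMC rounds followed by a final round with
 * no MixColumns. The unrolled form avoids per-round loop overhead.
 */
#if 0
static uint8x16_t aesce_encrypt_block_rolled(uint8x16_t block,
                                             unsigned char *keys,
                                             int rounds)
{
    for (int i = 0; i < rounds - 1; i++) {
        block = vaesmcq_u8(vaeseq_u8(block, vld1q_u8(keys)));
        keys += 16;
    }
    /* AddRoundKey of the last full round + SubBytes/ShiftRows of the final round */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;
    /* Final AddRoundKey */
    return veorq_u8(block, vld1q_u8(keys));
}
#endif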

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in a single instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately; this has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
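
/*
 * Spelling out the linearity argument above (explanatory note, not upstream
 * text): for any state s and round key rk,
 *
 *     InvMixColumns(s ^ rk) == InvMixColumns(s) ^ InvMixColumns(rk)
 *
 * so the data path may apply InvMixColumns before the round-key XOR,
 * provided the stored decryption round keys have had InvMixColumns applied
 * to them once up front, which is what mbedtls_aesce_inverse_key() below
 * does with vaesimcq_u8().
 */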

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    } else
#else
    (void) mode;
#endif
    {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));

}
#endif

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}
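
/*
 * Explanatory note (not upstream text): quick sanity values for the helpers
 * above are aes_rot_word(0x000000ffu) == 0xff000000u and
 * aes_sub_word(0x00000000u) == 0x63636363u, since the AES S-box maps 0x00 to
 * 0x63. Duplicating the input word across all four lanes means every column
 * holds the same bytes, so after SubBytes and ShiftRows lane 0 is exactly
 * SubWord() of the input.
 */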

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const size_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;              /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;    /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);     /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
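
/*
 * Explanatory note (not upstream text): worked numbers for the FIPS-197
 * relations referenced above.
 *
 *     key bits   Nk   Nr = Nk+6   round-key words Nb*(Nr+1)   bytes
 *     128         4   10          44                          176
 *     192         6   12          52                          208
 *     256         8   14          60                          240
 */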

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which
 * are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */

#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;

}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible-type error without the cast: GCC treats
 * poly64_t and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, and Clang accepts
 * the call with or without it (it treats poly64_t and poly64x1_t as the same
 * type), so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{

    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128-bit polynomial multiplication of blocks in GF(2^128), defined
 * by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64-bit -> 128-bit polynomial multipliers, so we need four
 * 64-bit multiplies to multiply two 128-bit values.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs 256 bits,
 * represented by three 128-bit vectors to keep the code size down.
 *
 * Output layout:
 * | field      | 64-bit words | meaning     |
 * |------------|--------------|-------------|
 * | ret.val[0] | h3:h2:00:00  | high   128b |
 * | ret.val[1] |   :m2:m1:00  | middle 128b |
 * | ret.val[2] |   :  :l1:l0  | low    128b |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
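
/*
 * Explanatory note (not upstream text): writing a = a1*x^64 + a0 and
 * b = b1*x^64 + b0 over GF(2)[x], the product is
 *
 *     a*b = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + (a0*b0)
 *
 * which is exactly h (high), m (middle) and l (low) above. The vextq_u8()
 * swap of b's halves lets both cross terms be computed with the same
 * pmull_high()/pmull_low() helpers.
 */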

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach is to
 * consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the
 * 256-bit operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add
 * it to l(z). If the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. This is for GNU C compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                       */
    m = input.val[1];          /*   :m2:m1:00                       */
    l = input.val[2];          /*   :  :l1:l0                       */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2     */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00               */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0       */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00       */
}
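
/*
 * Explanatory note (not upstream text): MODULO holds 0x87 in the low byte of
 * each 64-bit lane, i.e. the bit pattern of r(z) = z^7 + z^2 + z + 1, so the
 * pmull_* calls above implement the substitution z^128 = r(z) described in
 * the comment. The reduction runs in two passes: h3:h2 is folded down first
 * (c and d), then the remaining overflow word e2 is folded once more (f).
 */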

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
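
/*
 * Explanatory note (not upstream text): the vrbitq_u8() calls are needed
 * because GCM stores GF(2^128) elements with the lowest-order polynomial
 * coefficient in the most significant bit of the first byte. Reversing the
 * bits of each byte lets the carry-less multiply and reduction above work in
 * conventional bit order, and the result is reversed back before storing.
 */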

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_AESCE_HAVE_CODE */

#endif /* MBEDTLS_AESCE_C */