/* Path: blob/main/contrib/bearssl/src/symcipher/aes_x86ni_ctr.c */
/* 39482 views */
/*1* Copyright (c) 2017 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#define BR_ENABLE_INTRINSICS 125#include "inner.h"2627#if BR_AES_X86NI2829/* see bearssl_block.h */30const br_block_ctr_class *31br_aes_x86ni_ctr_get_vtable(void)32{33return br_aes_x86ni_supported() ? 
&br_aes_x86ni_ctr_vtable : NULL;34}3536/* see bearssl_block.h */37void38br_aes_x86ni_ctr_init(br_aes_x86ni_ctr_keys *ctx,39const void *key, size_t len)40{41ctx->vtable = &br_aes_x86ni_ctr_vtable;42ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);43}4445BR_TARGETS_X86_UP4647/* see bearssl_block.h */48BR_TARGET("sse2,sse4.1,aes")49uint32_t50br_aes_x86ni_ctr_run(const br_aes_x86ni_ctr_keys *ctx,51const void *iv, uint32_t cc, void *data, size_t len)52{53unsigned char *buf;54unsigned char ivbuf[16];55unsigned num_rounds;56__m128i sk[15];57__m128i ivx;58unsigned u;5960buf = data;61memcpy(ivbuf, iv, 12);62num_rounds = ctx->num_rounds;63for (u = 0; u <= num_rounds; u ++) {64sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));65}66ivx = _mm_loadu_si128((void *)ivbuf);67while (len > 0) {68__m128i x0, x1, x2, x3;6970x0 = _mm_insert_epi32(ivx, br_bswap32(cc + 0), 3);71x1 = _mm_insert_epi32(ivx, br_bswap32(cc + 1), 3);72x2 = _mm_insert_epi32(ivx, br_bswap32(cc + 2), 3);73x3 = _mm_insert_epi32(ivx, br_bswap32(cc + 3), 3);74x0 = _mm_xor_si128(x0, sk[0]);75x1 = _mm_xor_si128(x1, sk[0]);76x2 = _mm_xor_si128(x2, sk[0]);77x3 = _mm_xor_si128(x3, sk[0]);78x0 = _mm_aesenc_si128(x0, sk[1]);79x1 = _mm_aesenc_si128(x1, sk[1]);80x2 = _mm_aesenc_si128(x2, sk[1]);81x3 = _mm_aesenc_si128(x3, sk[1]);82x0 = _mm_aesenc_si128(x0, sk[2]);83x1 = _mm_aesenc_si128(x1, sk[2]);84x2 = _mm_aesenc_si128(x2, sk[2]);85x3 = _mm_aesenc_si128(x3, sk[2]);86x0 = _mm_aesenc_si128(x0, sk[3]);87x1 = _mm_aesenc_si128(x1, sk[3]);88x2 = _mm_aesenc_si128(x2, sk[3]);89x3 = _mm_aesenc_si128(x3, sk[3]);90x0 = _mm_aesenc_si128(x0, sk[4]);91x1 = _mm_aesenc_si128(x1, sk[4]);92x2 = _mm_aesenc_si128(x2, sk[4]);93x3 = _mm_aesenc_si128(x3, sk[4]);94x0 = _mm_aesenc_si128(x0, sk[5]);95x1 = _mm_aesenc_si128(x1, sk[5]);96x2 = _mm_aesenc_si128(x2, sk[5]);97x3 = _mm_aesenc_si128(x3, sk[5]);98x0 = _mm_aesenc_si128(x0, sk[6]);99x1 = _mm_aesenc_si128(x1, sk[6]);100x2 = _mm_aesenc_si128(x2, sk[6]);101x3 = 
_mm_aesenc_si128(x3, sk[6]);102x0 = _mm_aesenc_si128(x0, sk[7]);103x1 = _mm_aesenc_si128(x1, sk[7]);104x2 = _mm_aesenc_si128(x2, sk[7]);105x3 = _mm_aesenc_si128(x3, sk[7]);106x0 = _mm_aesenc_si128(x0, sk[8]);107x1 = _mm_aesenc_si128(x1, sk[8]);108x2 = _mm_aesenc_si128(x2, sk[8]);109x3 = _mm_aesenc_si128(x3, sk[8]);110x0 = _mm_aesenc_si128(x0, sk[9]);111x1 = _mm_aesenc_si128(x1, sk[9]);112x2 = _mm_aesenc_si128(x2, sk[9]);113x3 = _mm_aesenc_si128(x3, sk[9]);114if (num_rounds == 10) {115x0 = _mm_aesenclast_si128(x0, sk[10]);116x1 = _mm_aesenclast_si128(x1, sk[10]);117x2 = _mm_aesenclast_si128(x2, sk[10]);118x3 = _mm_aesenclast_si128(x3, sk[10]);119} else if (num_rounds == 12) {120x0 = _mm_aesenc_si128(x0, sk[10]);121x1 = _mm_aesenc_si128(x1, sk[10]);122x2 = _mm_aesenc_si128(x2, sk[10]);123x3 = _mm_aesenc_si128(x3, sk[10]);124x0 = _mm_aesenc_si128(x0, sk[11]);125x1 = _mm_aesenc_si128(x1, sk[11]);126x2 = _mm_aesenc_si128(x2, sk[11]);127x3 = _mm_aesenc_si128(x3, sk[11]);128x0 = _mm_aesenclast_si128(x0, sk[12]);129x1 = _mm_aesenclast_si128(x1, sk[12]);130x2 = _mm_aesenclast_si128(x2, sk[12]);131x3 = _mm_aesenclast_si128(x3, sk[12]);132} else {133x0 = _mm_aesenc_si128(x0, sk[10]);134x1 = _mm_aesenc_si128(x1, sk[10]);135x2 = _mm_aesenc_si128(x2, sk[10]);136x3 = _mm_aesenc_si128(x3, sk[10]);137x0 = _mm_aesenc_si128(x0, sk[11]);138x1 = _mm_aesenc_si128(x1, sk[11]);139x2 = _mm_aesenc_si128(x2, sk[11]);140x3 = _mm_aesenc_si128(x3, sk[11]);141x0 = _mm_aesenc_si128(x0, sk[12]);142x1 = _mm_aesenc_si128(x1, sk[12]);143x2 = _mm_aesenc_si128(x2, sk[12]);144x3 = _mm_aesenc_si128(x3, sk[12]);145x0 = _mm_aesenc_si128(x0, sk[13]);146x1 = _mm_aesenc_si128(x1, sk[13]);147x2 = _mm_aesenc_si128(x2, sk[13]);148x3 = _mm_aesenc_si128(x3, sk[13]);149x0 = _mm_aesenclast_si128(x0, sk[14]);150x1 = _mm_aesenclast_si128(x1, sk[14]);151x2 = _mm_aesenclast_si128(x2, sk[14]);152x3 = _mm_aesenclast_si128(x3, sk[14]);153}154if (len >= 64) {155x0 = _mm_xor_si128(x0,156_mm_loadu_si128((void *)(buf + 
0)));157x1 = _mm_xor_si128(x1,158_mm_loadu_si128((void *)(buf + 16)));159x2 = _mm_xor_si128(x2,160_mm_loadu_si128((void *)(buf + 32)));161x3 = _mm_xor_si128(x3,162_mm_loadu_si128((void *)(buf + 48)));163_mm_storeu_si128((void *)(buf + 0), x0);164_mm_storeu_si128((void *)(buf + 16), x1);165_mm_storeu_si128((void *)(buf + 32), x2);166_mm_storeu_si128((void *)(buf + 48), x3);167buf += 64;168len -= 64;169cc += 4;170} else {171unsigned char tmp[64];172173_mm_storeu_si128((void *)(tmp + 0), x0);174_mm_storeu_si128((void *)(tmp + 16), x1);175_mm_storeu_si128((void *)(tmp + 32), x2);176_mm_storeu_si128((void *)(tmp + 48), x3);177for (u = 0; u < len; u ++) {178buf[u] ^= tmp[u];179}180cc += (uint32_t)len >> 4;181break;182}183}184return cc;185}186187BR_TARGETS_X86_DOWN188189/* see bearssl_block.h */190const br_block_ctr_class br_aes_x86ni_ctr_vtable = {191sizeof(br_aes_x86ni_ctr_keys),19216,1934,194(void (*)(const br_block_ctr_class **, const void *, size_t))195&br_aes_x86ni_ctr_init,196(uint32_t (*)(const br_block_ctr_class *const *,197const void *, uint32_t, void *, size_t))198&br_aes_x86ni_ctr_run199};200201#else202203/* see bearssl_block.h */204const br_block_ctr_class *205br_aes_x86ni_ctr_get_vtable(void)206{207return NULL;208}209210#endif211212213