Path: blob/main/contrib/bearssl/src/symcipher/aes_x86ni_cbcdec.c
39482 views
/*1* Copyright (c) 2017 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#define BR_ENABLE_INTRINSICS 125#include "inner.h"2627#if BR_AES_X86NI2829/* see bearssl_block.h */30const br_block_cbcdec_class *31br_aes_x86ni_cbcdec_get_vtable(void)32{33return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcdec_vtable : NULL;34}3536/* see bearssl_block.h */37void38br_aes_x86ni_cbcdec_init(br_aes_x86ni_cbcdec_keys *ctx,39const void *key, size_t len)40{41ctx->vtable = &br_aes_x86ni_cbcdec_vtable;42ctx->num_rounds = br_aes_x86ni_keysched_dec(ctx->skey.skni, key, len);43}4445BR_TARGETS_X86_UP4647/* see bearssl_block.h */48BR_TARGET("sse2,aes")49void50br_aes_x86ni_cbcdec_run(const br_aes_x86ni_cbcdec_keys *ctx,51void *iv, void *data, size_t len)52{53unsigned char *buf;54unsigned num_rounds;55__m128i sk[15], ivx;56unsigned u;5758buf = data;59ivx = _mm_loadu_si128(iv);60num_rounds = ctx->num_rounds;61for (u = 0; u <= num_rounds; u ++) {62sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));63}64while (len > 0) {65__m128i x0, x1, x2, x3, e0, e1, e2, e3;6667x0 = _mm_loadu_si128((void *)(buf + 0));68if (len >= 64) {69x1 = _mm_loadu_si128((void *)(buf + 16));70x2 = _mm_loadu_si128((void *)(buf + 32));71x3 = _mm_loadu_si128((void *)(buf + 48));72} else {73x0 = _mm_loadu_si128((void *)(buf + 0));74if (len >= 32) {75x1 = _mm_loadu_si128((void *)(buf + 16));76if (len >= 48) {77x2 = _mm_loadu_si128(78(void *)(buf + 32));79x3 = x2;80} else {81x2 = x0;82x3 = x1;83}84} else {85x1 = x0;86x2 = x0;87x3 = x0;88}89}90e0 = x0;91e1 = x1;92e2 = x2;93e3 = x3;94x0 = _mm_xor_si128(x0, sk[0]);95x1 = _mm_xor_si128(x1, sk[0]);96x2 = _mm_xor_si128(x2, sk[0]);97x3 = _mm_xor_si128(x3, sk[0]);98x0 = _mm_aesdec_si128(x0, sk[1]);99x1 = _mm_aesdec_si128(x1, sk[1]);100x2 = _mm_aesdec_si128(x2, sk[1]);101x3 = _mm_aesdec_si128(x3, sk[1]);102x0 = _mm_aesdec_si128(x0, sk[2]);103x1 = _mm_aesdec_si128(x1, sk[2]);104x2 = _mm_aesdec_si128(x2, sk[2]);105x3 = _mm_aesdec_si128(x3, sk[2]);106x0 = _mm_aesdec_si128(x0, sk[3]);107x1 = _mm_aesdec_si128(x1, sk[3]);108x2 = _mm_aesdec_si128(x2, sk[3]);109x3 = _mm_aesdec_si128(x3, sk[3]);110x0 = _mm_aesdec_si128(x0, sk[4]);111x1 = _mm_aesdec_si128(x1, sk[4]);112x2 = _mm_aesdec_si128(x2, sk[4]);113x3 = _mm_aesdec_si128(x3, sk[4]);114x0 = _mm_aesdec_si128(x0, sk[5]);115x1 = _mm_aesdec_si128(x1, sk[5]);116x2 = _mm_aesdec_si128(x2, sk[5]);117x3 = _mm_aesdec_si128(x3, sk[5]);118x0 = _mm_aesdec_si128(x0, sk[6]);119x1 = _mm_aesdec_si128(x1, sk[6]);120x2 = _mm_aesdec_si128(x2, sk[6]);121x3 = _mm_aesdec_si128(x3, sk[6]);122x0 = _mm_aesdec_si128(x0, sk[7]);123x1 = _mm_aesdec_si128(x1, sk[7]);124x2 = _mm_aesdec_si128(x2, sk[7]);125x3 = _mm_aesdec_si128(x3, sk[7]);126x0 = _mm_aesdec_si128(x0, sk[8]);127x1 = _mm_aesdec_si128(x1, sk[8]);128x2 = _mm_aesdec_si128(x2, sk[8]);129x3 = _mm_aesdec_si128(x3, sk[8]);130x0 = _mm_aesdec_si128(x0, sk[9]);131x1 = _mm_aesdec_si128(x1, sk[9]);132x2 = _mm_aesdec_si128(x2, sk[9]);133x3 = _mm_aesdec_si128(x3, sk[9]);134if (num_rounds == 10) {135x0 = _mm_aesdeclast_si128(x0, sk[10]);136x1 = _mm_aesdeclast_si128(x1, sk[10]);137x2 = _mm_aesdeclast_si128(x2, sk[10]);138x3 = _mm_aesdeclast_si128(x3, sk[10]);139} else if (num_rounds == 12) {140x0 = _mm_aesdec_si128(x0, sk[10]);141x1 = _mm_aesdec_si128(x1, sk[10]);142x2 = _mm_aesdec_si128(x2, sk[10]);143x3 = _mm_aesdec_si128(x3, sk[10]);144x0 = _mm_aesdec_si128(x0, sk[11]);145x1 = _mm_aesdec_si128(x1, sk[11]);146x2 = _mm_aesdec_si128(x2, sk[11]);147x3 = _mm_aesdec_si128(x3, sk[11]);148x0 = _mm_aesdeclast_si128(x0, sk[12]);149x1 = _mm_aesdeclast_si128(x1, sk[12]);150x2 = _mm_aesdeclast_si128(x2, sk[12]);151x3 = _mm_aesdeclast_si128(x3, sk[12]);152} else {153x0 = _mm_aesdec_si128(x0, sk[10]);154x1 = _mm_aesdec_si128(x1, sk[10]);155x2 = _mm_aesdec_si128(x2, sk[10]);156x3 = _mm_aesdec_si128(x3, sk[10]);157x0 = _mm_aesdec_si128(x0, sk[11]);158x1 = _mm_aesdec_si128(x1, sk[11]);159x2 = _mm_aesdec_si128(x2, sk[11]);160x3 = _mm_aesdec_si128(x3, sk[11]);161x0 = _mm_aesdec_si128(x0, sk[12]);162x1 = _mm_aesdec_si128(x1, sk[12]);163x2 = _mm_aesdec_si128(x2, sk[12]);164x3 = _mm_aesdec_si128(x3, sk[12]);165x0 = _mm_aesdec_si128(x0, sk[13]);166x1 = _mm_aesdec_si128(x1, sk[13]);167x2 = _mm_aesdec_si128(x2, sk[13]);168x3 = _mm_aesdec_si128(x3, sk[13]);169x0 = _mm_aesdeclast_si128(x0, sk[14]);170x1 = _mm_aesdeclast_si128(x1, sk[14]);171x2 = _mm_aesdeclast_si128(x2, sk[14]);172x3 = _mm_aesdeclast_si128(x3, sk[14]);173}174x0 = _mm_xor_si128(x0, ivx);175x1 = _mm_xor_si128(x1, e0);176x2 = _mm_xor_si128(x2, e1);177x3 = _mm_xor_si128(x3, e2);178ivx = e3;179_mm_storeu_si128((void *)(buf + 0), x0);180if (len >= 64) {181_mm_storeu_si128((void *)(buf + 16), x1);182_mm_storeu_si128((void *)(buf + 32), x2);183_mm_storeu_si128((void *)(buf + 48), x3);184buf += 64;185len -= 64;186} else {187if (len >= 32) {188_mm_storeu_si128((void *)(buf + 16), x1);189if (len >= 48) {190_mm_storeu_si128(191(void *)(buf + 32), x2);192}193}194break;195}196}197_mm_storeu_si128(iv, ivx);198}199200BR_TARGETS_X86_DOWN201202/* see bearssl_block.h */203const br_block_cbcdec_class br_aes_x86ni_cbcdec_vtable = {204sizeof(br_aes_x86ni_cbcdec_keys),20516,2064,207(void (*)(const br_block_cbcdec_class **, const void *, size_t))208&br_aes_x86ni_cbcdec_init,209(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))210&br_aes_x86ni_cbcdec_run211};212213#else214215/* see bearssl_block.h */216const br_block_cbcdec_class *217br_aes_x86ni_cbcdec_get_vtable(void)218{219return NULL;220}221222#endif223224225