// Path: blob/a-new-beginning/SharedDependencies/Sources/cryptopp/chacha_avx.cpp
// (page metadata from the hosting site: 2 views)
// chacha_avx.cpp - written and placed in the public domain by1// Jack Lloyd and Jeffrey Walton2//3// This source file uses intrinsics and built-ins to gain access to4// AVX2 instructions. A separate source file is needed because5// additional CXXFLAGS are required to enable the appropriate6// instructions sets in some build configurations.7//8// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks9// to Jack Lloyd and the Botan team for allowing us to use it.10//11// Here are some relative numbers for ChaCha8:12// * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.13// * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.14// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.1516#include "pch.h"17#include "config.h"1819#include "chacha.h"20#include "misc.h"2122#if defined(CRYPTOPP_AVX2_AVAILABLE)23# include <xmmintrin.h>24# include <emmintrin.h>25# include <immintrin.h>26#endif2728// Squash MS LNK4221 and libtool warnings29extern const char CHACHA_AVX_FNAME[] = __FILE__;3031// Sun Studio 12.4 OK, 12.5 and 12.6 compile error.32#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)33# define MAYBE_CONST34#else35# define MAYBE_CONST const36#endif3738// VS2017 and global optimization bug. Also see39// https://github.com/weidai11/cryptopp/issues/649 and40// https://github.com/weidai11/cryptopp/issues/735. The41// 649 issue affects AES but it is the same here. 
The 73542// issue is ChaCha AVX2 cut-in where it surfaced again.43#if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)44# ifndef CRYPTOPP_DEBUG45# pragma optimize("", off)46# pragma optimize("ts", on)47# endif48#endif4950// The data is aligned, but Clang issues warning based on type51// and not the actual alignment of the variable and data.52#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE53# pragma GCC diagnostic ignored "-Wcast-align"54#endif5556ANONYMOUS_NAMESPACE_BEGIN5758#if (CRYPTOPP_AVX2_AVAILABLE)5960template <unsigned int R>61inline __m256i RotateLeft(const __m256i val)62{63return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));64}6566template <>67inline __m256i RotateLeft<8>(const __m256i val)68{69const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,7014,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);71return _mm256_shuffle_epi8(val, mask);72}7374template <>75inline __m256i RotateLeft<16>(const __m256i val)76{77const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,7813,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);79return _mm256_shuffle_epi8(val, mask);80}8182#endif // CRYPTOPP_AVX2_AVAILABLE8384ANONYMOUS_NAMESPACE_END8586NAMESPACE_BEGIN(CryptoPP)8788#if (CRYPTOPP_AVX2_AVAILABLE)8990void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)91{92const __m256i state0 = _mm256_broadcastsi128_si256(93_mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));94const __m256i state1 = _mm256_broadcastsi128_si256(95_mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));96const __m256i state2 = _mm256_broadcastsi128_si256(97_mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));98const __m256i state3 = _mm256_broadcastsi128_si256(99_mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));100101const word32 C = 0xFFFFFFFFu - state[12];102const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);103const __m256i CTR1 = 
_mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);104const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);105const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);106107__m256i X0_0 = state0;108__m256i X0_1 = state1;109__m256i X0_2 = state2;110__m256i X0_3 = _mm256_add_epi32(state3, CTR0);111112__m256i X1_0 = state0;113__m256i X1_1 = state1;114__m256i X1_2 = state2;115__m256i X1_3 = _mm256_add_epi32(state3, CTR1);116117__m256i X2_0 = state0;118__m256i X2_1 = state1;119__m256i X2_2 = state2;120__m256i X2_3 = _mm256_add_epi32(state3, CTR2);121122__m256i X3_0 = state0;123__m256i X3_1 = state1;124__m256i X3_2 = state2;125__m256i X3_3 = _mm256_add_epi32(state3, CTR3);126127for (int i = static_cast<int>(rounds); i > 0; i -= 2)128{129X0_0 = _mm256_add_epi32(X0_0, X0_1);130X1_0 = _mm256_add_epi32(X1_0, X1_1);131X2_0 = _mm256_add_epi32(X2_0, X2_1);132X3_0 = _mm256_add_epi32(X3_0, X3_1);133134X0_3 = _mm256_xor_si256(X0_3, X0_0);135X1_3 = _mm256_xor_si256(X1_3, X1_0);136X2_3 = _mm256_xor_si256(X2_3, X2_0);137X3_3 = _mm256_xor_si256(X3_3, X3_0);138139X0_3 = RotateLeft<16>(X0_3);140X1_3 = RotateLeft<16>(X1_3);141X2_3 = RotateLeft<16>(X2_3);142X3_3 = RotateLeft<16>(X3_3);143144X0_2 = _mm256_add_epi32(X0_2, X0_3);145X1_2 = _mm256_add_epi32(X1_2, X1_3);146X2_2 = _mm256_add_epi32(X2_2, X2_3);147X3_2 = _mm256_add_epi32(X3_2, X3_3);148149X0_1 = _mm256_xor_si256(X0_1, X0_2);150X1_1 = _mm256_xor_si256(X1_1, X1_2);151X2_1 = _mm256_xor_si256(X2_1, X2_2);152X3_1 = _mm256_xor_si256(X3_1, X3_2);153154X0_1 = RotateLeft<12>(X0_1);155X1_1 = RotateLeft<12>(X1_1);156X2_1 = RotateLeft<12>(X2_1);157X3_1 = RotateLeft<12>(X3_1);158159X0_0 = _mm256_add_epi32(X0_0, X0_1);160X1_0 = _mm256_add_epi32(X1_0, X1_1);161X2_0 = _mm256_add_epi32(X2_0, X2_1);162X3_0 = _mm256_add_epi32(X3_0, X3_1);163164X0_3 = _mm256_xor_si256(X0_3, X0_0);165X1_3 = _mm256_xor_si256(X1_3, X1_0);166X2_3 = _mm256_xor_si256(X2_3, X2_0);167X3_3 = _mm256_xor_si256(X3_3, X3_0);168169X0_3 = 
RotateLeft<8>(X0_3);170X1_3 = RotateLeft<8>(X1_3);171X2_3 = RotateLeft<8>(X2_3);172X3_3 = RotateLeft<8>(X3_3);173174X0_2 = _mm256_add_epi32(X0_2, X0_3);175X1_2 = _mm256_add_epi32(X1_2, X1_3);176X2_2 = _mm256_add_epi32(X2_2, X2_3);177X3_2 = _mm256_add_epi32(X3_2, X3_3);178179X0_1 = _mm256_xor_si256(X0_1, X0_2);180X1_1 = _mm256_xor_si256(X1_1, X1_2);181X2_1 = _mm256_xor_si256(X2_1, X2_2);182X3_1 = _mm256_xor_si256(X3_1, X3_2);183184X0_1 = RotateLeft<7>(X0_1);185X1_1 = RotateLeft<7>(X1_1);186X2_1 = RotateLeft<7>(X2_1);187X3_1 = RotateLeft<7>(X3_1);188189X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));190X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));191X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));192193X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));194X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));195X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));196197X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));198X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));199X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));200201X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));202X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));203X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));204205X0_0 = _mm256_add_epi32(X0_0, X0_1);206X1_0 = _mm256_add_epi32(X1_0, X1_1);207X2_0 = _mm256_add_epi32(X2_0, X2_1);208X3_0 = _mm256_add_epi32(X3_0, X3_1);209210X0_3 = _mm256_xor_si256(X0_3, X0_0);211X1_3 = _mm256_xor_si256(X1_3, X1_0);212X2_3 = _mm256_xor_si256(X2_3, X2_0);213X3_3 = _mm256_xor_si256(X3_3, X3_0);214215X0_3 = RotateLeft<16>(X0_3);216X1_3 = RotateLeft<16>(X1_3);217X2_3 = RotateLeft<16>(X2_3);218X3_3 = RotateLeft<16>(X3_3);219220X0_2 = _mm256_add_epi32(X0_2, X0_3);221X1_2 = _mm256_add_epi32(X1_2, X1_3);222X2_2 = _mm256_add_epi32(X2_2, X2_3);223X3_2 = _mm256_add_epi32(X3_2, X3_3);224225X0_1 = _mm256_xor_si256(X0_1, X0_2);226X1_1 = _mm256_xor_si256(X1_1, X1_2);227X2_1 
= _mm256_xor_si256(X2_1, X2_2);228X3_1 = _mm256_xor_si256(X3_1, X3_2);229230X0_1 = RotateLeft<12>(X0_1);231X1_1 = RotateLeft<12>(X1_1);232X2_1 = RotateLeft<12>(X2_1);233X3_1 = RotateLeft<12>(X3_1);234235X0_0 = _mm256_add_epi32(X0_0, X0_1);236X1_0 = _mm256_add_epi32(X1_0, X1_1);237X2_0 = _mm256_add_epi32(X2_0, X2_1);238X3_0 = _mm256_add_epi32(X3_0, X3_1);239240X0_3 = _mm256_xor_si256(X0_3, X0_0);241X1_3 = _mm256_xor_si256(X1_3, X1_0);242X2_3 = _mm256_xor_si256(X2_3, X2_0);243X3_3 = _mm256_xor_si256(X3_3, X3_0);244245X0_3 = RotateLeft<8>(X0_3);246X1_3 = RotateLeft<8>(X1_3);247X2_3 = RotateLeft<8>(X2_3);248X3_3 = RotateLeft<8>(X3_3);249250X0_2 = _mm256_add_epi32(X0_2, X0_3);251X1_2 = _mm256_add_epi32(X1_2, X1_3);252X2_2 = _mm256_add_epi32(X2_2, X2_3);253X3_2 = _mm256_add_epi32(X3_2, X3_3);254255X0_1 = _mm256_xor_si256(X0_1, X0_2);256X1_1 = _mm256_xor_si256(X1_1, X1_2);257X2_1 = _mm256_xor_si256(X2_1, X2_2);258X3_1 = _mm256_xor_si256(X3_1, X3_2);259260X0_1 = RotateLeft<7>(X0_1);261X1_1 = RotateLeft<7>(X1_1);262X2_1 = RotateLeft<7>(X2_1);263X3_1 = RotateLeft<7>(X3_1);264265X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));266X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));267X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));268269X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));270X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));271X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));272273X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));274X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));275X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));276277X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));278X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));279X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));280}281282X0_0 = _mm256_add_epi32(X0_0, state0);283X0_1 = _mm256_add_epi32(X0_1, state1);284X0_2 = _mm256_add_epi32(X0_2, state2);285X0_3 = _mm256_add_epi32(X0_3, 
state3);286X0_3 = _mm256_add_epi32(X0_3, CTR0);287288X1_0 = _mm256_add_epi32(X1_0, state0);289X1_1 = _mm256_add_epi32(X1_1, state1);290X1_2 = _mm256_add_epi32(X1_2, state2);291X1_3 = _mm256_add_epi32(X1_3, state3);292X1_3 = _mm256_add_epi32(X1_3, CTR1);293294X2_0 = _mm256_add_epi32(X2_0, state0);295X2_1 = _mm256_add_epi32(X2_1, state1);296X2_2 = _mm256_add_epi32(X2_2, state2);297X2_3 = _mm256_add_epi32(X2_3, state3);298X2_3 = _mm256_add_epi32(X2_3, CTR2);299300X3_0 = _mm256_add_epi32(X3_0, state0);301X3_1 = _mm256_add_epi32(X3_1, state1);302X3_2 = _mm256_add_epi32(X3_2, state2);303X3_3 = _mm256_add_epi32(X3_3, state3);304X3_3 = _mm256_add_epi32(X3_3, CTR3);305306if (input)307{308_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),309_mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),310_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));311_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),312_mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),313_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));314_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),315_mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),316_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));317_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),318_mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),319_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));320}321else322{323_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),324_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));325_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),326_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));327_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),328_mm256_permute2x128_si256(X1_0, X1_1, 1 
+ (3 << 4)));329_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),330_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));331}332333if (input)334{335_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),336_mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),337_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));338_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),339_mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),340_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));341_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),342_mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),343_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));344_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),345_mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),346_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));347}348else349{350_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),351_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));352_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),353_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));354_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),355_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));356_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),357_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));358}359360if (input)361{362_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),363_mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),364_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));365_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),366_mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 
4)),367_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));368_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),369_mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),370_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));371_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),372_mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),373_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));374}375else376{377_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),378_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));379_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),380_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));381_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),382_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));383_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),384_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));385}386387if (input)388{389_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),390_mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),391_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));392_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),393_mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),394_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));395_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),396_mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),397_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));398_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),399_mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 
4)),400_mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));401}402else403{404_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),405_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));406_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),407_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));408_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),409_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));410_mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),411_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));412}413414// https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties415_mm256_zeroupper();416}417418#endif // CRYPTOPP_AVX2_AVAILABLE419420NAMESPACE_END421422423