Path: blob/a-new-beginning/SharedDependencies/Sources/cryptopp/chacha_simd.cpp
2 views
// chacha_simd.cpp - written and placed in the public domain by
//                   Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSE2, ARM NEON and ARMv8a, Power7 and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
//
// SSE2 implementation based on Botan's chacha_sse2.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// The SSE2 implementation is kind of unusual among Crypto++ algorithms.
// We guard on CRYPTOPP_SSE2_INTRIN_AVAILABLE and use HasSSE2() at runtime.
// However, if the compiler says a target machine has SSSE3 or XOP available
// (say, by way of -march=native), then we can pull another 150 to 800 MB/s
// out of ChaCha. To capture SSSE3 and XOP we use the compiler defines
// __SSSE3__ and __XOP__ and forgo runtime tests.
//
// Runtime tests for HasSSSE3() and HasXop() are too expensive to make a
// sub-case of SSE2.
The rotates are on a critical path and the runtime tests20// crush performance.21//22// Here are some relative numbers for ChaCha8:23// * Intel Skylake, 3.0 GHz: SSE2 at 2160 MB/s; SSSE3 at 2310 MB/s.24// * AMD Bulldozer, 3.3 GHz: SSE2 at 1680 MB/s; XOP at 2510 MB/s.2526#include "pch.h"27#include "config.h"2829#include "chacha.h"30#include "misc.h"3132// Internal compiler error in GCC 3.3 and below33#if defined(__GNUC__) && (__GNUC__ < 4)34# undef CRYPTOPP_SSE2_INTRIN_AVAILABLE35#endif3637#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)38# include <xmmintrin.h>39# include <emmintrin.h>40#endif4142#if defined(__SSSE3__)43# include <tmmintrin.h>44#endif4546#if defined(__XOP__)47# if defined(CRYPTOPP_GCC_COMPATIBLE)48# include <x86intrin.h>49# endif50# include <ammintrin.h>51#endif // XOP5253#if (CRYPTOPP_ARM_NEON_HEADER)54# include <arm_neon.h>55#endif5657#if (CRYPTOPP_ARM_ACLE_HEADER)58# include <stdint.h>59# include <arm_acle.h>60#endif6162#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)63# include "ppc_simd.h"64#endif6566// Squash MS LNK4221 and libtool warnings67extern const char CHACHA_SIMD_FNAME[] = __FILE__;6869ANONYMOUS_NAMESPACE_BEGIN7071// ***************************** NEON ***************************** //7273#if (CRYPTOPP_ARM_NEON_AVAILABLE)7475template <unsigned int R>76inline uint32x4_t RotateLeft(const uint32x4_t& val)77{78return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R));79}8081template <unsigned int R>82inline uint32x4_t RotateRight(const uint32x4_t& val)83{84return vorrq_u32(vshlq_n_u32(val, 32 - R), vshrq_n_u32(val, R));85}8687template <>88inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)89{90#if defined(__aarch32__) || defined(__aarch64__)91const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };92const uint8x16_t mask = vld1q_u8(maskb);9394return vreinterpretq_u32_u8(95vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));96#else97// fallback to slower C++ rotation.98return vorrq_u32(vshlq_n_u32(val, 8),99vshrq_n_u32(val, 32 - 
8));100#endif101}102103template <>104inline uint32x4_t RotateLeft<16>(const uint32x4_t& val)105{106#if defined(__aarch32__) || defined(__aarch64__)107return vreinterpretq_u32_u16(108vrev32q_u16(vreinterpretq_u16_u32(val)));109#else110// fallback to slower C++ rotation.111return vorrq_u32(vshlq_n_u32(val, 16),112vshrq_n_u32(val, 32 - 16));113#endif114}115116template <>117inline uint32x4_t RotateRight<8>(const uint32x4_t& val)118{119#if defined(__aarch32__) || defined(__aarch64__)120const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };121const uint8x16_t mask = vld1q_u8(maskb);122123return vreinterpretq_u32_u8(124vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));125#else126// fallback to slower C++ rotation.127return vorrq_u32(vshrq_n_u32(val, 8),128vshlq_n_u32(val, 32 - 8));129#endif130}131132template <>133inline uint32x4_t RotateRight<16>(const uint32x4_t& val)134{135#if defined(__aarch32__) || defined(__aarch64__)136return vreinterpretq_u32_u16(137vrev32q_u16(vreinterpretq_u16_u32(val)));138#else139// fallback to slower C++ rotation.140return vorrq_u32(vshrq_n_u32(val, 16),141vshlq_n_u32(val, 32 - 16));142#endif143}144145// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte146// rotation on the 128-bit vector word:147// * [3,2,1,0] => [0,3,2,1] is Extract<1>(x)148// * [3,2,1,0] => [1,0,3,2] is Extract<2>(x)149// * [3,2,1,0] => [2,1,0,3] is Extract<3>(x)150template <unsigned int S>151inline uint32x4_t Extract(const uint32x4_t& val)152{153return vextq_u32(val, val, S);154}155156// Helper to perform 64-bit addition across two elements of 32-bit vectors157inline uint32x4_t Add64(const uint32x4_t& a, const uint32x4_t& b)158{159return vreinterpretq_u32_u64(160vaddq_u64(161vreinterpretq_u64_u32(a),162vreinterpretq_u64_u32(b)));163}164165#endif // CRYPTOPP_ARM_NEON_AVAILABLE166167// ***************************** SSE2 ***************************** //168169#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)170171template <unsigned int R>172inline __m128i 
RotateLeft(const __m128i val)173{174#ifdef __XOP__175return _mm_roti_epi32(val, R);176#else177return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));178#endif179}180181template <>182inline __m128i RotateLeft<8>(const __m128i val)183{184#if defined(__XOP__)185return _mm_roti_epi32(val, 8);186#elif defined(__SSSE3__)187const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);188return _mm_shuffle_epi8(val, mask);189#else190return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 32-8));191#endif192}193194template <>195inline __m128i RotateLeft<16>(const __m128i val)196{197#if defined(__XOP__)198return _mm_roti_epi32(val, 16);199#elif defined(__SSSE3__)200const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);201return _mm_shuffle_epi8(val, mask);202#else203return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 32-16));204#endif205}206207#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE208209// **************************** Altivec **************************** //210211#if (CRYPTOPP_ALTIVEC_AVAILABLE)212213// ChaCha_OperateKeystream is optimized for Altivec. However, Altivec214// is supported by using vec_ld and vec_st, and using a composite VecAdd215// that supports 64-bit element adds. vec_ld and vec_st add significant216// overhead when memory is not aligned. Despite the drawbacks Altivec217// is profitable. The numbers for ChaCha8 are:218//219// PowerMac, C++, 2.0 GHz: 205 MB/s, 9.29 cpb220// PowerMac, Altivec, 2.0 GHz: 471 MB/s, 4.09 cpb221222using CryptoPP::uint8x16_p;223using CryptoPP::uint32x4_p;224using CryptoPP::VecLoad;225using CryptoPP::VecLoadAligned;226using CryptoPP::VecStore;227using CryptoPP::VecPermute;228229// Permutes bytes in packed 32-bit words to little endian.230// State is already in proper endian order. 
Input and231// output must be permuted during load and save.232inline uint32x4_p VecLoad32LE(const uint8_t src[16])233{234#if (CRYPTOPP_BIG_ENDIAN)235const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};236const uint32x4_p val = VecLoad(src);237return VecPermute(val, val, mask);238#else239return VecLoad(src);240#endif241}242243// Permutes bytes in packed 32-bit words to little endian.244// State is already in proper endian order. Input and245// output must be permuted during load and save.246inline void VecStore32LE(uint8_t dest[16], const uint32x4_p& val)247{248#if (CRYPTOPP_BIG_ENDIAN)249const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};250VecStore(VecPermute(val, val, mask), dest);251#else252return VecStore(val, dest);253#endif254}255256// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte257// rotation on the 128-bit vector word:258// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x)259// * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x)260// * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x)261template <unsigned int S>262inline uint32x4_p Shuffle(const uint32x4_p& val)263{264CRYPTOPP_ASSERT(0);265return val;266}267268template <>269inline uint32x4_p Shuffle<1>(const uint32x4_p& val)270{271const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};272return VecPermute(val, val, mask);273}274275template <>276inline uint32x4_p Shuffle<2>(const uint32x4_p& val)277{278const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};279return VecPermute(val, val, mask);280}281282template <>283inline uint32x4_p Shuffle<3>(const uint32x4_p& val)284{285const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};286return VecPermute(val, val, mask);287}288289#endif // CRYPTOPP_ALTIVEC_AVAILABLE290291ANONYMOUS_NAMESPACE_END292293NAMESPACE_BEGIN(CryptoPP)294295// ***************************** NEON ***************************** //296297#if (CRYPTOPP_ARM_NEON_AVAILABLE)298299void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* 
input, byte *output, unsigned int rounds)300{301const uint32x4_t state0 = vld1q_u32(state + 0*4);302const uint32x4_t state1 = vld1q_u32(state + 1*4);303const uint32x4_t state2 = vld1q_u32(state + 2*4);304const uint32x4_t state3 = vld1q_u32(state + 3*4);305306const unsigned int w[] = {1,0,0,0, 2,0,0,0, 3,0,0,0};307const uint32x4_t CTRS[3] = {308vld1q_u32(w+0), vld1q_u32(w+4), vld1q_u32(w+8)309};310311uint32x4_t r0_0 = state0;312uint32x4_t r0_1 = state1;313uint32x4_t r0_2 = state2;314uint32x4_t r0_3 = state3;315316uint32x4_t r1_0 = state0;317uint32x4_t r1_1 = state1;318uint32x4_t r1_2 = state2;319uint32x4_t r1_3 = Add64(r0_3, CTRS[0]);320321uint32x4_t r2_0 = state0;322uint32x4_t r2_1 = state1;323uint32x4_t r2_2 = state2;324uint32x4_t r2_3 = Add64(r0_3, CTRS[1]);325326uint32x4_t r3_0 = state0;327uint32x4_t r3_1 = state1;328uint32x4_t r3_2 = state2;329uint32x4_t r3_3 = Add64(r0_3, CTRS[2]);330331for (int i = static_cast<int>(rounds); i > 0; i -= 2)332{333r0_0 = vaddq_u32(r0_0, r0_1);334r1_0 = vaddq_u32(r1_0, r1_1);335r2_0 = vaddq_u32(r2_0, r2_1);336r3_0 = vaddq_u32(r3_0, r3_1);337338r0_3 = veorq_u32(r0_3, r0_0);339r1_3 = veorq_u32(r1_3, r1_0);340r2_3 = veorq_u32(r2_3, r2_0);341r3_3 = veorq_u32(r3_3, r3_0);342343r0_3 = RotateLeft<16>(r0_3);344r1_3 = RotateLeft<16>(r1_3);345r2_3 = RotateLeft<16>(r2_3);346r3_3 = RotateLeft<16>(r3_3);347348r0_2 = vaddq_u32(r0_2, r0_3);349r1_2 = vaddq_u32(r1_2, r1_3);350r2_2 = vaddq_u32(r2_2, r2_3);351r3_2 = vaddq_u32(r3_2, r3_3);352353r0_1 = veorq_u32(r0_1, r0_2);354r1_1 = veorq_u32(r1_1, r1_2);355r2_1 = veorq_u32(r2_1, r2_2);356r3_1 = veorq_u32(r3_1, r3_2);357358r0_1 = RotateLeft<12>(r0_1);359r1_1 = RotateLeft<12>(r1_1);360r2_1 = RotateLeft<12>(r2_1);361r3_1 = RotateLeft<12>(r3_1);362363r0_0 = vaddq_u32(r0_0, r0_1);364r1_0 = vaddq_u32(r1_0, r1_1);365r2_0 = vaddq_u32(r2_0, r2_1);366r3_0 = vaddq_u32(r3_0, r3_1);367368r0_3 = veorq_u32(r0_3, r0_0);369r1_3 = veorq_u32(r1_3, r1_0);370r2_3 = veorq_u32(r2_3, r2_0);371r3_3 = veorq_u32(r3_3, 
r3_0);372373r0_3 = RotateLeft<8>(r0_3);374r1_3 = RotateLeft<8>(r1_3);375r2_3 = RotateLeft<8>(r2_3);376r3_3 = RotateLeft<8>(r3_3);377378r0_2 = vaddq_u32(r0_2, r0_3);379r1_2 = vaddq_u32(r1_2, r1_3);380r2_2 = vaddq_u32(r2_2, r2_3);381r3_2 = vaddq_u32(r3_2, r3_3);382383r0_1 = veorq_u32(r0_1, r0_2);384r1_1 = veorq_u32(r1_1, r1_2);385r2_1 = veorq_u32(r2_1, r2_2);386r3_1 = veorq_u32(r3_1, r3_2);387388r0_1 = RotateLeft<7>(r0_1);389r1_1 = RotateLeft<7>(r1_1);390r2_1 = RotateLeft<7>(r2_1);391r3_1 = RotateLeft<7>(r3_1);392393r0_1 = Extract<1>(r0_1);394r0_2 = Extract<2>(r0_2);395r0_3 = Extract<3>(r0_3);396397r1_1 = Extract<1>(r1_1);398r1_2 = Extract<2>(r1_2);399r1_3 = Extract<3>(r1_3);400401r2_1 = Extract<1>(r2_1);402r2_2 = Extract<2>(r2_2);403r2_3 = Extract<3>(r2_3);404405r3_1 = Extract<1>(r3_1);406r3_2 = Extract<2>(r3_2);407r3_3 = Extract<3>(r3_3);408409r0_0 = vaddq_u32(r0_0, r0_1);410r1_0 = vaddq_u32(r1_0, r1_1);411r2_0 = vaddq_u32(r2_0, r2_1);412r3_0 = vaddq_u32(r3_0, r3_1);413414r0_3 = veorq_u32(r0_3, r0_0);415r1_3 = veorq_u32(r1_3, r1_0);416r2_3 = veorq_u32(r2_3, r2_0);417r3_3 = veorq_u32(r3_3, r3_0);418419r0_3 = RotateLeft<16>(r0_3);420r1_3 = RotateLeft<16>(r1_3);421r2_3 = RotateLeft<16>(r2_3);422r3_3 = RotateLeft<16>(r3_3);423424r0_2 = vaddq_u32(r0_2, r0_3);425r1_2 = vaddq_u32(r1_2, r1_3);426r2_2 = vaddq_u32(r2_2, r2_3);427r3_2 = vaddq_u32(r3_2, r3_3);428429r0_1 = veorq_u32(r0_1, r0_2);430r1_1 = veorq_u32(r1_1, r1_2);431r2_1 = veorq_u32(r2_1, r2_2);432r3_1 = veorq_u32(r3_1, r3_2);433434r0_1 = RotateLeft<12>(r0_1);435r1_1 = RotateLeft<12>(r1_1);436r2_1 = RotateLeft<12>(r2_1);437r3_1 = RotateLeft<12>(r3_1);438439r0_0 = vaddq_u32(r0_0, r0_1);440r1_0 = vaddq_u32(r1_0, r1_1);441r2_0 = vaddq_u32(r2_0, r2_1);442r3_0 = vaddq_u32(r3_0, r3_1);443444r0_3 = veorq_u32(r0_3, r0_0);445r1_3 = veorq_u32(r1_3, r1_0);446r2_3 = veorq_u32(r2_3, r2_0);447r3_3 = veorq_u32(r3_3, r3_0);448449r0_3 = RotateLeft<8>(r0_3);450r1_3 = RotateLeft<8>(r1_3);451r2_3 = RotateLeft<8>(r2_3);452r3_3 = 
RotateLeft<8>(r3_3);453454r0_2 = vaddq_u32(r0_2, r0_3);455r1_2 = vaddq_u32(r1_2, r1_3);456r2_2 = vaddq_u32(r2_2, r2_3);457r3_2 = vaddq_u32(r3_2, r3_3);458459r0_1 = veorq_u32(r0_1, r0_2);460r1_1 = veorq_u32(r1_1, r1_2);461r2_1 = veorq_u32(r2_1, r2_2);462r3_1 = veorq_u32(r3_1, r3_2);463464r0_1 = RotateLeft<7>(r0_1);465r1_1 = RotateLeft<7>(r1_1);466r2_1 = RotateLeft<7>(r2_1);467r3_1 = RotateLeft<7>(r3_1);468469r0_1 = Extract<3>(r0_1);470r0_2 = Extract<2>(r0_2);471r0_3 = Extract<1>(r0_3);472473r1_1 = Extract<3>(r1_1);474r1_2 = Extract<2>(r1_2);475r1_3 = Extract<1>(r1_3);476477r2_1 = Extract<3>(r2_1);478r2_2 = Extract<2>(r2_2);479r2_3 = Extract<1>(r2_3);480481r3_1 = Extract<3>(r3_1);482r3_2 = Extract<2>(r3_2);483r3_3 = Extract<1>(r3_3);484}485486r0_0 = vaddq_u32(r0_0, state0);487r0_1 = vaddq_u32(r0_1, state1);488r0_2 = vaddq_u32(r0_2, state2);489r0_3 = vaddq_u32(r0_3, state3);490491r1_0 = vaddq_u32(r1_0, state0);492r1_1 = vaddq_u32(r1_1, state1);493r1_2 = vaddq_u32(r1_2, state2);494r1_3 = vaddq_u32(r1_3, state3);495r1_3 = Add64(r1_3, CTRS[0]);496497r2_0 = vaddq_u32(r2_0, state0);498r2_1 = vaddq_u32(r2_1, state1);499r2_2 = vaddq_u32(r2_2, state2);500r2_3 = vaddq_u32(r2_3, state3);501r2_3 = Add64(r2_3, CTRS[1]);502503r3_0 = vaddq_u32(r3_0, state0);504r3_1 = vaddq_u32(r3_1, state1);505r3_2 = vaddq_u32(r3_2, state2);506r3_3 = vaddq_u32(r3_3, state3);507r3_3 = Add64(r3_3, CTRS[2]);508509if (input)510{511r0_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 0*16)), r0_0);512r0_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 1*16)), r0_1);513r0_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 2*16)), r0_2);514r0_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 3*16)), r0_3);515}516517vst1q_u8(output + 0*16, vreinterpretq_u8_u32(r0_0));518vst1q_u8(output + 1*16, vreinterpretq_u8_u32(r0_1));519vst1q_u8(output + 2*16, vreinterpretq_u8_u32(r0_2));520vst1q_u8(output + 3*16, vreinterpretq_u8_u32(r0_3));521522if (input)523{524r1_0 = 
veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 4*16)), r1_0);525r1_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 5*16)), r1_1);526r1_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 6*16)), r1_2);527r1_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 7*16)), r1_3);528}529530vst1q_u8(output + 4*16, vreinterpretq_u8_u32(r1_0));531vst1q_u8(output + 5*16, vreinterpretq_u8_u32(r1_1));532vst1q_u8(output + 6*16, vreinterpretq_u8_u32(r1_2));533vst1q_u8(output + 7*16, vreinterpretq_u8_u32(r1_3));534535if (input)536{537r2_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 8*16)), r2_0);538r2_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 9*16)), r2_1);539r2_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 10*16)), r2_2);540r2_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 11*16)), r2_3);541}542543vst1q_u8(output + 8*16, vreinterpretq_u8_u32(r2_0));544vst1q_u8(output + 9*16, vreinterpretq_u8_u32(r2_1));545vst1q_u8(output + 10*16, vreinterpretq_u8_u32(r2_2));546vst1q_u8(output + 11*16, vreinterpretq_u8_u32(r2_3));547548if (input)549{550r3_0 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 12*16)), r3_0);551r3_1 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 13*16)), r3_1);552r3_2 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 14*16)), r3_2);553r3_3 = veorq_u32(vreinterpretq_u32_u8(vld1q_u8(input + 15*16)), r3_3);554}555556vst1q_u8(output + 12*16, vreinterpretq_u8_u32(r3_0));557vst1q_u8(output + 13*16, vreinterpretq_u8_u32(r3_1));558vst1q_u8(output + 14*16, vreinterpretq_u8_u32(r3_2));559vst1q_u8(output + 15*16, vreinterpretq_u8_u32(r3_3));560}561562#endif // CRYPTOPP_ARM_NEON_AVAILABLE563564// ***************************** SSE2 ***************************** //565566#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)567568void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds)569{570const __m128i state0 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+0*4));571const __m128i state1 = 
_mm_load_si128(reinterpret_cast<const __m128i*>(state+1*4));572const __m128i state2 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+2*4));573const __m128i state3 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+3*4));574575__m128i r0_0 = state0;576__m128i r0_1 = state1;577__m128i r0_2 = state2;578__m128i r0_3 = state3;579580__m128i r1_0 = state0;581__m128i r1_1 = state1;582__m128i r1_2 = state2;583__m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));584585__m128i r2_0 = state0;586__m128i r2_1 = state1;587__m128i r2_2 = state2;588__m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));589590__m128i r3_0 = state0;591__m128i r3_1 = state1;592__m128i r3_2 = state2;593__m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));594595for (int i = static_cast<int>(rounds); i > 0; i -= 2)596{597r0_0 = _mm_add_epi32(r0_0, r0_1);598r1_0 = _mm_add_epi32(r1_0, r1_1);599r2_0 = _mm_add_epi32(r2_0, r2_1);600r3_0 = _mm_add_epi32(r3_0, r3_1);601602r0_3 = _mm_xor_si128(r0_3, r0_0);603r1_3 = _mm_xor_si128(r1_3, r1_0);604r2_3 = _mm_xor_si128(r2_3, r2_0);605r3_3 = _mm_xor_si128(r3_3, r3_0);606607r0_3 = RotateLeft<16>(r0_3);608r1_3 = RotateLeft<16>(r1_3);609r2_3 = RotateLeft<16>(r2_3);610r3_3 = RotateLeft<16>(r3_3);611612r0_2 = _mm_add_epi32(r0_2, r0_3);613r1_2 = _mm_add_epi32(r1_2, r1_3);614r2_2 = _mm_add_epi32(r2_2, r2_3);615r3_2 = _mm_add_epi32(r3_2, r3_3);616617r0_1 = _mm_xor_si128(r0_1, r0_2);618r1_1 = _mm_xor_si128(r1_1, r1_2);619r2_1 = _mm_xor_si128(r2_1, r2_2);620r3_1 = _mm_xor_si128(r3_1, r3_2);621622r0_1 = RotateLeft<12>(r0_1);623r1_1 = RotateLeft<12>(r1_1);624r2_1 = RotateLeft<12>(r2_1);625r3_1 = RotateLeft<12>(r3_1);626627r0_0 = _mm_add_epi32(r0_0, r0_1);628r1_0 = _mm_add_epi32(r1_0, r1_1);629r2_0 = _mm_add_epi32(r2_0, r2_1);630r3_0 = _mm_add_epi32(r3_0, r3_1);631632r0_3 = _mm_xor_si128(r0_3, r0_0);633r1_3 = _mm_xor_si128(r1_3, r1_0);634r2_3 = _mm_xor_si128(r2_3, r2_0);635r3_3 = _mm_xor_si128(r3_3, r3_0);636637r0_3 = 
RotateLeft<8>(r0_3);638r1_3 = RotateLeft<8>(r1_3);639r2_3 = RotateLeft<8>(r2_3);640r3_3 = RotateLeft<8>(r3_3);641642r0_2 = _mm_add_epi32(r0_2, r0_3);643r1_2 = _mm_add_epi32(r1_2, r1_3);644r2_2 = _mm_add_epi32(r2_2, r2_3);645r3_2 = _mm_add_epi32(r3_2, r3_3);646647r0_1 = _mm_xor_si128(r0_1, r0_2);648r1_1 = _mm_xor_si128(r1_1, r1_2);649r2_1 = _mm_xor_si128(r2_1, r2_2);650r3_1 = _mm_xor_si128(r3_1, r3_2);651652r0_1 = RotateLeft<7>(r0_1);653r1_1 = RotateLeft<7>(r1_1);654r2_1 = RotateLeft<7>(r2_1);655r3_1 = RotateLeft<7>(r3_1);656657r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));658r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));659r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));660661r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));662r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));663r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));664665r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));666r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));667r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));668669r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));670r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));671r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));672673r0_0 = _mm_add_epi32(r0_0, r0_1);674r1_0 = _mm_add_epi32(r1_0, r1_1);675r2_0 = _mm_add_epi32(r2_0, r2_1);676r3_0 = _mm_add_epi32(r3_0, r3_1);677678r0_3 = _mm_xor_si128(r0_3, r0_0);679r1_3 = _mm_xor_si128(r1_3, r1_0);680r2_3 = _mm_xor_si128(r2_3, r2_0);681r3_3 = _mm_xor_si128(r3_3, r3_0);682683r0_3 = RotateLeft<16>(r0_3);684r1_3 = RotateLeft<16>(r1_3);685r2_3 = RotateLeft<16>(r2_3);686r3_3 = RotateLeft<16>(r3_3);687688r0_2 = _mm_add_epi32(r0_2, r0_3);689r1_2 = _mm_add_epi32(r1_2, r1_3);690r2_2 = _mm_add_epi32(r2_2, r2_3);691r3_2 = _mm_add_epi32(r3_2, r3_3);692693r0_1 = _mm_xor_si128(r0_1, r0_2);694r1_1 = _mm_xor_si128(r1_1, r1_2);695r2_1 = _mm_xor_si128(r2_1, r2_2);696r3_1 = _mm_xor_si128(r3_1, r3_2);697698r0_1 = 
RotateLeft<12>(r0_1);699r1_1 = RotateLeft<12>(r1_1);700r2_1 = RotateLeft<12>(r2_1);701r3_1 = RotateLeft<12>(r3_1);702703r0_0 = _mm_add_epi32(r0_0, r0_1);704r1_0 = _mm_add_epi32(r1_0, r1_1);705r2_0 = _mm_add_epi32(r2_0, r2_1);706r3_0 = _mm_add_epi32(r3_0, r3_1);707708r0_3 = _mm_xor_si128(r0_3, r0_0);709r1_3 = _mm_xor_si128(r1_3, r1_0);710r2_3 = _mm_xor_si128(r2_3, r2_0);711r3_3 = _mm_xor_si128(r3_3, r3_0);712713r0_3 = RotateLeft<8>(r0_3);714r1_3 = RotateLeft<8>(r1_3);715r2_3 = RotateLeft<8>(r2_3);716r3_3 = RotateLeft<8>(r3_3);717718r0_2 = _mm_add_epi32(r0_2, r0_3);719r1_2 = _mm_add_epi32(r1_2, r1_3);720r2_2 = _mm_add_epi32(r2_2, r2_3);721r3_2 = _mm_add_epi32(r3_2, r3_3);722723r0_1 = _mm_xor_si128(r0_1, r0_2);724r1_1 = _mm_xor_si128(r1_1, r1_2);725r2_1 = _mm_xor_si128(r2_1, r2_2);726r3_1 = _mm_xor_si128(r3_1, r3_2);727728r0_1 = RotateLeft<7>(r0_1);729r1_1 = RotateLeft<7>(r1_1);730r2_1 = RotateLeft<7>(r2_1);731r3_1 = RotateLeft<7>(r3_1);732733r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));734r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));735r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));736737r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));738r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));739r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));740741r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));742r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));743r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));744745r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));746r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));747r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));748}749750r0_0 = _mm_add_epi32(r0_0, state0);751r0_1 = _mm_add_epi32(r0_1, state1);752r0_2 = _mm_add_epi32(r0_2, state2);753r0_3 = _mm_add_epi32(r0_3, state3);754755r1_0 = _mm_add_epi32(r1_0, state0);756r1_1 = _mm_add_epi32(r1_1, state1);757r1_2 = _mm_add_epi32(r1_2, state2);758r1_3 = _mm_add_epi32(r1_3, state3);759r1_3 = 
_mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));760761r2_0 = _mm_add_epi32(r2_0, state0);762r2_1 = _mm_add_epi32(r2_1, state1);763r2_2 = _mm_add_epi32(r2_2, state2);764r2_3 = _mm_add_epi32(r2_3, state3);765r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));766767r3_0 = _mm_add_epi32(r3_0, state0);768r3_1 = _mm_add_epi32(r3_1, state1);769r3_2 = _mm_add_epi32(r3_2, state2);770r3_3 = _mm_add_epi32(r3_3, state3);771r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));772773if (input)774{775r0_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+0*16)), r0_0);776r0_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+1*16)), r0_1);777r0_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+2*16)), r0_2);778r0_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+3*16)), r0_3);779}780781_mm_storeu_si128(reinterpret_cast<__m128i*>(output+0*16), r0_0);782_mm_storeu_si128(reinterpret_cast<__m128i*>(output+1*16), r0_1);783_mm_storeu_si128(reinterpret_cast<__m128i*>(output+2*16), r0_2);784_mm_storeu_si128(reinterpret_cast<__m128i*>(output+3*16), r0_3);785786if (input)787{788r1_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+4*16)), r1_0);789r1_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+5*16)), r1_1);790r1_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+6*16)), r1_2);791r1_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+7*16)), r1_3);792}793794_mm_storeu_si128(reinterpret_cast<__m128i*>(output+4*16), r1_0);795_mm_storeu_si128(reinterpret_cast<__m128i*>(output+5*16), r1_1);796_mm_storeu_si128(reinterpret_cast<__m128i*>(output+6*16), r1_2);797_mm_storeu_si128(reinterpret_cast<__m128i*>(output+7*16), r1_3);798799if (input)800{801r2_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+ 8*16)), r2_0);802r2_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+ 
9*16)), r2_1);803r2_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+10*16)), r2_2);804r2_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+11*16)), r2_3);805}806807_mm_storeu_si128(reinterpret_cast<__m128i*>(output+ 8*16), r2_0);808_mm_storeu_si128(reinterpret_cast<__m128i*>(output+ 9*16), r2_1);809_mm_storeu_si128(reinterpret_cast<__m128i*>(output+10*16), r2_2);810_mm_storeu_si128(reinterpret_cast<__m128i*>(output+11*16), r2_3);811812if (input)813{814r3_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+12*16)), r3_0);815r3_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+13*16)), r3_1);816r3_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+14*16)), r3_2);817r3_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+15*16)), r3_3);818}819820_mm_storeu_si128(reinterpret_cast<__m128i*>(output+12*16), r3_0);821_mm_storeu_si128(reinterpret_cast<__m128i*>(output+13*16), r3_1);822_mm_storeu_si128(reinterpret_cast<__m128i*>(output+14*16), r3_2);823_mm_storeu_si128(reinterpret_cast<__m128i*>(output+15*16), r3_3);824}825826#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE827828#if (CRYPTOPP_ALTIVEC_AVAILABLE)829830// ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,831// depending on the flags used to compile this source file. The832// abstractions are handled in VecLoad, VecStore and friends. 
In833// the future we may to provide both POWER7 or ALTIVEC at the same834// time to better support distros.835inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)836{837const uint32x4_p state0 = VecLoadAligned(state + 0*4);838const uint32x4_p state1 = VecLoadAligned(state + 1*4);839const uint32x4_p state2 = VecLoadAligned(state + 2*4);840const uint32x4_p state3 = VecLoadAligned(state + 3*4);841842const uint32x4_p CTRS[3] = {843{1,0,0,0}, {2,0,0,0}, {3,0,0,0}844};845846uint32x4_p r0_0 = state0;847uint32x4_p r0_1 = state1;848uint32x4_p r0_2 = state2;849uint32x4_p r0_3 = state3;850851uint32x4_p r1_0 = state0;852uint32x4_p r1_1 = state1;853uint32x4_p r1_2 = state2;854uint32x4_p r1_3 = VecAdd64(r0_3, CTRS[0]);855856uint32x4_p r2_0 = state0;857uint32x4_p r2_1 = state1;858uint32x4_p r2_2 = state2;859uint32x4_p r2_3 = VecAdd64(r0_3, CTRS[1]);860861uint32x4_p r3_0 = state0;862uint32x4_p r3_1 = state1;863uint32x4_p r3_2 = state2;864uint32x4_p r3_3 = VecAdd64(r0_3, CTRS[2]);865866for (int i = static_cast<int>(rounds); i > 0; i -= 2)867{868r0_0 = VecAdd(r0_0, r0_1);869r1_0 = VecAdd(r1_0, r1_1);870r2_0 = VecAdd(r2_0, r2_1);871r3_0 = VecAdd(r3_0, r3_1);872873r0_3 = VecXor(r0_3, r0_0);874r1_3 = VecXor(r1_3, r1_0);875r2_3 = VecXor(r2_3, r2_0);876r3_3 = VecXor(r3_3, r3_0);877878r0_3 = VecRotateLeft<16>(r0_3);879r1_3 = VecRotateLeft<16>(r1_3);880r2_3 = VecRotateLeft<16>(r2_3);881r3_3 = VecRotateLeft<16>(r3_3);882883r0_2 = VecAdd(r0_2, r0_3);884r1_2 = VecAdd(r1_2, r1_3);885r2_2 = VecAdd(r2_2, r2_3);886r3_2 = VecAdd(r3_2, r3_3);887888r0_1 = VecXor(r0_1, r0_2);889r1_1 = VecXor(r1_1, r1_2);890r2_1 = VecXor(r2_1, r2_2);891r3_1 = VecXor(r3_1, r3_2);892893r0_1 = VecRotateLeft<12>(r0_1);894r1_1 = VecRotateLeft<12>(r1_1);895r2_1 = VecRotateLeft<12>(r2_1);896r3_1 = VecRotateLeft<12>(r3_1);897898r0_0 = VecAdd(r0_0, r0_1);899r1_0 = VecAdd(r1_0, r1_1);900r2_0 = VecAdd(r2_0, r2_1);901r3_0 = VecAdd(r3_0, r3_1);902903r0_3 = 
VecXor(r0_3, r0_0);904r1_3 = VecXor(r1_3, r1_0);905r2_3 = VecXor(r2_3, r2_0);906r3_3 = VecXor(r3_3, r3_0);907908r0_3 = VecRotateLeft<8>(r0_3);909r1_3 = VecRotateLeft<8>(r1_3);910r2_3 = VecRotateLeft<8>(r2_3);911r3_3 = VecRotateLeft<8>(r3_3);912913r0_2 = VecAdd(r0_2, r0_3);914r1_2 = VecAdd(r1_2, r1_3);915r2_2 = VecAdd(r2_2, r2_3);916r3_2 = VecAdd(r3_2, r3_3);917918r0_1 = VecXor(r0_1, r0_2);919r1_1 = VecXor(r1_1, r1_2);920r2_1 = VecXor(r2_1, r2_2);921r3_1 = VecXor(r3_1, r3_2);922923r0_1 = VecRotateLeft<7>(r0_1);924r1_1 = VecRotateLeft<7>(r1_1);925r2_1 = VecRotateLeft<7>(r2_1);926r3_1 = VecRotateLeft<7>(r3_1);927928r0_1 = Shuffle<1>(r0_1);929r0_2 = Shuffle<2>(r0_2);930r0_3 = Shuffle<3>(r0_3);931932r1_1 = Shuffle<1>(r1_1);933r1_2 = Shuffle<2>(r1_2);934r1_3 = Shuffle<3>(r1_3);935936r2_1 = Shuffle<1>(r2_1);937r2_2 = Shuffle<2>(r2_2);938r2_3 = Shuffle<3>(r2_3);939940r3_1 = Shuffle<1>(r3_1);941r3_2 = Shuffle<2>(r3_2);942r3_3 = Shuffle<3>(r3_3);943944r0_0 = VecAdd(r0_0, r0_1);945r1_0 = VecAdd(r1_0, r1_1);946r2_0 = VecAdd(r2_0, r2_1);947r3_0 = VecAdd(r3_0, r3_1);948949r0_3 = VecXor(r0_3, r0_0);950r1_3 = VecXor(r1_3, r1_0);951r2_3 = VecXor(r2_3, r2_0);952r3_3 = VecXor(r3_3, r3_0);953954r0_3 = VecRotateLeft<16>(r0_3);955r1_3 = VecRotateLeft<16>(r1_3);956r2_3 = VecRotateLeft<16>(r2_3);957r3_3 = VecRotateLeft<16>(r3_3);958959r0_2 = VecAdd(r0_2, r0_3);960r1_2 = VecAdd(r1_2, r1_3);961r2_2 = VecAdd(r2_2, r2_3);962r3_2 = VecAdd(r3_2, r3_3);963964r0_1 = VecXor(r0_1, r0_2);965r1_1 = VecXor(r1_1, r1_2);966r2_1 = VecXor(r2_1, r2_2);967r3_1 = VecXor(r3_1, r3_2);968969r0_1 = VecRotateLeft<12>(r0_1);970r1_1 = VecRotateLeft<12>(r1_1);971r2_1 = VecRotateLeft<12>(r2_1);972r3_1 = VecRotateLeft<12>(r3_1);973974r0_0 = VecAdd(r0_0, r0_1);975r1_0 = VecAdd(r1_0, r1_1);976r2_0 = VecAdd(r2_0, r2_1);977r3_0 = VecAdd(r3_0, r3_1);978979r0_3 = VecXor(r0_3, r0_0);980r1_3 = VecXor(r1_3, r1_0);981r2_3 = VecXor(r2_3, r2_0);982r3_3 = VecXor(r3_3, r3_0);983984r0_3 = VecRotateLeft<8>(r0_3);985r1_3 = 
VecRotateLeft<8>(r1_3);986r2_3 = VecRotateLeft<8>(r2_3);987r3_3 = VecRotateLeft<8>(r3_3);988989r0_2 = VecAdd(r0_2, r0_3);990r1_2 = VecAdd(r1_2, r1_3);991r2_2 = VecAdd(r2_2, r2_3);992r3_2 = VecAdd(r3_2, r3_3);993994r0_1 = VecXor(r0_1, r0_2);995r1_1 = VecXor(r1_1, r1_2);996r2_1 = VecXor(r2_1, r2_2);997r3_1 = VecXor(r3_1, r3_2);998999r0_1 = VecRotateLeft<7>(r0_1);1000r1_1 = VecRotateLeft<7>(r1_1);1001r2_1 = VecRotateLeft<7>(r2_1);1002r3_1 = VecRotateLeft<7>(r3_1);10031004r0_1 = Shuffle<3>(r0_1);1005r0_2 = Shuffle<2>(r0_2);1006r0_3 = Shuffle<1>(r0_3);10071008r1_1 = Shuffle<3>(r1_1);1009r1_2 = Shuffle<2>(r1_2);1010r1_3 = Shuffle<1>(r1_3);10111012r2_1 = Shuffle<3>(r2_1);1013r2_2 = Shuffle<2>(r2_2);1014r2_3 = Shuffle<1>(r2_3);10151016r3_1 = Shuffle<3>(r3_1);1017r3_2 = Shuffle<2>(r3_2);1018r3_3 = Shuffle<1>(r3_3);1019}10201021r0_0 = VecAdd(r0_0, state0);1022r0_1 = VecAdd(r0_1, state1);1023r0_2 = VecAdd(r0_2, state2);1024r0_3 = VecAdd(r0_3, state3);10251026r1_0 = VecAdd(r1_0, state0);1027r1_1 = VecAdd(r1_1, state1);1028r1_2 = VecAdd(r1_2, state2);1029r1_3 = VecAdd(r1_3, state3);1030r1_3 = VecAdd64(r1_3, CTRS[0]);10311032r2_0 = VecAdd(r2_0, state0);1033r2_1 = VecAdd(r2_1, state1);1034r2_2 = VecAdd(r2_2, state2);1035r2_3 = VecAdd(r2_3, state3);1036r2_3 = VecAdd64(r2_3, CTRS[1]);10371038r3_0 = VecAdd(r3_0, state0);1039r3_1 = VecAdd(r3_1, state1);1040r3_2 = VecAdd(r3_2, state2);1041r3_3 = VecAdd(r3_3, state3);1042r3_3 = VecAdd64(r3_3, CTRS[2]);10431044if (input)1045{1046r0_0 = VecXor(VecLoad32LE(input + 0*16), r0_0);1047r0_1 = VecXor(VecLoad32LE(input + 1*16), r0_1);1048r0_2 = VecXor(VecLoad32LE(input + 2*16), r0_2);1049r0_3 = VecXor(VecLoad32LE(input + 3*16), r0_3);1050}10511052VecStore32LE(output + 0*16, r0_0);1053VecStore32LE(output + 1*16, r0_1);1054VecStore32LE(output + 2*16, r0_2);1055VecStore32LE(output + 3*16, r0_3);10561057if (input)1058{1059r1_0 = VecXor(VecLoad32LE(input + 4*16), r1_0);1060r1_1 = VecXor(VecLoad32LE(input + 5*16), r1_1);1061r1_2 = 
VecXor(VecLoad32LE(input + 6*16), r1_2);1062r1_3 = VecXor(VecLoad32LE(input + 7*16), r1_3);1063}10641065VecStore32LE(output + 4*16, r1_0);1066VecStore32LE(output + 5*16, r1_1);1067VecStore32LE(output + 6*16, r1_2);1068VecStore32LE(output + 7*16, r1_3);10691070if (input)1071{1072r2_0 = VecXor(VecLoad32LE(input + 8*16), r2_0);1073r2_1 = VecXor(VecLoad32LE(input + 9*16), r2_1);1074r2_2 = VecXor(VecLoad32LE(input + 10*16), r2_2);1075r2_3 = VecXor(VecLoad32LE(input + 11*16), r2_3);1076}10771078VecStore32LE(output + 8*16, r2_0);1079VecStore32LE(output + 9*16, r2_1);1080VecStore32LE(output + 10*16, r2_2);1081VecStore32LE(output + 11*16, r2_3);10821083if (input)1084{1085r3_0 = VecXor(VecLoad32LE(input + 12*16), r3_0);1086r3_1 = VecXor(VecLoad32LE(input + 13*16), r3_1);1087r3_2 = VecXor(VecLoad32LE(input + 14*16), r3_2);1088r3_3 = VecXor(VecLoad32LE(input + 15*16), r3_3);1089}10901091VecStore32LE(output + 12*16, r3_0);1092VecStore32LE(output + 13*16, r3_1);1093VecStore32LE(output + 14*16, r3_2);1094VecStore32LE(output + 15*16, r3_3);1095}10961097#endif // CRYPTOPP_ALTIVEC_AVAILABLE10981099#if (CRYPTOPP_ALTIVEC_AVAILABLE)11001101void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)1102{1103ChaCha_OperateKeystream_CORE(state, input, output, rounds);1104}11051106#endif11071108NAMESPACE_END110911101111