Path: blob/a-new-beginning/SharedDependencies/Sources/cryptopp/blake2b_simd.cpp
2 views
// blake2_simd.cpp - written and placed in the public domain by1// Samuel Neves, Jeffrey Walton, Uri Blumenthal2// and Marcel Raad.3//4// This source file uses intrinsics to gain access to ARMv7a/ARMv8a5// NEON, Power8 and SSE4.1 instructions. A separate source file is6// needed because additional CXXFLAGS are required to enable the7// appropriate instructions sets in some build configurations.89#include "pch.h"10#include "config.h"11#include "misc.h"12#include "blake2.h"1314// Uncomment for benchmarking C++ against SSE2 or NEON.15// Do so in both blake2.cpp and blake2_simd.cpp.16// #undef CRYPTOPP_SSE41_AVAILABLE17// #undef CRYPTOPP_ARM_NEON_AVAILABLE18// #undef CRYPTOPP_ALTIVEC_AVAILABLE1920// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about21// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.22#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)23# undef CRYPTOPP_ARM_NEON_AVAILABLE24#endif2526// BLAKE2s bug on AIX 7.1 (POWER7) with XLC 12.0127// https://github.com/weidai11/cryptopp/issues/74328#if defined(__xlC__) && (__xlC__ < 0x0d01)29# define CRYPTOPP_DISABLE_ALTIVEC 130# undef CRYPTOPP_POWER8_AVAILABLE31# undef CRYPTOPP_ALTIVEC_AVAILABLE32#endif3334#if defined(__XOP__)35# if defined(CRYPTOPP_GCC_COMPATIBLE)36# include <x86intrin.h>37# endif38# include <ammintrin.h>39#endif // XOP4041#if (CRYPTOPP_SSE41_AVAILABLE)42# include <emmintrin.h>43# include <tmmintrin.h>44# include <smmintrin.h>45#endif4647#if (CRYPTOPP_ARM_NEON_HEADER)48# include <arm_neon.h>49#endif5051#if (CRYPTOPP_ARM_ACLE_HEADER)52# include <stdint.h>53# include <arm_acle.h>54#endif5556#if (CRYPTOPP_POWER8_AVAILABLE)57# include "ppc_simd.h"58#endif5960#if defined(CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE)61/* Ignore "warning: vec_lvsl is deprecated..." */62# pragma GCC diagnostic ignored "-Wdeprecated"63#endif6465// Squash MS LNK4221 and libtool warnings66extern const char BLAKE2B_SIMD_FNAME[] = __FILE__;6768NAMESPACE_BEGIN(CryptoPP)6970// Exported by blake2.cpp71extern const word32 BLAKE2S_IV[8];72extern const word64 BLAKE2B_IV[8];7374#if CRYPTOPP_SSE41_AVAILABLE7576#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))77#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)78#define TOF(reg) _mm_castsi128_ps((reg))79#define TOI(reg) _mm_castps_si128((reg))8081void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)82{83#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \84do { \85b0 = _mm_unpacklo_epi64(m0, m1); \86b1 = _mm_unpacklo_epi64(m2, m3); \87} while(0)8889#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \90do { \91b0 = _mm_unpackhi_epi64(m0, m1); \92b1 = _mm_unpackhi_epi64(m2, m3); \93} while(0)9495#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \96do { \97b0 = _mm_unpacklo_epi64(m4, m5); \98b1 = _mm_unpacklo_epi64(m6, m7); \99} while(0)100101#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \102do { \103b0 = _mm_unpackhi_epi64(m4, m5); \104b1 = _mm_unpackhi_epi64(m6, m7); \105} while(0)106107#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \108do { \109b0 = _mm_unpacklo_epi64(m7, m2); \110b1 = _mm_unpackhi_epi64(m4, m6); \111} while(0)112113#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \114do { \115b0 = _mm_unpacklo_epi64(m5, m4); \116b1 = _mm_alignr_epi8(m3, m7, 8); \117} while(0)118119#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \120do { \121b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \122b1 = _mm_unpackhi_epi64(m5, m2); \123} while(0)124125#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \126do { \127b0 = _mm_unpacklo_epi64(m6, m1); \128b1 = _mm_unpackhi_epi64(m3, m1); \129} while(0)130131#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \132do { \133b0 = _mm_alignr_epi8(m6, m5, 8); \134b1 = _mm_unpackhi_epi64(m2, m7); \135} while(0)136137#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \138do { \139b0 = _mm_unpacklo_epi64(m4, m0); \140b1 = _mm_blend_epi16(m1, m6, 0xF0); \141} while(0)142143#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \144do { \145b0 = _mm_blend_epi16(m5, m1, 0xF0); \146b1 = _mm_unpackhi_epi64(m3, m4); \147} while(0)148149#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \150do { \151b0 = _mm_unpacklo_epi64(m7, m3); \152b1 = _mm_alignr_epi8(m2, m0, 8); \153} while(0)154155#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \156do { \157b0 = _mm_unpackhi_epi64(m3, m1); \158b1 = _mm_unpackhi_epi64(m6, m5); \159} while(0)160161#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \162do { \163b0 = _mm_unpackhi_epi64(m4, m0); \164b1 = _mm_unpacklo_epi64(m6, m7); \165} while(0)166167#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \168do { \169b0 = _mm_blend_epi16(m1, m2, 0xF0); \170b1 = _mm_blend_epi16(m2, m7, 0xF0); \171} while(0)172173#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \174do { \175b0 = _mm_unpacklo_epi64(m3, m5); \176b1 = _mm_unpacklo_epi64(m0, m4); \177} while(0)178179#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \180do { \181b0 = _mm_unpackhi_epi64(m4, m2); \182b1 = _mm_unpacklo_epi64(m1, m5); \183} while(0)184185#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \186do { \187b0 = _mm_blend_epi16(m0, m3, 0xF0); \188b1 = _mm_blend_epi16(m2, m7, 0xF0); \189} while(0)190191#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \192do { \193b0 = _mm_blend_epi16(m7, m5, 0xF0); \194b1 = _mm_blend_epi16(m3, m1, 0xF0); \195} while(0)196197#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \198do { \199b0 = _mm_alignr_epi8(m6, m0, 8); \200b1 = _mm_blend_epi16(m4, m6, 0xF0); \201} while(0)202203#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \204do { \205b0 = _mm_unpacklo_epi64(m1, m3); \206b1 = _mm_unpacklo_epi64(m0, m4); \207} while(0)208209#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \210do { \211b0 = _mm_unpacklo_epi64(m6, m5); \212b1 = _mm_unpackhi_epi64(m5, m1); \213} while(0)214215#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \216do { \217b0 = _mm_blend_epi16(m2, m3, 0xF0); \218b1 = _mm_unpackhi_epi64(m7, m0); \219} while(0)220221#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \222do { \223b0 = _mm_unpackhi_epi64(m6, m2); \224b1 = _mm_blend_epi16(m7, m4, 0xF0); \225} while(0)226227#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \228do { \229b0 = _mm_blend_epi16(m6, m0, 0xF0); \230b1 = _mm_unpacklo_epi64(m7, m2); \231} while(0)232233#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \234do { \235b0 = _mm_unpackhi_epi64(m2, m7); \236b1 = _mm_alignr_epi8(m5, m6, 8); \237} while(0)238239#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \240do { \241b0 = _mm_unpacklo_epi64(m0, m3); \242b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \243} while(0)244245#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \246do { \247b0 = _mm_unpackhi_epi64(m3, m1); \248b1 = _mm_blend_epi16(m1, m5, 0xF0); \249} while(0)250251#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \252do { \253b0 = _mm_unpackhi_epi64(m6, m3); \254b1 = _mm_blend_epi16(m6, m1, 0xF0); \255} while(0)256257#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \258do { \259b0 = _mm_alignr_epi8(m7, m5, 8); \260b1 = _mm_unpackhi_epi64(m0, m4); \261} while(0)262263#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \264do { \265b0 = _mm_unpackhi_epi64(m2, m7); \266b1 = _mm_unpacklo_epi64(m4, m1); \267} while(0)268269#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \270do { \271b0 = _mm_unpacklo_epi64(m0, m2); \272b1 = _mm_unpacklo_epi64(m3, m5); \273} while(0)274275#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \276do { \277b0 = _mm_unpacklo_epi64(m3, m7); \278b1 = _mm_alignr_epi8(m0, m5, 8); \279} while(0)280281#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \282do { \283b0 = _mm_unpackhi_epi64(m7, m4); \284b1 = _mm_alignr_epi8(m4, m1, 8); \285} while(0)286287#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \288do { \289b0 = m6; \290b1 = _mm_alignr_epi8(m5, m0, 8); \291} while(0)292293#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \294do { \295b0 = _mm_blend_epi16(m1, m3, 0xF0); \296b1 = m2; \297} while(0)298299#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \300do { \301b0 = _mm_unpacklo_epi64(m5, m4); \302b1 = _mm_unpackhi_epi64(m3, m0); \303} while(0)304305#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \306do { \307b0 = _mm_unpacklo_epi64(m1, m2); \308b1 = _mm_blend_epi16(m3, m2, 0xF0); \309} while(0)310311#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \312do { \313b0 = _mm_unpackhi_epi64(m7, m4); \314b1 = _mm_unpackhi_epi64(m1, m6); \315} while(0)316317#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \318do { \319b0 = _mm_alignr_epi8(m7, m5, 8); \320b1 = _mm_unpacklo_epi64(m6, m0); \321} while(0)322323#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \324do { \325b0 = _mm_unpacklo_epi64(m0, m1); \326b1 = _mm_unpacklo_epi64(m2, m3); \327} while(0)328329#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \330do { \331b0 = _mm_unpackhi_epi64(m0, m1); \332b1 = _mm_unpackhi_epi64(m2, m3); \333} while(0)334335#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \336do { \337b0 = _mm_unpacklo_epi64(m4, m5); \338b1 = _mm_unpacklo_epi64(m6, m7); \339} while(0)340341#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \342do { \343b0 = _mm_unpackhi_epi64(m4, m5); \344b1 = _mm_unpackhi_epi64(m6, m7); \345} while(0)346347#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \348do { \349b0 = _mm_unpacklo_epi64(m7, m2); \350b1 = _mm_unpackhi_epi64(m4, m6); \351} while(0)352353#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \354do { \355b0 = _mm_unpacklo_epi64(m5, m4); \356b1 = _mm_alignr_epi8(m3, m7, 8); \357} while(0)358359#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \360do { \361b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \362b1 = _mm_unpackhi_epi64(m5, m2); \363} while(0)364365#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \366do { \367b0 = _mm_unpacklo_epi64(m6, m1); \368b1 = _mm_unpackhi_epi64(m3, m1); \369} while(0)370371#ifdef __XOP__372# define MM_ROTI_EPI64(r, c) \373_mm_roti_epi64(r, c)374#else375# define MM_ROTI_EPI64(x, c) \376(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \377: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \378: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \379: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \380: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))381#endif382383#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \384row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \385row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \386\387row4l = _mm_xor_si128(row4l, row1l); \388row4h = _mm_xor_si128(row4h, row1h); \389\390row4l = MM_ROTI_EPI64(row4l, -32); \391row4h = MM_ROTI_EPI64(row4h, -32); \392\393row3l = _mm_add_epi64(row3l, row4l); \394row3h = _mm_add_epi64(row3h, row4h); \395\396row2l = _mm_xor_si128(row2l, row3l); \397row2h = _mm_xor_si128(row2h, row3h); \398\399row2l = MM_ROTI_EPI64(row2l, -24); \400row2h = MM_ROTI_EPI64(row2h, -24);401402#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \403row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \404row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \405\406row4l = _mm_xor_si128(row4l, row1l); \407row4h = _mm_xor_si128(row4h, row1h); \408\409row4l = MM_ROTI_EPI64(row4l, -16); \410row4h = MM_ROTI_EPI64(row4h, -16); \411\412row3l = _mm_add_epi64(row3l, row4l); \413row3h = _mm_add_epi64(row3h, row4h); \414\415row2l = _mm_xor_si128(row2l, row3l); \416row2h = _mm_xor_si128(row2h, row3h); \417\418row2l = MM_ROTI_EPI64(row2l, -63); \419row2h = MM_ROTI_EPI64(row2h, -63); \420421#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \422t0 = row4l;\423t1 = row2l;\424row4l = row3l;\425row3l = row3h;\426row3h = row4l;\427row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \428row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \429row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \430row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))431432#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \433t0 = row3l;\434row3l = row3h;\435row3h = t0;\436t0 = row2l;\437t1 = row4l;\438row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \439row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \440row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \441row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))442443#define BLAKE2B_ROUND(r) \444BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \445BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \446BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \447BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \448BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \449BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \450BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \451BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \452BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \453BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);454455__m128i row1l, row1h;456__m128i row2l, row2h;457__m128i row3l, row3h;458__m128i row4l, row4h;459__m128i b0, b1;460__m128i t0, t1;461462const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);463const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);464465const __m128i m0 = LOADU(input + 00);466const __m128i m1 = LOADU(input + 16);467const __m128i m2 = LOADU(input + 32);468const __m128i m3 = LOADU(input + 48);469const __m128i m4 = LOADU(input + 64);470const __m128i m5 = LOADU(input + 80);471const __m128i m6 = LOADU(input + 96);472const __m128i m7 = LOADU(input + 112);473474row1l = LOADU(state.h()+0);475row1h = LOADU(state.h()+2);476row2l = LOADU(state.h()+4);477row2h = LOADU(state.h()+6);478row3l = LOADU(BLAKE2B_IV+0);479row3h = LOADU(BLAKE2B_IV+2);480row4l = _mm_xor_si128(LOADU(BLAKE2B_IV+4), LOADU(state.t()+0));481row4h = _mm_xor_si128(LOADU(BLAKE2B_IV+6), LOADU(state.f()+0));482483BLAKE2B_ROUND(0);484BLAKE2B_ROUND(1);485BLAKE2B_ROUND(2);486BLAKE2B_ROUND(3);487BLAKE2B_ROUND(4);488BLAKE2B_ROUND(5);489BLAKE2B_ROUND(6);490BLAKE2B_ROUND(7);491BLAKE2B_ROUND(8);492BLAKE2B_ROUND(9);493BLAKE2B_ROUND(10);494BLAKE2B_ROUND(11);495496row1l = _mm_xor_si128(row3l, row1l);497row1h = _mm_xor_si128(row3h, row1h);498STOREU(state.h()+0, _mm_xor_si128(LOADU(state.h()+0), row1l));499STOREU(state.h()+2, _mm_xor_si128(LOADU(state.h()+2), row1h));500row2l = _mm_xor_si128(row4l, row2l);501row2h = _mm_xor_si128(row4h, row2h);502STOREU(state.h()+4, _mm_xor_si128(LOADU(state.h()+4), row2l));503STOREU(state.h()+6, _mm_xor_si128(LOADU(state.h()+6), row2h));504}505#endif // CRYPTOPP_SSE41_AVAILABLE506507#if CRYPTOPP_ARM_NEON_AVAILABLE508void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)509{510#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \511do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)512513#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \514do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)515516#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \517do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)518519#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \520do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)521522#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \523do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)524525#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \526do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)527528#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \529do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)530531#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \532do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)533534#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \535do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)536537#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \538do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)539540#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \541do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)542543#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \544do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)545546#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \547do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)548549#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \550do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)551552#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \553do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)554555#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \556do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)557558#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \559do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)560561#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \562do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)563564#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \565do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)566567#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \568do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)569570#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \571do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)572573#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \574do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)575576#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \577do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)578579#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \580do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)581582#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \583do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)584585#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \586do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)587588#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \589do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)590591#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \592do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)593594#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \595do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)596597#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \598do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)599600#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \601do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)602603#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \604do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)605606#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \607do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)608609#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \610do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)611612#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \613do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)614615#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \616do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)617618#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \619do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)620621#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \622do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)623624#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \625do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)626627#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \628do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)629630#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \631do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)632633#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \634do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)635636#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \637do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)638639#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \640do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)641642#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \643do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)644645#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \646do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)647648#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \649do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)650651#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \652do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)653654#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))655656#define vrorq_n_u64_24(x) vcombine_u64( \657vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \658vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))659660#define vrorq_n_u64_16(x) vcombine_u64( \661vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \662vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))663664#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))665666#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \667do { \668row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \669row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \670row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \671row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \672row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \673row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \674row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \675} while(0)676677#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \678do { \679row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \680row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \681row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \682row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \683row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \684row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \685row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \686} while(0)687688#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \689do { \690uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \691uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \692row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \693t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \694row4l = t0; row4h = t1; \695} while(0)696697#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \698do { \699uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \700uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \701row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \702t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \703row4l = t0; row4h = t1; \704} while(0)705706#define BLAKE2B_ROUND(r) \707do { \708uint64x2_t b0, b1; \709BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \710BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \711BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \712BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \713BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \714BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \715BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \716BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \717BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \718BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \719} while(0)720721const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));722const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));723const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));724const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));725const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));726const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));727const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));728const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));729730uint64x2_t row1l, row1h, row2l, row2h;731uint64x2_t row3l, row3h, row4l, row4h;732733const uint64x2_t h0 = row1l = vld1q_u64(state.h()+0);734const uint64x2_t h1 = row1h = vld1q_u64(state.h()+2);735const uint64x2_t h2 = row2l = vld1q_u64(state.h()+4);736const uint64x2_t h3 = row2h = vld1q_u64(state.h()+6);737738row3l = vld1q_u64(BLAKE2B_IV+0);739row3h = vld1q_u64(BLAKE2B_IV+2);740row4l = veorq_u64(vld1q_u64(BLAKE2B_IV+4), vld1q_u64(state.t()+0));741row4h = veorq_u64(vld1q_u64(BLAKE2B_IV+6), vld1q_u64(state.f()+0));742743BLAKE2B_ROUND(0);744BLAKE2B_ROUND(1);745BLAKE2B_ROUND(2);746BLAKE2B_ROUND(3);747BLAKE2B_ROUND(4);748BLAKE2B_ROUND(5);749BLAKE2B_ROUND(6);750BLAKE2B_ROUND(7);751BLAKE2B_ROUND(8);752BLAKE2B_ROUND(9);753BLAKE2B_ROUND(10);754BLAKE2B_ROUND(11);755756vst1q_u64(state.h()+0, veorq_u64(h0, veorq_u64(row1l, row3l)));757vst1q_u64(state.h()+2, veorq_u64(h1, veorq_u64(row1h, row3h)));758vst1q_u64(state.h()+4, veorq_u64(h2, veorq_u64(row2l, row4l)));759vst1q_u64(state.h()+6, veorq_u64(h3, veorq_u64(row2h, row4h)));760}761#endif // CRYPTOPP_ARM_NEON_AVAILABLE762763#if (CRYPTOPP_POWER8_AVAILABLE)764765inline uint64x2_p VecLoad64(const void* p)766{767return (uint64x2_p)vec_xl(0, CONST_V32_CAST(p));768}769770inline uint64x2_p VecLoad64LE(const void* p, const uint8x16_p le_mask)771{772#if defined(CRYPTOPP_BIG_ENDIAN)773const uint32x4_p v = vec_xl(0, CONST_V32_CAST(p));774return (uint64x2_p)VecPermute(v, v, le_mask);775#else776CRYPTOPP_UNUSED(le_mask);777return (uint64x2_p)vec_xl(0, CONST_V32_CAST(p));778#endif779}780781inline void VecStore64(void* p, const uint64x2_p x)782{783vec_xst((uint32x4_p)x, 0, NCONST_V32_CAST(p));784}785786inline void VecStore64LE(void* p, const uint64x2_p x, const uint8x16_p le_mask)787{788#if defined(CRYPTOPP_BIG_ENDIAN)789const uint64x2_p v = VecPermute(x, x, le_mask);790vec_xst((uint32x4_p)v, 0, NCONST_V32_CAST(p));791#else792CRYPTOPP_UNUSED(le_mask);793vec_xst((uint32x4_p)x, 0, NCONST_V32_CAST(p));794#endif795}796797#if defined(CRYPTOPP_BIG_ENDIAN)798#define vec_shl_8(a,b) (uint64x2_p)vec_sld((uint8x16_p)a,(uint8x16_p)b,8)799#else800#define vec_shl_8(a,b) (uint64x2_p)vec_sld((uint8x16_p)b,(uint8x16_p)a,8)801#endif802803#define vec_merge_hi(a, b) vec_mergeh(a,b)804#define vec_merge_hi_lo(a, b) vec_mergeh(a,(uint64x2_p)vec_sld((uint8x16_p)b,(uint8x16_p)b,8))805#define vec_merge_lo(a, b) vec_mergel(a,b)806807void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)808{809#define BLAKE2B_LOAD_MSG_0_1(b0, b1) \810do { \811b0 = vec_merge_hi(m0, m1); \812b1 = vec_merge_hi(m2, m3); \813} while(0)814815#define BLAKE2B_LOAD_MSG_0_2(b0, b1) \816do { \817b0 = vec_merge_lo(m0, m1); \818b1 = vec_merge_lo(m2, m3); \819} while(0)820821#define BLAKE2B_LOAD_MSG_0_3(b0, b1) \822do { \823b0 = vec_merge_hi(m4, m5); \824b1 = vec_merge_hi(m6, m7); \825} while(0)826827#define BLAKE2B_LOAD_MSG_0_4(b0, b1) \828do { \829b0 = vec_merge_lo(m4, m5); \830b1 = vec_merge_lo(m6, m7); \831} while(0)832833#define BLAKE2B_LOAD_MSG_1_1(b0, b1) \834do { \835b0 = vec_merge_hi(m7, m2); \836b1 = vec_merge_lo(m4, m6); \837} while(0)838839#define BLAKE2B_LOAD_MSG_1_2(b0, b1) \840do { \841b0 = vec_merge_hi(m5, m4); \842b1 = vec_shl_8(m7, m3); \843} while(0)844845#define BLAKE2B_LOAD_MSG_1_3(b0, b1) \846do { \847b0 = vec_shl_8(m0, m0); \848b1 = vec_merge_lo(m5, m2); \849} while(0)850851#define BLAKE2B_LOAD_MSG_1_4(b0, b1) \852do { \853b0 = vec_merge_hi(m6, m1); \854b1 = vec_merge_lo(m3, m1); \855} while(0)856857#define BLAKE2B_LOAD_MSG_2_1(b0, b1) \858do { \859b0 = vec_shl_8(m5, m6); \860b1 = vec_merge_lo(m2, m7); \861} while(0)862863#define BLAKE2B_LOAD_MSG_2_2(b0, b1) \864do { \865b0 = vec_merge_hi(m4, m0); \866b1 = vec_merge_hi_lo(m1, m6); \867} while(0)868869#define BLAKE2B_LOAD_MSG_2_3(b0, b1) \870do { \871b0 = vec_merge_hi_lo(m5, m1); \872b1 = vec_merge_lo(m3, m4); \873} while(0)874875#define BLAKE2B_LOAD_MSG_2_4(b0, b1) \876do { \877b0 = vec_merge_hi(m7, m3); \878b1 = vec_shl_8(m0, m2); \879} while(0)880881#define BLAKE2B_LOAD_MSG_3_1(b0, b1) \882do { \883b0 = vec_merge_lo(m3, m1); \884b1 = vec_merge_lo(m6, m5); \885} while(0)886887#define BLAKE2B_LOAD_MSG_3_2(b0, b1) \888do { \889b0 = vec_merge_lo(m4, m0); \890b1 = vec_merge_hi(m6, m7); \891} while(0)892893#define BLAKE2B_LOAD_MSG_3_3(b0, b1) \894do { \895b0 = vec_merge_hi_lo(m1, m2); \896b1 = vec_merge_hi_lo(m2, m7); \897} while(0)898899#define BLAKE2B_LOAD_MSG_3_4(b0, b1) \900do { \901b0 = vec_merge_hi(m3, m5); \902b1 = vec_merge_hi(m0, m4); \903} while(0)904905#define BLAKE2B_LOAD_MSG_4_1(b0, b1) \906do { \907b0 = vec_merge_lo(m4, m2); \908b1 = vec_merge_hi(m1, m5); \909} while(0)910911#define BLAKE2B_LOAD_MSG_4_2(b0, b1) \912do { \913b0 = vec_merge_hi_lo(m0, m3); \914b1 = vec_merge_hi_lo(m2, m7); \915} while(0)916917#define BLAKE2B_LOAD_MSG_4_3(b0, b1) \918do { \919b0 = vec_merge_hi_lo(m7, m5); \920b1 = vec_merge_hi_lo(m3, m1); \921} while(0)922923#define BLAKE2B_LOAD_MSG_4_4(b0, b1) \924do { \925b0 = vec_shl_8(m0, m6); \926b1 = vec_merge_hi_lo(m4, m6); \927} while(0)928929#define BLAKE2B_LOAD_MSG_5_1(b0, b1) \930do { \931b0 = vec_merge_hi(m1, m3); \932b1 = vec_merge_hi(m0, m4); \933} while(0)934935#define BLAKE2B_LOAD_MSG_5_2(b0, b1) \936do { \937b0 = vec_merge_hi(m6, m5); \938b1 = vec_merge_lo(m5, m1); \939} while(0)940941#define BLAKE2B_LOAD_MSG_5_3(b0, b1) \942do { \943b0 = vec_merge_hi_lo(m2, m3); \944b1 = vec_merge_lo(m7, m0); \945} while(0)946947#define BLAKE2B_LOAD_MSG_5_4(b0, b1) \948do { \949b0 = vec_merge_lo(m6, m2); \950b1 = vec_merge_hi_lo(m7, m4); \951} while(0)952953#define BLAKE2B_LOAD_MSG_6_1(b0, b1) \954do { \955b0 = vec_merge_hi_lo(m6, m0); \956b1 = vec_merge_hi(m7, m2); \957} while(0)958959#define BLAKE2B_LOAD_MSG_6_2(b0, b1) \960do { \961b0 = vec_merge_lo(m2, m7); \962b1 = vec_shl_8(m6, m5); \963} while(0)964965#define BLAKE2B_LOAD_MSG_6_3(b0, b1) \966do { \967b0 = vec_merge_hi(m0, m3); \968b1 = vec_shl_8(m4, m4); \969} while(0)970971#define BLAKE2B_LOAD_MSG_6_4(b0, b1) \972do { \973b0 = vec_merge_lo(m3, m1); \974b1 = vec_merge_hi_lo(m1, m5); \975} while(0)976977#define BLAKE2B_LOAD_MSG_7_1(b0, b1) \978do { \979b0 = vec_merge_lo(m6, m3); \980b1 = vec_merge_hi_lo(m6, m1); \981} while(0)982983#define BLAKE2B_LOAD_MSG_7_2(b0, b1) \984do { \985b0 = vec_shl_8(m5, m7); \986b1 = vec_merge_lo(m0, m4); \987} while(0)988989#define BLAKE2B_LOAD_MSG_7_3(b0, b1) \990do { \991b0 = vec_merge_lo(m2, m7); \992b1 = vec_merge_hi(m4, m1); \993} while(0)994995#define BLAKE2B_LOAD_MSG_7_4(b0, b1) \996do { \997b0 = vec_merge_hi(m0, m2); \998b1 = vec_merge_hi(m3, m5); \999} while(0)10001001#define BLAKE2B_LOAD_MSG_8_1(b0, b1) \1002do { \1003b0 = vec_merge_hi(m3, m7); \1004b1 = vec_shl_8(m5, m0); \1005} while(0)10061007#define BLAKE2B_LOAD_MSG_8_2(b0, b1) \1008do { \1009b0 = vec_merge_lo(m7, m4); \1010b1 = vec_shl_8(m1, m4); \1011} while(0)10121013#define BLAKE2B_LOAD_MSG_8_3(b0, b1) \1014do { \1015b0 = m6; \1016b1 = vec_shl_8(m0, m5); \1017} while(0)10181019#define BLAKE2B_LOAD_MSG_8_4(b0, b1) \1020do { \1021b0 = vec_merge_hi_lo(m1, m3); \1022b1 = m2; \1023} while(0)10241025#define BLAKE2B_LOAD_MSG_9_1(b0, b1) \1026do { \1027b0 = vec_merge_hi(m5, m4); \1028b1 = vec_merge_lo(m3, m0); \1029} while(0)10301031#define BLAKE2B_LOAD_MSG_9_2(b0, b1) \1032do { \1033b0 = vec_merge_hi(m1, m2); \1034b1 = vec_merge_hi_lo(m3, m2); \1035} while(0)10361037#define BLAKE2B_LOAD_MSG_9_3(b0, b1) \1038do { \1039b0 = vec_merge_lo(m7, m4); \1040b1 = vec_merge_lo(m1, m6); \1041} while(0)10421043#define BLAKE2B_LOAD_MSG_9_4(b0, b1) \1044do { \1045b0 = vec_shl_8(m5, m7); \1046b1 = vec_merge_hi(m6, m0); \1047} while(0)10481049#define BLAKE2B_LOAD_MSG_10_1(b0, b1) \1050do { \1051b0 = vec_merge_hi(m0, m1); \1052b1 = vec_merge_hi(m2, m3); \1053} while(0)10541055#define BLAKE2B_LOAD_MSG_10_2(b0, b1) \1056do { \1057b0 = vec_merge_lo(m0, m1); \1058b1 = vec_merge_lo(m2, m3); \1059} while(0)10601061#define BLAKE2B_LOAD_MSG_10_3(b0, b1) \1062do { \1063b0 = vec_merge_hi(m4, m5); \1064b1 = vec_merge_hi(m6, m7); \1065} while(0)10661067#define BLAKE2B_LOAD_MSG_10_4(b0, b1) \1068do { \1069b0 = vec_merge_lo(m4, m5); \1070b1 = vec_merge_lo(m6, m7); \1071} while(0)10721073#define BLAKE2B_LOAD_MSG_11_1(b0, b1) \1074do { \1075b0 = vec_merge_hi(m7, m2); \1076b1 = vec_merge_lo(m4, m6); \1077} while(0)10781079#define BLAKE2B_LOAD_MSG_11_2(b0, b1) \1080do { \1081b0 = vec_merge_hi(m5, m4); \1082b1 = vec_shl_8(m7, m3); \1083} while(0)10841085#define BLAKE2B_LOAD_MSG_11_3(b0, b1) \1086do { \1087b0 = vec_shl_8(m0, m0); \1088b1 = vec_merge_lo(m5, m2); \1089} while(0)10901091#define BLAKE2B_LOAD_MSG_11_4(b0, b1) \1092do { \1093b0 = vec_merge_hi(m6, m1); \1094b1 = vec_merge_lo(m3, m1); \1095} while(0)10961097// Power8 has packed 64-bit rotate, but in terms of left rotate1098const uint64x2_p ROR16_MASK = { 64-16, 64-16 };1099const uint64x2_p ROR24_MASK = { 64-24, 64-24 };1100const uint64x2_p ROR32_MASK = { 64-32, 64-32 };1101const uint64x2_p ROR63_MASK = { 64-63, 64-63 };11021103#define vec_ror_32(x) vec_rl(x, ROR32_MASK)1104#define vec_ror_24(x) vec_rl(x, ROR24_MASK)1105#define vec_ror_16(x) vec_rl(x, ROR16_MASK)1106#define vec_ror_63(x) vec_rl(x, ROR63_MASK)11071108#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \1109do { \1110row1l = VecAdd(VecAdd(row1l, b0), row2l); \1111row1h = VecAdd(VecAdd(row1h, b1), row2h); \1112row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \1113row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \1114row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \1115row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \1116row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \1117} while(0)11181119#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \1120do { \1121row1l = VecAdd(VecAdd(row1l, b0), row2l); \1122row1h = VecAdd(VecAdd(row1h, b1), row2h); \1123row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \1124row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \1125row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \1126row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \1127row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \1128} while(0)11291130#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \1131do { \1132uint64x2_p t0 = vec_shl_8(row2l, row2h); \1133uint64x2_p t1 = vec_shl_8(row2h, row2l); \1134row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \1135t0 = vec_shl_8(row4h, row4l); t1 = vec_shl_8(row4l, row4h); \1136row4l = t0; row4h = t1; \1137} while(0)11381139#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \1140do { \1141uint64x2_p t0 = vec_shl_8(row2h, row2l); \1142uint64x2_p t1 = vec_shl_8(row2l, row2h); \1143row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \1144t0 = vec_shl_8(row4l, row4h); t1 = vec_shl_8(row4h, row4l); \1145row4l = t0; row4h = t1; \1146} while(0)11471148#define BLAKE2B_ROUND(r) \1149do { \1150uint64x2_p b0, b1; \1151BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \1152BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \1153BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \1154BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \1155BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \1156BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \1157BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \1158BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \1159BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \1160BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \1161} while(0)11621163// Possibly unaligned user messages1164uint64x2_p m0, m1, m2, m3, m4, m5, m6, m7;1165// Endian conversion mask1166const uint8x16_p le_mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};11671168#if defined(_ARCH_PWR9)1169// POWER9 provides loads for char's and short's1170m0 = (uint64x2_p) vec_xl( 0, CONST_V8_CAST( input ));1171m1 = (uint64x2_p) vec_xl( 16, CONST_V8_CAST( input ));1172m2 = (uint64x2_p) vec_xl( 32, CONST_V8_CAST( input ));1173m3 = (uint64x2_p) vec_xl( 48, CONST_V8_CAST( input ));1174m4 = (uint64x2_p) vec_xl( 64, CONST_V8_CAST( input ));1175m5 = (uint64x2_p) vec_xl( 80, CONST_V8_CAST( input ));1176m6 = (uint64x2_p) vec_xl( 96, CONST_V8_CAST( input ));1177m7 = (uint64x2_p) vec_xl(112, CONST_V8_CAST( input ));11781179# if defined(CRYPTOPP_BIG_ENDIAN)1180m0 = vec_perm(m0, m0, le_mask);1181m1 = vec_perm(m1, m1, le_mask);1182m2 = vec_perm(m2, m2, le_mask);1183m3 = vec_perm(m3, m3, le_mask);1184m4 = vec_perm(m4, m4, le_mask);1185m5 = vec_perm(m5, m5, le_mask);1186m6 = vec_perm(m6, m6, le_mask);1187m7 = vec_perm(m7, m7, le_mask);1188# endif1189#else1190// Altivec only provides 16-byte aligned loads1191// http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf1192m0 = (uint64x2_p) vec_ld( 0, CONST_V8_CAST( input ));1193m1 = (uint64x2_p) vec_ld( 16, CONST_V8_CAST( input ));1194m2 = (uint64x2_p) vec_ld( 32, CONST_V8_CAST( input ));1195m3 = (uint64x2_p) vec_ld( 48, CONST_V8_CAST( input ));1196m4 = (uint64x2_p) vec_ld( 64, CONST_V8_CAST( input ));1197m5 = (uint64x2_p) vec_ld( 80, CONST_V8_CAST( input ));1198m6 = (uint64x2_p) vec_ld( 96, CONST_V8_CAST( input ));1199m7 = (uint64x2_p) vec_ld(112, CONST_V8_CAST( input ));12001201// Alignment check for load of the message buffer1202const uintptr_t addr = (uintptr_t)input;1203if (addr%16 == 0)1204{1205// Already aligned. Perform a little-endian swap as required1206# if defined(CRYPTOPP_BIG_ENDIAN)1207m0 = vec_perm(m0, m0, le_mask);1208m1 = vec_perm(m1, m1, le_mask);1209m2 = vec_perm(m2, m2, le_mask);1210m3 = vec_perm(m3, m3, le_mask);1211m4 = vec_perm(m4, m4, le_mask);1212m5 = vec_perm(m5, m5, le_mask);1213m6 = vec_perm(m6, m6, le_mask);1214m7 = vec_perm(m7, m7, le_mask);1215# endif1216}1217else1218{1219// Not aligned. Fix vectors and perform a little-endian swap as required1220// http://mirror.informatimago.com/next/developer.apple.com/1221// hardwaredrivers/ve/code_optimization.html1222uint64x2_p ex; uint8x16_p perm;1223ex = (uint64x2_p) vec_ld(112+15, CONST_V8_CAST( input ));1224perm = vec_lvsl(0, CONST_V8_CAST( addr ));12251226# if defined(CRYPTOPP_BIG_ENDIAN)1227// Combine the vector permute with the little-endian swap1228perm = vec_perm(perm, perm, le_mask);1229# endif12301231m0 = vec_perm(m0, m1, perm);1232m1 = vec_perm(m1, m2, perm);1233m2 = vec_perm(m2, m3, perm);1234m3 = vec_perm(m3, m4, perm);1235m4 = vec_perm(m4, m5, perm);1236m5 = vec_perm(m5, m6, perm);1237m6 = vec_perm(m6, m7, perm);1238m7 = vec_perm(m7, ex, perm);1239}1240#endif12411242uint64x2_p row1l, row1h, row2l, row2h;1243uint64x2_p row3l, row3h, row4l, row4h;12441245const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0, le_mask);1246const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2, le_mask);1247const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4, le_mask);1248const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6, le_mask);12491250row3l = VecLoad64(BLAKE2B_IV+0);1251row3h = VecLoad64(BLAKE2B_IV+2);1252row4l = VecXor(VecLoad64(BLAKE2B_IV+4), VecLoad64(state.t()+0));1253row4h = VecXor(VecLoad64(BLAKE2B_IV+6), VecLoad64(state.f()+0));12541255BLAKE2B_ROUND(0);1256BLAKE2B_ROUND(1);1257BLAKE2B_ROUND(2);1258BLAKE2B_ROUND(3);1259BLAKE2B_ROUND(4);1260BLAKE2B_ROUND(5);1261BLAKE2B_ROUND(6);1262BLAKE2B_ROUND(7);1263BLAKE2B_ROUND(8);1264BLAKE2B_ROUND(9);1265BLAKE2B_ROUND(10);1266BLAKE2B_ROUND(11);12671268VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)), le_mask);1269VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)), le_mask);1270VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)), le_mask);1271VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)), le_mask);1272}1273#endif // CRYPTOPP_POWER8_AVAILABLE12741275NAMESPACE_END127612771278