Path: blob/main/contrib/bearssl/src/hash/ghash_pclmul.c
/*
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
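
/*
 * To see the reversal property at a small size (a 4-bit sketch of the
 * 128-bit identity above): take x = 0011 and y = 0101. Their carryless
 * product is 0011 ^ (0011 << 2) = 0001111 (7 bits). Multiplying the
 * reversed operands rev_4(x) = 1100 and rev_4(y) = 1010 instead yields
 * (1100 << 1) ^ (1100 << 3) = 1111000, which is exactly rev_7(0001111).
 * The same identity at 128 bits is what lets us compute on byte-swapped
 * values and fix the 1-bit offset with a single left shift afterwards.
 */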

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)
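
/*
 * Note on the constants in the reduction below: GHASH works modulo
 * the polynomial x^128 + x^7 + x^2 + x + 1. In the bit-reversed
 * (big-endian) convention used here, folding a high 64-bit word into
 * the lower words XORs in that word shifted right by 0, 1, 2 and 7
 * bits, plus (for the spill-over into the next word) shifted left by
 * the complements 63, 62 and 57; the shift counts are the exponents
 * 0, 1, 2 and 7 of the low-degree terms of the modulus.
 */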

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);
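
		/*
		 * Main loop: absorb four blocks a0..a3 per iteration,
		 * i.e. compute:
		 *   y <- ((y ^ a0)*h^4) ^ (a1*h^3) ^ (a2*h^2) ^ (a3*h)
		 * with a single shift+reduction at the end. Each 128x128
		 * carryless product uses the Karatsuba trick: the "x"
		 * values (XOR of the two 64-bit halves) provide the
		 * middle term from a single extra 64x64 multiply.
		 */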
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif
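
/*
 * Usage sketch (illustrative, not part of this file): a caller obtains
 * the vectorized implementation through the _get() function, which
 * returns 0 when the CPU lacks pclmulqdq, and falls back to a portable
 * GHASH in that case, e.g.:
 *
 *   br_ghash gh;
 *
 *   gh = br_ghash_pclmul_get();
 *   if (gh == 0) {
 *           gh = &br_ghash_ctmul;
 *   }
 *   gh(y, h, data, len);
 */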