Path: blob/main/contrib/bearssl/src/hash/ghash_pwr8.c
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */

#if BR_POWER8

/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0     0
#define HB1     1
#define HB2     2
#define HB6     3
#define HB7     4
#define TT0     5
#define TT1     6
#define TT2     7

#define BSW     8
#define XBSW   40

/*
 * Macro to initialise the constants.
 */
#define INIT \
		vxor(HB0, HB0, HB0) \
		vspltisb(HB1, 1) \
		vspltisb(HB2, 2) \
		vspltisb(HB6, 6) \
		vspltisb(HB7, 7) \
		INIT_BSW

/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif

/*
 * Left-shift x0:x1 by one bit. This is a corrective action needed
 * because GHASH is defined in full little-endian specification, while
 * the opcodes use full big-endian convention, so the 255-bit product
 * ends up one bit to the right.
 */
#define SL_256(x0, x1) \
		vsldoi(TT0, HB0, x1, 1) \
		vsl(x0, x0, HB1) \
		vsr(TT0, TT0, HB7) \
		vsl(x1, x1, HB1) \
		vxor(x0, x0, TT0)
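/*
 * For illustration only (not part of the original file): a portable C
 * sketch of the operation SL_256 performs, assuming the 256-bit value
 * x0:x1 is viewed as four 64-bit limbs with w[0] most significant.
 * The bit shifted out of each limb becomes the bottom bit of the next
 * more-significant limb, which is the carry that the vsldoi/vsr pair
 * above reconstructs. The helper name is hypothetical; the block is
 * disabled from the build.
 */
#if 0
static void
sl_256_sketch(uint64_t w[4])
{
	int i;

	for (i = 0; i < 3; i ++) {
		/* Shift this limb and pull in the top bit of the next one. */
		w[i] = (w[i] << 1) | (w[i + 1] >> 63);
	}
	w[3] <<= 1;
}
#endif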
/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same
 * as x0 or x1, or a different register). x0 and x1 are modified.
 */
#define REDUCE_F128(xd, x0, x1) \
		vxor(x0, x0, x1) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(x0, x0, TT1) \
		vsldoi(x1, x1, HB0, 15) \
		vsl(TT1, x1, HB6) \
		vsl(TT2, x1, HB1) \
		vxor(x1, TT1, TT2) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, x1) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(xd, x0, TT1)
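/*
 * For illustration only (not part of the original file): REDUCE_F128
 * folds a 256-bit carryless product back into GF(2^128) modulo the
 * GHASH polynomial x^128 + x^7 + x^2 + x + 1. The classic bitwise
 * multiply from NIST SP 800-38D below shows the same arithmetic in
 * GHASH's bit-reflected convention, where the reduction appears as a
 * conditional XOR with the byte 0xE1. The helper name is hypothetical;
 * the block is disabled from the build.
 */
#if 0
static void
gf128_mul_sketch(unsigned char r[16],
	const unsigned char x[16], const unsigned char y[16])
{
	unsigned char z[16], v[16];
	int i, j, carry;

	memset(z, 0, sizeof z);     /* Z = 0 */
	memcpy(v, x, sizeof v);     /* V = X */
	for (i = 0; i < 128; i ++) {
		/* If bit i of Y is set (bits numbered MSB-first), Z ^= V. */
		if ((y[i >> 3] >> (7 - (i & 7))) & 1) {
			for (j = 0; j < 16; j ++) {
				z[j] ^= v[j];
			}
		}
		/*
		 * V <- V*x: a one-bit right shift in the reflected
		 * convention; the dropped bit is folded back in with
		 * R = 0xE1 || 0^120.
		 */
		carry = v[15] & 1;
		for (j = 15; j > 0; j --) {
			v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
		}
		v[0] >>= 1;
		if (carry) {
			v[0] ^= 0xE1;
		}
	}
	memcpy(r, z, sizeof z);
}
#endif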
/* see bearssl_hash.h */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * The assembly code requires the data split into two chunks;
	 * the first chunk must contain a number of blocks which is a
	 * multiple of 4. Since the processing for the first chunk is
	 * faster, we want to make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 * -- if the remainder size is a multiple of 16, then use it
	 *    in place;
	 * -- otherwise, copy it to the tmp[] array and pad it with
	 *    zeros.
	 */
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) in v9.
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28.
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0
		 *   v18 = 0:h1_0
		 *   v19 = h1_1:0
		 */
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10.
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * We first split h2 into:
		 *   v10 = h2_0:h2_1
		 *   v11 = 0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by
		 * squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
		/*
		 * Read the next four blocks.
		 *   v20 = y + a0 = b0
		 *   v21 = a1 = b1
		 *   v22 = a2 = b2
		 *   v23 = a3 = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)

		/*
		 * Compute the products.
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13, 9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14, 9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum products into a single 256-bit result in v11:v12.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20,  9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 = 0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14, 9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise the product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

		: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
		: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
		  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
		, [idx2be] "b" (idx2be)
#endif
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
		  "ctr", "memory"
	);
}
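/*
 * Usage sketch (illustrative, not part of the original file): compute
 * the GHASH of a message under a 16-byte key h, starting from the
 * all-zero state. A trailing partial block is zero-padded internally,
 * via the tmp[] handling above. The helper name is hypothetical; the
 * block is disabled from the build.
 */
#if 0
static void
ghash_pwr8_example(unsigned char out[16],
	const unsigned char h[16], const void *data, size_t len)
{
	memset(out, 0, 16);     /* GHASH starts from y = 0 */
	br_ghash_pwr8(out, h, data, len);
	/* out now holds the updated GHASH state. */
}
#endif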
/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return &br_ghash_pwr8;
}

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return 0;
}

#endif
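/*
 * Caller-side sketch (illustrative, not part of the original file):
 * br_ghash_pwr8_get() returns 0 when this translation unit was built
 * without POWER8 support, so a caller selects a portable fallback such
 * as br_ghash_ctmul in that case. The helper name is hypothetical; the
 * block is disabled from the build.
 */
#if 0
static br_ghash
choose_ghash(void)
{
	br_ghash gh;

	gh = br_ghash_pwr8_get();
	if (gh == 0) {
		gh = &br_ghash_ctmul;   /* portable constant-time GHASH */
	}
	return gh;
}
#endif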