/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <[email protected]>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

/*
 * Loop-condition hooks pasted into R_CALC_W through its "k" argument
 * (CMP_K##k##_LOOP). "N" expands to nothing; "C" emits a compare of rT1
 * against zero whose result is consumed by the "bt gt" that follows the
 * last R_CALC_W invocation of the 16-round loop.
 */
#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;

/*
 * Prologue: open a 128 byte stack frame and save every non volatile
 * register this file uses. r14-r23 are stored as full 64 bit SPE
 * registers (evstdw), r24/r25 as plain 32 bit words.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);					   \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);


/*
 * Epilogue: restore the registers saved by INITIALIZE, overwrite one
 * word of each 64 bit save slot with zero and release the stack frame.
 * r0 (rT2) is clobbered as the zero source.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1);					   \
	xor		r0,r0,r0;					   \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* was already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/

/*
 * LOAD_DATA fetches one 32 bit message word into "reg".
 * Big endian: plain load at "off" from rWP; rWP is advanced once per
 * 64 byte block by NEXT_BLOCK.
 * Otherwise: byte-reversed load at rWP ("off" is unused) followed by a
 * per-word increment of rWP, so NEXT_BLOCK has nothing left to do.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

/*
 * R_LOAD_W: two SHA-256 rounds that take their message words straight
 * from the input buffer.
 *
 *   a..h - registers holding the working variables for round 1
 *   w    - 64 bit register receiving both message words; the first word
 *          is moved to the upper half (evmergelo) before the second one
 *          is loaded into the lower half
 *   off  - byte offset of the first word/constant; the second round
 *          uses off+4
 *
 * Round 1 writes its results to d and h, round 2 to c and g. The
 * comments of round 2 name the SHA-256 roles (a, e, ...), not the macro
 * parameters: after round 1 those roles are held by h, d, ...
 * The expansion deliberately ends without a ';' - each invocation sits
 * on its own source line.
 */
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * R_CALC_W: two SHA-256 rounds whose message words are derived from the
 * previous sixteen. The 64 bit SPE instructions (ev*) extend two
 * schedule words at once and are interleaved with the scalar
 * instructions of round 1; round 2 follows in scalar code only.
 *
 *   a..h   - registers holding the working variables for round 1
 *   w0     - in: w[-16]/w[-15], out: the two new schedule words
 *   w1     - supplies w[-14] for the w[-15]/w[-14] pair (evmergelohi)
 *   w4, w5 - supply the w[-7]/w[-6] pair (evmergelohi)
 *   w7     - w[-2]/w[-1]
 *   k      - N or C, selects CMP_KN_LOOP / CMP_KC_LOOP
 *   off    - byte offset of the 64 bit constant pair at rKP
 *
 * In round 1 maj is accumulated as (a and b) + ((a xor b) and c); the
 * two terms never share a set bit, so the addition acts as an or.
 * Round 1 writes its results to d and h, round 2 to c and g; as in
 * R_LOAD_W the round 2 comments name SHA-256 roles, not parameters.
 * The expansion deliberately ends without a ';'.
 */
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP							   \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * ppc_spe_sha256_transform
 *
 * Register arguments as used by the code below:
 *   r3 (rHP) - pointer to the eight 32 bit hash state words; loaded at
 *              entry and rewritten in memory after every block
 *   r4 (rWP) - pointer to the input data, consumed 64 bytes per block
 *   r5       - number of 64 byte blocks; moved to CTR before r5 is
 *              reused as rH0
 *
 * The block loop is bottom-tested (bdnz) and no check for a zero count
 * is made here, so the caller is expected to pass at least one block.
 */
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5		/* block count, r5 becomes rH0	*/
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	/* every block starts over at the first round constant */
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	/* rounds 0-15: message words come directly from the input */
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
	/*
	 * Remaining rounds, 16 per pass. Only the last R_CALC_W ("C")
	 * emits the compare that feeds the "bt gt" below.
	 * NOTE(review): the exit condition appears to rely on the sign of
	 * the last constant of each group of 16 in PPC_SPE_SHA256_K
	 * (0x14292967 and 0x106aa070 positive, 0xc67178f2 negative) -
	 * confirm against the evldw word layout in the SPE manual.
	 */
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

	/*
	 * Add the previous hash state (reloaded into the now free rW
	 * registers) to the working variables and store the new state.
	 */
	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main	/* next block while CTR != 0 */

	FINALIZE
	blr

/* The 64 SHA-256 round constants, 32 byte aligned (.align 5). */
.data
.align 5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2