########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm


.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1
	vpshufb	\p3, \p1, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg

SRND = %rsi			# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
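# For reference (a reading aid only, not used by the code), the symbolic
# layout above resolves to the following offsets from the aligned %rsp:
#
#	_INP_END   =  0		# 8 bytes: pointer to end of input data
#	_INP       =  8		# 8 bytes: saved input pointer
#	_XFER      = 16		# 16 bytes: K[t] + W[t] for four rounds
#	STACK_SIZE = 32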
# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
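# For orientation, FOUR_ROUNDS_AND_SCHED below interleaves four scalar
# rounds with a vectorized step of the standard SHA-256 message
# schedule.  A plain C sketch of that schedule step (illustrative only;
# ror32() is the usual 32-bit rotate-right helper):
#
#	s0   = ror32(W[i-15],  7) ^ ror32(W[i-15], 18) ^ (W[i-15] >>  3);
#	s1   = ror32(W[i- 2], 17) ^ ror32(W[i- 2], 19) ^ (W[i- 2] >> 10);
#	W[i] = W[i-16] + s0 + W[i-7] + s1;
#
# The XMM code computes four consecutive W[i] per macro invocation,
# with s0 done four lanes at a time and s1 two lanes at a time.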
.macro FOUR_ROUNDS_AND_SCHED
## compute s0 four at a time and s1 two at a time
## compute W[-16] + W[-7] 4 at a time

	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	#
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	#
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	#
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
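# DO_ROUND below is one scalar SHA-256 round; the same computation is
# interleaved with the message schedule in FOUR_ROUNDS_AND_SCHED above.
# A plain C reference (illustrative only; ror32() is the usual 32-bit
# rotate-right helper, and "k + w" has already been summed into the
# _XFER slot on the stack):
#
#	S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	ch  = (e & f) ^ (~e & g);	/* computed as ((f ^ g) & e) ^ g */
#	S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	maj = (a & b) ^ (a & c) ^ (b & c);	/* computed as ((a | c) & b) | (a & c) */
#	t1  = h + S1 + ch + k + w;
#	d  += t1;
#	h   = t1 + S0 + maj;
#
# followed by ROTATE_ARGS, which renames a..h instead of moving values.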
## input is [rsp + _XFER + %1 * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER	#
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_block_state *state,
##			     const u8 *data, size_t nblocks);
########################################################################
.text
SYM_FUNC_START(sha256_transform_avx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
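# The mask above makes vpshufb reverse the bytes within each 32-bit
# dword, converting the big-endian message words to host order.  The
# _SHUF_00BA and _SHUF_DC00 masks below move the two valid s1 lanes
# into the low or high half of the register and zero the other lanes.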
"aM", @progbits, 16483.align 16484# shuffle xBxA -> 00BA485_SHUF_00BA:486.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100487488.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16489.align 16490# shuffle xDxC -> DC00491_SHUF_DC00:492.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF493494495