########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define	MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ	\p2, \p1
	pshufb	\p3, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER  = %xmm9

SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg

SRND = %rsi			# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END   = 0
_INP       = _INP_END  + _INP_END_SIZE
_XFER      = _INP      + _INP_SIZE
_XMM_SAVE  = _XFER     + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
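
################################

# Reference sketch only, not assembled: the scalar instructions that are
# interleaved into FOUR_ROUNDS_AND_SCHED and DO_ROUND below implement one
# standard SHA-256 round.  In C-style pseudocode (ror32() here denotes a
# 32-bit rotate right, and k_plus_w stands for the precomputed K[t]+W[t]
# value read from the _XFER slot on the stack; both names are only
# illustrative):
#
#	S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	ch  = (e & f) ^ (~e & g);		/* coded as ((f^g)&e)^g     */
#	S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	maj = (a & b) ^ (a & c) ^ (b & c);	/* coded as ((a|c)&b)|(a&c) */
#	t1  = h + S1 + ch + k_plus_w;
#	d  += t1;
#	h   = t1 + S0 + maj;
#
# ROTATE_ARGS then renames a..h so the next round can reuse identical code.
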
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa	X3, XTMP0
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	palignr	$4, X2, XTMP0		# XTMP0 = W[-7]
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa	X1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	paddd	X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr	$4, X0, XTMP1		# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa	XTMP1, XTMP2		# XTMP2 = W[-15]
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	movdqa	XTMP1, XTMP3		# XTMP3 = W[-15]
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pslld	$(32-7), XTMP1		# XTMP1 = W[-15] << (32-7)
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	psrld	$7, XTMP2		# XTMP2 = W[-15] >> 7
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	por	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	#
	ROTATE_ARGS
	movdqa	XTMP3, XTMP2		# XTMP2 = W[-15]
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	movdqa	XTMP3, XTMP4		# XTMP4 = W[-15]
	ror	$(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(22-13), y1		# y1 = a >> (22-13)
	pslld	$(32-18), XTMP3		# XTMP3 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	psrld	$18, XTMP2		# XTMP2 = W[-15] >> 18
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	psrld	$3, XTMP4		# XTMP4 = W[-15] >> 3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pxor	XTMP4, XTMP1		# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	pshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	paddd	XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	ror	$(25-11), y0		# y0 = e >> (25-11)
	movdqa	XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	psrld	$10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	#
	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	movdqa	XTMP2, X0		# X0 = W[-2] {DDCC}
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld	$10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	pxor	XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, X0		# X0 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_DC00, X0		# X0 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm
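
################################

# Reference sketch only, not assembled: the vector instructions interleaved
# into FOUR_ROUNDS_AND_SCHED above extend the message schedule by four
# words per invocation.  Per word, in C-style pseudocode (ror32() denotes a
# 32-bit rotate right):
#
#	s0   = ror32(W[t-15], 7) ^ ror32(W[t-15], 18) ^ (W[t-15] >> 3);
#	s1   = ror32(W[t-2], 17) ^ ror32(W[t-2], 19) ^ (W[t-2] >> 10);
#	W[t] = W[t-16] + s0 + W[t-7] + s1;
#
# s0 is computed for all four new words at once; s1 is computed two words
# at a time (the low pair, then the high pair), because W[t-2] for the high
# pair only exists once the low pair has been produced.
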
## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
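
################################

# Outline of the per-block flow in sha256_transform_ssse3 below, mirroring
# the labels used in the code (descriptive comment only):
#
#   .Lloop0:  for each 64-byte block
#	load the 16 message dwords into X0..X3 and byte-swap them
#	.Lloop1:  3 iterations x 4 FOUR_ROUNDS_AND_SCHED
#		  = rounds 0..47, scheduling W[16..63] on the fly
#	.Lloop2:  2 iterations x 8 DO_ROUND
#		  = rounds 48..63, consuming the already-scheduled words
#	add the working variables a..h back into the digest with addm
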
########################################################################
## void sha256_transform_ssse3(struct sha256_block_state *state,
##				const u8 *data, size_t nblocks);
########################################################################
.text
SYM_FUNC_START(sha256_transform_ssse3)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$~15, %rsp

	shl	$6, NUM_BLKS		# convert to bytes
	add	INP, NUM_BLKS
	mov	NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	movdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa	_SHUF_00BA(%rip), SHUF_00BA
	movdqa	_SHUF_DC00(%rip), SHUF_DC00

.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	movdqa	(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	1*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	2*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	3*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	mov	$2, SRND
.Lloop2:
	paddd	(TBL), X0
	movdqa	X0, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3
	paddd	1*16(TBL), X1
	movdqa	X1, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	movdqa	X2, X0
	movdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)
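
################################

# Notes on the constants below (descriptive comment only): K256 holds the
# 64 SHA-256 round constants.  PSHUFFLE_BYTE_FLIP_MASK is a pshufb control
# that reverses the bytes within each 32-bit lane, converting the
# big-endian message words to host order.  _SHUF_00BA and _SHUF_DC00 are
# pshufb controls that pack the two valid dwords of an s1 result (held in
# the even lanes, xBxA or xDxC) into the low or high half of the register
# and zero the remaining lanes.
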
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF