########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx			# 3rd arg
INP	= %rsi			# 2nd arg
CTX	= %rdi			# 1st arg
c	= %ecx
d	= %r8d
e	= %edx			# clobbers NUM_BLKS
y3	= %esi			# clobbers INP

SRND	= CTX			# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE
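
# Stack frame layout, relative to the 32-byte-aligned %rsp (derived from the
# sizes above):
#
#	_XFER    [  0..511]	W[t] + K[t] for 2 blocks x 64 rounds, stored
#				32 bytes per group of 4 rounds; block 1 lives
#				in the low 16 bytes and block 2 in the high
#				16 bytes of each entry
#	_INP_END [512..519]	pointer to the last input block
#	_INP     [520..527]	current input pointer
#	_CTX     [528..535]	state pointer, saved while CTX is reused as
#				the round-offset counter SRND
#
# _XMM_SAVE is currently empty (_XMM_SAVE_SIZE = 0).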

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
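
# Both macros rename assembler symbols instead of moving data, so every round
# body can be written in terms of a..h (and X0..X3) with no register shuffling
# at run time.
#
# FOUR_ROUNDS_AND_SCHED below interleaves four SHA-256 rounds with the message
# schedule for four future W values.  For reference, the scalar round is
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = ((f ^ g) & e) ^ g
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = ((a | c) & b) | (a & c)
#	t1  = h + S1 + CH + K[t] + W[t]
#	d  += t1
#	h   = t1 + S0 + MAJ
#
# and the schedule update, carried out on 2x4 dwords in the ymm registers, is
#
#	W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
#	s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
#	s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# K[t] + W[t] is pre-added into the _XFER area, which is why the rounds below
# only add \disp(%rsp, SRND) to h.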

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f # CH
	rorx	$13, a, T1	# T1 = a >> 13 # S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	xor	g, y2		# y2 = f^g # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1

	and	e, y2		# y2 = (f^g)&e # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	add	h, d		# d = k + w + h + d # --

	and	b, y3		# y3 = (a|c)&b # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB

	add	y0, y2		# y2 = S1 + CH # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f # CH
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	xor	g, y2		# y2 = f^g # CH


	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	and	e, y2		# y2 = (f^g)&e # CH
	add	h, d		# d = k + w + h + d # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h # --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	or	c, y3		# y3 = a|c # MAJA
	mov	f, y2		# y2 = f # CH
	xor	g, y2		# y2 = f^g # CH

	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e # CH

	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d # --
	and	b, y3		# y3 = (a|c)&b # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a # MAJA
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f # CH
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	xor	g, y2		# y2 = f^g # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	h, d		# d = k + w + h + d # --
	and	b, y3		# y3 = (a|c)&b # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	add	y0, y2		# y2 = S1 + CH # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --

	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	c, T1		# T1 = a&c # MAJB
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ

	add	y1, h		# h = k + w + h + S0 # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ # --

	ROTATE_ARGS
	rotate_Xs
.endm
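
# DO_4ROUNDS performs four SHA-256 rounds from W[t] + K[t] values that are
# already in the _XFER area, with no message scheduling.  It is used for the
# last 16 rounds of a block (where no further W values are needed) and to
# replay all 64 rounds for the second block of a pair (.Lloop3).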

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --


	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	y3, old_h	# h = t1 + S0 + MAJ # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --


	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	y3, old_h	# h = t1 + S0 + MAJ # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --


	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f # CH
	rorx	$25, e, y0	# y0 = e >> 25 # S1A
	rorx	$11, e, y1	# y1 = e >> 11 # S1B
	xor	g, y2		# y2 = f^g # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) # S1
	rorx	$6, e, y1	# y1 = (e >> 6) # S1
	and	e, y2		# y2 = (f^g)&e # CH
	add	y3, old_h	# h = t1 + S0 + MAJ # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx	$13, a, T1	# T1 = a >> 13 # S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g # CH
	rorx	$22, a, y1	# y1 = a >> 22 # S0A
	mov	a, y3		# y3 = a # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) # S0
	rorx	$2, a, T1	# T1 = (a >> 2) # S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h # --
	or	c, y3		# y3 = a|c # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov	a, T1		# T1 = a # MAJB
	and	b, y3		# y3 = (a|c)&b # MAJA
	and	c, T1		# T1 = a&c # MAJB
	add	y0, y2		# y2 = S1 + CH # --


	add	h, d		# d = k + w + h + d # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c) # MAJ
	add	y1, h		# h = k + w + h + S0 # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1 # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ # --

	ROTATE_ARGS

.endm
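
# sha256_transform_rorx hashes two blocks at a time whenever possible: while
# the first block's rounds run, the message schedule for both blocks is
# computed in the two 128-bit lanes of the ymm registers and the W + K values
# for both lanes are saved in the _XFER area; the second block is then hashed
# from those saved values alone (.Lloop3).  A trailing odd block is loaded
# into the low (xmm) lanes only, via .Ldo_last_block.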

########################################################################
## void sha256_transform_rorx(struct sha256_block_state *state,
##				const u8 *data, size_t nblocks);
########################################################################
.text
SYM_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 12 each
	xor	SRND, SRND

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 1*32)
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h
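
	## The first block's digest update is complete.  If INP is now past
	## the last-block pointer, only one block was loaded; otherwise the
	## second block's W + K values are still in the high 16 bytes of each
	## 32-byte _XFER entry and its rounds can be replayed below without
	## recomputing the message schedule.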

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	(_XFER + 0*32 + 16)
	DO_4ROUNDS	(_XFER + 1*32 + 16)
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash

.Ldo_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)
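
# Each group of four round constants is stored twice in K256, so one 32-byte
# load (K256 + n*32 + SRND) supplies the same four K values to both 128-bit
# lanes: the low lane pairs with the first block's W values and the high lane
# with the second block's.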

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF