# Path: blob/main/sys/crypto/skein/amd64/skein_block_asm.S
# (web viewer residue: 39536 views)
#
#----------------------------------------------------------------
# 64-bit x86 assembler code (gnu as) for Skein block functions
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
# Syntax : GNU as, AT&T operand order (src,dst), .altmacro mode.
# Entry points take their four arguments in rdi, rsi, rdx, rcx
# (see Setup_Stack) and preserve rbp, rbx, r12-r15.
#
    .text
    .altmacro
#ifndef __clang__
    .psize 0,128                        #list file has no page boundaries
#endif
#
_MASK_ALL_  = (256+512+1024)            #all three algorithm bits
_MAX_FRAME_ = 240
#
#################
#ifndef SKEIN_USE_ASM
_USE_ASM_   = _MASK_ALL_
#else
_USE_ASM_   = SKEIN_USE_ASM
#endif
#################
#configure loop unrolling
#ifndef SKEIN_LOOP
_SKEIN_LOOP = 2                         #default is fully unrolled for 256/512, twice for 1024
#else
_SKEIN_LOOP = SKEIN_LOOP
  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
#.print "+++ SKEIN_LOOP = \_NN_"
  .endr
#endif
# the unroll counts (0 --> fully unrolled)
# _SKEIN_LOOP is read as three decimal digits: hundreds=256, tens=512, ones=1024
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
#
# SKEIN_ASM_UNROLL = bit mask (256/512/1024) of the block sizes that are fully unrolled
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
    .endif
  .endr
#################
#
.ifndef SKEIN_ROUNDS
ROUNDS_256  = 72
ROUNDS_512  = 72
ROUNDS_1024 = 80
.else
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
# only display rounds if default size is changed on command line
.irp _NN_,256,512,1024
  .if _USE_ASM_ & \_NN_
    .irp _RR_,%(ROUNDS_\_NN_)
      .if \_NN_ < 1024
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .else
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
#################
#
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef SKEIN_PERF                       #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.else
_SKEIN_CODE_SIZE = (0)
.endif
.endif
#
#################
#
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG = 0
.else
_SKEIN_DEBUG = 1
.endif
#################
#
# Field offsets in the hash context structure
# (comment reworded: it previously began with the word "define", which the
#  C preprocessor parses as a directive when this .S file is preprocessed)
#
HASH_BITS   =  0                        #bits of hash output
BCNT        =  8 + HASH_BITS            #number of bytes in BUFFER[]
TWEAK       =  8 + BCNT                 #tweak values[0..1]
X_VARS      = 16 + TWEAK                #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY   = 0x1BD11BDAA9FC1A22        #overall parity of key schedule words
FIRST_MASK  = ~ (1 <<  6)
FIRST_MASK64= ~ (1 << 62)
#
# rotation constants for Skein
# (name format: RC_<block bits>_<round mod 8>_<mix index>)
#
RC_256_0_0  = 14
RC_256_0_1  = 16

RC_256_1_0  = 52
RC_256_1_1  = 57

RC_256_2_0  = 23
RC_256_2_1  = 40

RC_256_3_0  =  5
RC_256_3_1  = 37

RC_256_4_0  = 25
RC_256_4_1  = 33

RC_256_5_0  = 46
RC_256_5_1  = 12

RC_256_6_0  = 58
RC_256_6_1  = 22

RC_256_7_0  = 32
RC_256_7_1  = 32

RC_512_0_0  = 46
RC_512_0_1  = 36
RC_512_0_2  = 19
RC_512_0_3  = 37

RC_512_1_0  = 33
RC_512_1_1  = 27
RC_512_1_2  = 14
RC_512_1_3  = 42

RC_512_2_0  = 17
RC_512_2_1  = 49
RC_512_2_2  = 36
RC_512_2_3  = 39

RC_512_3_0  = 44
RC_512_3_1  =  9
RC_512_3_2  = 54
RC_512_3_3  = 56

RC_512_4_0  = 39
RC_512_4_1  = 30
RC_512_4_2  = 34
RC_512_4_3  = 24

RC_512_5_0  = 13
RC_512_5_1  = 50
RC_512_5_2  = 10
RC_512_5_3  = 17

RC_512_6_0  = 25
RC_512_6_1  = 29
RC_512_6_2  = 39
RC_512_6_3  = 43

RC_512_7_0  =  8
RC_512_7_1  = 35
RC_512_7_2  = 56
RC_512_7_3  = 22

RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 =  8
RC_1024_0_3 = 47
RC_1024_0_4 =  8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 =  4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 =  5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 =  9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 =  4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 =  9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
#
# Input:  reg
# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
# Emits nothing when the selected rotation constant is zero.
#
.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM    #is there anything to do?
    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
  .endif
.endm
#
#----------------------------------------------------------------
#
# MACROS: define local vars and configure stack
#
#----------------------------------------------------------------
# declare allocated space on the stack
#   localName is set to the current running offset (_STK_OFFS_), which is
#   then advanced by localSize bytes.
.macro StackVar localName,localSize
\localName = _STK_OFFS_
_STK_OFFS_ = _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Configure stack frame, allocate local vars
#
#   BLK_BITS  block size in bits (256/512/1024); sets WCNT = BLK_BITS/64
#   KS_CNT    number of 16-byte ksRot slots reserved when BLK_BITS is not
#             fully unrolled
#   debugCnt  (optional) number of 8-byte xDebug_<BLK_BITS> slots, debug builds only
#
#   Emits: pushes of rbp,rbx,r12..r15; subq of LOCAL_SIZE from rsp;
#          rbp = rsp + FRAME_OFFS; stores rdi,rsi,rdx,rcx into
#          ctxPtr, blkPtr, blkCnt, bitAdd.
#   Sets assembler symbols: WCNT, LOCAL_SIZE, FRAME_OFFS, F_O (= -FRAME_OFFS),
#          tmpStk_<BLK_BITS>, and the StackVar offsets below.
#   Locals are addressed either as name(%rsp) or as name+F_O(%rbp).
#
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
    WCNT = (\BLK_BITS)/64
#
    _PushCnt_ = 0                       #save nonvolatile regs on stack
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
    pushq   %\_reg_
    _PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
  .endr
#
    _STK_OFFS_ = 0                      #starting offset from rsp
    #---- local variables               #<-- rsp
    StackVar X_stk  ,8*(WCNT)           #local context vars
    StackVar ksTwk  ,8*3                #key schedule: tweak words
    StackVar ksKey  ,8*(WCNT)+8         #key schedule: key words
  .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
    StackVar ksRot  ,16*(\KS_CNT)       #leave space for "rotation" to happen
  .endif
    StackVar Wcopy  ,8*(WCNT)           #copy of input block
  .if _SKEIN_DEBUG
  .if \debugCnt + 0                     #temp location for debug X[] info
    StackVar xDebug_\BLK_BITS ,8*(\debugCnt)
  .endif
  .endif
    # NOTE(review): every term of the expression below is a multiple of 8, so
    # this test holds for every frame built here and align16 is always
    # allocated. tmpStk_<BLK_BITS> (used as rIdx_offs by the 1024 code) depends
    # on that, so the condition is left exactly as written - confirm intent.
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
    StackVar align16,8                  #keep 16-byte aligned (adjust for retAddr?)
    tmpStk_\BLK_BITS = align16          #use this
  .endif
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
    StackVar ctxPtr ,8                  #context ptr
    StackVar blkPtr ,8                  #pointer to block data
    StackVar blkCnt ,8                  #number of full blocks to process
    StackVar bitAdd ,8                  #bit count to add to tweak
    LOCAL_SIZE = _STK_OFFS_             #size of "local" vars
    #----
    StackVar savRegs,8*_PushCnt_        #saved registers
    StackVar retAddr,8                  #return address
    #---- caller's stack frame (aligned mod 16)
#
# set up the stack frame pointer (rbp)
#
    FRAME_OFFS = ksTwk + 128            #allow short (negative) offset to ksTwk, ksKey
  .if FRAME_OFFS > _STK_OFFS_           #keep rbp in the "locals" range
    FRAME_OFFS = _STK_OFFS_
  .endif
    F_O = -FRAME_OFFS
#
#put some useful defines in the .lst file (for grep)
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#
# Notes on stack frame setup:
#   * the most frequently used variable is X_stk[], based at [rsp+0]
#   * the next most used is the key schedule arrays, ksKey and ksTwk
#       so rbp is "centered" there, allowing short offsets to the key
#       schedule even in 1024-bit Skein case
#   * the Wcopy variables are infrequently accessed, but they have long
#       offsets from both rsp and rbp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    subq    $LOCAL_SIZE,%rsp            #make room for the locals
    leaq    FRAME_OFFS(%rsp),%rbp       #maximize use of short offsets
    movq    %rdi, ctxPtr+F_O(%rbp)      #save caller's parameters on the stack
    movq    %rsi, blkPtr+F_O(%rbp)
    movq    %rdx, blkCnt+F_O(%rbp)
    movq    %rcx, bitAdd+F_O(%rbp)
#
.endm #Setup_Stack
#
#----------------------------------------------------------------
#
# MACRO: undo Setup_Stack - release the locals and pop the saved registers
#        in the reverse of the push order. Raises an assembly-time error when
#        the pop count does not bring _PushCnt_ back to zero.
#
.macro Reset_Stack
    addq    $LOCAL_SIZE,%rsp            #get rid of locals (wipe?)
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
    popq    %\_reg_                     #restore caller's regs
    _PushCnt_ = _PushCnt_ - 1
  .endr
  .if _PushCnt_
    .error "Mismatched push/pops?"
  .endif
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
.if _SKEIN_DEBUG
    .extern Skein_Show_Block            #calls to C routines
    .extern Skein_Show_Round
#
SKEIN_RND_SPECIAL     = 1000
SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT  = SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD    = SKEIN_RND_SPECIAL+2
#
# MACRO: dump the current block state through the external Skein_Show_Block.
#        Saves and restores rax,rcx,rdx,rsi,rdi,r8..r11 around the call.
#
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                      const u08b_t *blkPtr, const u64b_t *wPtr,
#                      const u64b_t *ksPtr,const u64b_t *tsPtr)
#
    _NN_ = 0
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
    pushq   %\_reg_                     #save all volatile regs on stack before the call
    _NN_ = _NN_ + 1
  .endr
    # get and push call parameters
    movq    $\BLK_BITS      ,%rdi       #bits
    movq    ctxPtr+F_O(%rbp),%rsi       #h (pointer)
    leaq    X_VARS    (%rsi),%rdx       #X (pointer)
    movq    blkPtr+F_O(%rbp),%rcx       #blkPtr
    leaq    Wcopy +F_O(%rbp),%r8        #wPtr
    leaq    ksKey +F_O(%rbp),%r9        #key pointer
    leaq    ksTwk +F_O(%rbp),%rax       #tweak pointer
    pushq   %rax                        #   (pass on the stack)
    call    Skein_Show_Block            #call external debug handler
    addq    $8*1,%rsp                   #discard parameters on stack
  .if (_NN_ % 2 ) == 0                  #check stack alignment
    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
  .endif
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
    popq    %\_reg_                     #restore regs
    _NN_ = _NN_ - 1
  .endr
  .if _NN_
    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
  .endif
.endm # Skein_Debug_Block
#
# the macro to "call" to debug a round
#
#   BLK_BITS  block size; selects the Skein_Debug_Round_<BLK_BITS> helper
#   R         round number, or one of the SKEIN_RND_* special codes
#   RDI_OFFS  (optional) constant added to the computed round number in the
#             looping (not unrolled) case
#   afterOp   (optional) text emitted after the call sequence
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
    # call the appropriate (local) debug "function"
    pushq   %rdx                        #save rdx, so we can use it for round "number"
  .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movq    $\R,%rdx
  .else                                 #compute round number using rdi
    _rOffs_ = \RDI_OFFS + 0
   .if \BLK_BITS == 1024
    movq    rIdx_offs+8(%rsp),%rdx      #get rIdx off the stack (adjust for pushq rdx above)
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
   .else
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
   .endif
  .endif
    call    Skein_Debug_Round_\BLK_BITS
    popq    %rdx                        #restore original rdx value
#
    # NOTE(review): formal name written without a backslash; presumably relies
    # on .altmacro substituting bare formal names - confirm.
    afterOp
.endm # Skein_Debug_Round
.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
.macro Skein_Debug_Block BLK_BITS
.endm
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
.endm
#
.endif # _SKEIN_DEBUG
#
#----------------------------------------------------------------
#
# MACRO: dstReg += (srcReg_A srcReg_B) [+ immOffs]
#   The source register name is the concatenation srcReg_A\srcReg_B, so a
#   caller can pass a prefix and a computed suffix (e.g. r,%(8+n)).
#   immOffs nonzero          --> leaq with displacement
#   useAddOp zero or omitted --> leaq (or addq when ASM_NO_LEA is defined)
#   useAddOp nonzero         --> addq
#
.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  .if \immOffs + 0
    leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  .elseif ((\useAddOp + 0) == 0)
    .ifndef ASM_NO_LEA                  #lea seems to be faster on Core 2 Duo CPUs!
    leaq    (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
    .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
    .endif
  .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
  .endif
.endm

# keep Intel-style ordering here, to match addReg
# MACRO: dstReg ^= (srcReg_A srcReg_B)
.macro xorReg dstReg,srcReg_A,srcReg_B
    xorq    %\srcReg_A\srcReg_B,%\dstReg
.endm
#
#----------------------------------------------------------------
#
# MACRO: define a global entry label under both lName and _lName
#
.macro C_label lName
\lName:                                 #use both "genders" to work across linkage conventions
_\lName:
    .global \lName
    .global _\lName
.endm
#
#=================================== Skein_256 =============================================
#
.if _USE_ASM_ & 256
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#   In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd (saved by Setup_Stack)
#   Out: ctx chaining vars (X_VARS) and tweak words (TWEAK) updated in place
#   The block loop is a decrement-then-test loop (decq/jnz), so it runs at
#   least once.
#
#################
#
# code
#
C_label Skein_256_Process_Block
    Setup_Stack 256,((ROUNDS_256/8)+1)
    movq    TWEAK+8(%rdi),%r14
    jmp     Skein_256_block_loop
    .p2align 4
    # main hash loop for Skein_256
Skein_256_block_loop:
    #
    # general register usage:
    #   RAX..RDX        = X0..X3
    #   R08..R12        = ks[0..4]
    #   R13..R15        = ts[0..2]
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK+0(%rdi)     ,%r13
    addq    bitAdd+F_O(%rbp)  ,%r13     #computed updated tweak value T0
    movq    %r14              ,%r15
    xorq    %r13              ,%r15     #now %r13.%r15 is set as the tweak

    movq    $KW_PARITY        ,%r12
    movq    X_VARS+ 0(%rdi),%r8
    movq    X_VARS+ 8(%rdi),%r9
    movq    X_VARS+16(%rdi),%r10
    movq    X_VARS+24(%rdi),%r11
    movq    %r13,TWEAK+0(%rdi)          #save updated tweak value ctx->h.T[0]
    xorq    %r8               ,%r12     #start accumulating overall parity

    movq    blkPtr +F_O(%rbp) ,%rsi     #rsi --> input block
    xorq    %r9               ,%r12
    movq     0(%rsi)          ,%rax     #get X[0..3]
    xorq    %r10              ,%r12
    movq     8(%rsi)          ,%rbx
    xorq    %r11              ,%r12
    movq    16(%rsi)          ,%rcx
    movq    24(%rsi)          ,%rdx

    movq    %rax,Wcopy+ 0+F_O(%rbp)     #save copy of input block
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    addq    %r8 ,%rax                   #initial key injection
    addq    %r9 ,%rbx
    addq    %r10,%rcx
    addq    %r11,%rdx
    addq    %r13,%rbx
    addq    %r14,%rcx

.if _SKEIN_DEBUG
    movq    %r14,TWEAK+ 8(%rdi)         #save updated tweak T[1] (start bit cleared?)
    movq    %r8 ,ksKey+ 0+F_O(%rbp)     #save key schedule on stack for Skein_Debug_Block
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+ 0+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)

    movq    %rax,X_stk + 0(%rsp)        #save X[] on stack for Skein_Debug_Block
    movq    %rbx,X_stk + 8(%rsp)
    movq    %rcx,X_stk +16(%rsp)
    movq    %rdx,X_stk +24(%rsp)

    Skein_Debug_Block 256               #debug dump
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
.endif
#
.if (((SKEIN_ASM_UNROLL) & 256) == 0)
    # ks[0] and ts[0] go straight to slots 5 and 3 (offsets 40 and 24): the
    # round loop below first reads slot rdi+1 and stores each word it reads
    # 5 (key) or 3 (tweak) slots higher.
    movq    %r8 ,ksKey+40+F_O(%rbp)     #save key schedule on stack for looping code
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+24+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)
.endif
    addq    $WCNT*8,%rsi                #skip the block
    movq    %rsi,blkPtr +F_O(%rbp)      #update block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 256
_UNROLL_CNT = ROUNDS_256/8
.else
_UNROLL_CNT = SKEIN_UNROLL_256
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256"
  .endif
    xorq    %rdi,%rdi                   #rdi = iteration count
Skein_256_round_loop:
.endif
_Rbase_ = 0
.rept _UNROLL_CNT*2
    # all X and ks vars in regs         # (ops to "rotate" ks vars, via mem, if not unrolled)
    # round 4*_Rbase_ + 0
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
    addReg  rcx, rdx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*1+F_O(%rbp,%rdi,8),%r8
  .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
    xorReg  rdx, rcx
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
    .irp _r1_,%(13+(_Rbase_+2) % 3)
    leaq    (%r\_r0_,%r\_r1_),%rdi      #precompute key injection value for %rcx
    .endr
    .endr
  .endif
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
  .endif
    Skein_Debug_Round 256,%(4*_Rbase_+1)

    # round 4*_Rbase_ + 1
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
    xorReg  rdx, rax
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*2+F_O(%rbp,%rdi,8),%r9
  .endif
    addReg  rcx, rbx
    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
    xorReg  rbx, rcx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*4+F_O(%rbp,%rdi,8),%r11
  .endif
    Skein_Debug_Round 256,%(4*_Rbase_+2)
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
    .irp _r1_,%(13+(_Rbase_+1) % 3)
    leaq    (%r\_r0_,%r\_r1_),%rsi      #precompute key injection value for %rbx
    .endr
    .endr
  .endif
    # round 4*_Rbase_ + 2
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
    addReg  rcx, rdx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*3+F_O(%rbp,%rdi,8),%r10
  .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
    xorReg  rdx, rcx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    %r8,ksKey+8*6+F_O(%rbp,%rdi,8)      #"rotate" the key
    leaq    1(%r11,%rdi),%r11                   #precompute key + tweak
  .endif
    Skein_Debug_Round 256,%(4*_Rbase_+3)
    # round 4*_Rbase_ + 3
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
    addReg  rcx, rbx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    addq    ksTwk+8*2+F_O(%rbp,%rdi,8),%r10     #precompute key + tweak
    movq    %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)     #"rotate" the tweak
  .endif
    xorReg  rdx, rax
    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
    xorReg  rbx, rcx
    Skein_Debug_Round 256,%(4*_Rbase_+4)
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    addReg  r9 ,r13                     #precompute key+tweak
  .endif
    #inject key schedule words
_Rbase_ = _Rbase_+1
  .if (SKEIN_ASM_UNROLL) & 256
    addReg  rax,r,%(8+((_Rbase_+0) % 5))
    addReg  rbx,rsi
    addReg  rcx,rdi
    addReg  rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
  .else
    incq    %rdi
    addReg  rax,r8
    addReg  rcx,r10
    addReg  rbx,r9
    addReg  rdx,r11
  .endif
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
.endr #rept _UNROLL_CNT
#
.if ((SKEIN_ASM_UNROLL) & 256) == 0
    cmpq    $2*(ROUNDS_256/8),%rdi
    jb      Skein_256_round_loop
.endif # (SKEIN_ASM_UNROLL & 256) == 0
    movq    ctxPtr +F_O(%rbp),%rdi      #restore rdi --> context

    #----------------------------
    # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
    movq    $FIRST_MASK64 ,%r14
    xorq    Wcopy + 0+F_O (%rbp),%rax
    xorq    Wcopy + 8+F_O (%rbp),%rbx
    xorq    Wcopy +16+F_O (%rbp),%rcx
    xorq    Wcopy +24+F_O (%rbp),%rdx
    andq    TWEAK + 8     (%rdi),%r14   #r14 = T[1] with bit 62 cleared (FIRST_MASK64)
    movq    %rax,X_VARS+ 0(%rdi)        #store final result
    movq    %rbx,X_VARS+ 8(%rdi)
    movq    %rcx,X_VARS+16(%rdi)
    movq    %rdx,X_VARS+24(%rdi)

    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_256_block_loop
    movq    %r14,TWEAK + 8(%rdi)
    Reset_Stack
    ret
Skein_256_Process_Block_End:

.if _SKEIN_DEBUG
Skein_Debug_Round_256:                  #here with rdx == round "number" from macro
    pushq   %rsi                        #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    movq    24(%rsp),%rdi               #get back original rdx (pushed on stack in macro call) to rdi
    movq    %rax,X_stk+ 0+F_O(%rbp)     #save X[] state on stack so debug routines can access it
    movq    %rbx,X_stk+ 8+F_O(%rbp)     #(use FP_ since rsp has changed!)
    movq    %rcx,X_stk+16+F_O(%rbp)
    movq    %rdi,X_stk+24+F_O(%rbp)

    movq    ctxPtr+F_O(%rbp),%rsi       #ctx_hdr_ptr
    movq    $256,%rdi                   #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
.endif
#
.if _SKEIN_CODE_SIZE
# Returns (rax) the byte distance between the entry label and
# Skein_256_Process_Block_End.
C_label Skein_256_Process_Block_CodeSize
    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
    ret
#
# Returns (rax) the unroll count in use, or 0 when fully unrolled.
C_label Skein_256_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_256/8
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif #_USE_ASM_ & 256
#
#=================================== Skein_512 =============================================
#
.if _USE_ASM_ & 512
#
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
# X[i] == %r[8+i]           #register assignments for X[] values during rounds (i=0..7)
#
#################
# MACRO: one round for 512-bit blocks
#
#   rn0..rn7  register numbers (the "r" prefix is added here), taken pairwise
#             as the four add/rotate/xor pairs of the round
#   _Rn_      round number; (_Rn_ % 8) selects the rotation constants
#   op1..op4  (optional) instruction text emitted after each of the four pairs
#
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
#
    addReg  r\rn0, r\rn1
    RotL64  r\rn1, 512,%((\_Rn_) % 8),0
    xorReg  r\rn1, r\rn0
    \op1
    addReg  r\rn2, r\rn3
    RotL64  r\rn3, 512,%((\_Rn_) % 8),1
    xorReg  r\rn3, r\rn2
    \op2
    addReg  r\rn4, r\rn5
    RotL64  r\rn5, 512,%((\_Rn_) % 8),2
    xorReg  r\rn5, r\rn4
    \op3
    addReg  r\rn6, r\rn7
    RotL64  r\rn7, 512,%((\_Rn_) % 8),3
    xorReg  r\rn7, r\rn6
    \op4
    Skein_Debug_Round 512,%(\_Rn_+1),-4
#
.endm #R_512_OneRound
#
#################
# MACRO: four rounds for 512-bit blocks, followed by one key injection
#
.macro R_512_FourRounds _RR_            #RR = base round number (0 % 8)
  .if ((SKEIN_ASM_UNROLL) & 512)
    # here for fully unrolled case.
    _II_ = ((\_RR_)/4) + 1              #key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
    # inject the key schedule
    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
    addReg  r11, rax
    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
    addReg  r12, rbx
    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
    addReg  r13, rcx
    addReg  r14, rdx
    addReg  r15, rsi,,,(_II_)
  .else
    # here for looping case             #"rotate" key/tweak schedule (move up on stack)
    incq    %rdi                        #bump key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
    # inject the key schedule
    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
    addReg  r11, rax
    addReg  r12, rbx
    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
    addReg  r13, rcx
    addReg  r14, rdx
    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
    addReg  r15, rsi
    addReg  r15, rdi                    #inject the round number
  .endif

    #show the result of the key injection
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
.endm #R_512_FourRounds
#
#################
# instantiated code
#
#   In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd (saved by Setup_Stack)
#   Out: ctx chaining vars (X_VARS) and tweak words (TWEAK) updated in place
#   The block loop is a decrement-then-test loop (decq/jnz), so it runs at
#   least once.
#
C_label Skein_512_Process_Block
    Setup_Stack 512,ROUNDS_512/8
    movq    TWEAK+ 8(%rdi),%rbx
    jmp     Skein_512_block_loop
    .p2align 4
    # main hash loop for Skein_512
Skein_512_block_loop:
    # general register usage:
    #   RAX..RDX        = temps for key schedule pre-loads
    #   R8 ..R15        = X0..X7
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK +  0(%rdi),%rax
    addq    bitAdd+F_O(%rbp),%rax       #computed updated tweak value T0
    movq    %rbx,%rcx
    xorq    %rax,%rcx                   #%rax/%rbx/%rcx = tweak schedule
    movq    %rax,TWEAK+ 0    (%rdi)     #save updated tweak value ctx->h.T[0]
    movq    %rax,ksTwk+ 0+F_O(%rbp)
    movq    $KW_PARITY,%rdx
    movq    blkPtr +F_O(%rbp),%rsi      #%rsi --> input block
    movq    %rbx,ksTwk+ 8+F_O(%rbp)
    movq    %rcx,ksTwk+16+F_O(%rbp)
  .irp _Rn_,8,9,10,11,12,13,14,15
    movq    X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
    xorq    %r\_Rn_,%rdx                #compute overall parity
    movq    %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
  .endr                                 #load state into %r8 ..%r15, compute parity
    movq    %rdx,ksKey+8*(8)+F_O(%rbp)  #save key schedule parity

    addReg  r13,rax                     #precompute key injection for tweak
    addReg  r14, rbx
.if _SKEIN_DEBUG
    movq    %rbx,TWEAK+ 8(%rdi)         #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
.endif
    movq     0(%rsi),%rax               #load input block
    movq     8(%rsi),%rbx
    movq    16(%rsi),%rcx
    movq    24(%rsi),%rdx
    addReg  r8 , rax                    #do initial key injection
    addReg  r9 , rbx
    movq    %rax,Wcopy+ 0+F_O(%rbp)     #keep local copy for feedforward
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    addReg  r10, rcx
    addReg  r11, rdx
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    movq    32(%rsi),%rax
    movq    40(%rsi),%rbx
    movq    48(%rsi),%rcx
    movq    56(%rsi),%rdx
    addReg  r12, rax
    addReg  r13, rbx
    addReg  r14, rcx
    addReg  r15, rdx
    movq    %rax,Wcopy+32+F_O(%rbp)
    movq    %rbx,Wcopy+40+F_O(%rbp)
    movq    %rcx,Wcopy+48+F_O(%rbp)
    movq    %rdx,Wcopy+56+F_O(%rbp)

.if _SKEIN_DEBUG
  .irp _Rn_,8,9,10,11,12,13,14,15       #save values on stack for debug output
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
  .endr

    Skein_Debug_Block 512               #debug dump
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
.endif
    addq    $8*WCNT,%rsi                #skip the block
    movq    %rsi,blkPtr+F_O(%rbp)       #update block pointer
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 512
_UNROLL_CNT = ROUNDS_512/8
.else
_UNROLL_CNT = SKEIN_UNROLL_512
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_512"
  .endif
    xorq    %rdi,%rdi                   #rdi = round counter
Skein_512_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2
    R_512_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr #rept _UNROLL_CNT
#
.if ((SKEIN_ASM_UNROLL) & 512) == 0
    cmpq    $2*(ROUNDS_512/8),%rdi
    jb      Skein_512_round_loop
    movq    ctxPtr +F_O(%rbp),%rdi      #restore rdi --> context
.endif
    # end of rounds
    #################
    # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
  .irp _Rn_,8,9,10,11,12,13,14,15
    .if (\_Rn_ == 8)
    movq    $FIRST_MASK64,%rbx
    .endif
    xorq    Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
    movq    %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)    #and store result
    .if (\_Rn_ == 14)
    andq    TWEAK+ 8(%rdi),%rbx         #rbx = T[1] with bit 62 cleared (FIRST_MASK64)
    .endif
  .endr
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_512_block_loop
    movq    %rbx,TWEAK + 8(%rdi)

    Reset_Stack
    ret
Skein_512_Process_Block_End:
#
.if _SKEIN_DEBUG
# call here with rdx = "round number"
Skein_Debug_Round_512:
    pushq   %rsi                        #save two regs for BLK_BITS-specific parms
    pushq   %rdi
  .irp _Rn_,8,9,10,11,12,13,14,15       #save X[] state on stack so debug routines can access it
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
  .endr
    movq    ctxPtr+F_O(%rbp),%rsi       #ctx_hdr_ptr
    movq    $512,%rdi                   #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
.endif
#
.if _SKEIN_CODE_SIZE
# Returns (rax) the byte distance between the entry label and
# Skein_512_Process_Block_End.
C_label Skein_512_Process_Block_CodeSize
    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
    ret
#
# Returns (rax) the unroll count in use, or 0 when fully unrolled.
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT <> (ROUNDS_512/8)
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 512
#
#=================================== Skein1024 =============================================
.if _USE_ASM_ & 1024
#
# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#################
# use details of permutation to make register assignments
#
o1K_rdi =  0                            #offsets in X[] associated with each register
o1K_rsi =  1
o1K_rbp =  2
o1K_rax =  3
o1K_rcx =  4                            #rcx is "shared" with X6, since X4/X6 alternate
o1K_rbx =  5
o1K_rdx =  7
o1K_r8  =  8
o1K_r9  =  9
o1K_r10 = 10
o1K_r11 = 11
o1K_r12 = 12
o1K_r13 = 13
o1K_r14 = 14
o1K_r15 = 15
#
rIdx_offs = tmpStk_1024
#
# MACRO: one MIX for 1024-bit blocks, with inline key injection on the
#        fourth round of each group ((_RN0_ & 3) == 3)
#
#   w0,w1      X[] word indices held in reg0,reg1
#   reg0,reg1  the two registers being mixed
#   _RN0_      round number; (_RN0_ % 8) selects the rotation constants
#   _Rn1_      mix index within the round
#   op1        (optional) instruction text emitted at the end
#
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
    addReg  \reg0 , \reg1               #perform the MIX
    RotL64  \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
    xorReg  \reg1 , \reg0
.if ((\_RN0_) & 3) == 3                 #time to do key injection?
 .if _SKEIN_DEBUG
    movq    %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
    movq    %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
 .endif
_II_ = ((\_RN0_)/4)+1                   #injection count
 .if (SKEIN_ASM_UNROLL) & 1024          #here to do fully unrolled key injection
    addq    ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
    addq    ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
  .if \w1 == 13                         #tweak injection
    addq    ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
  .elseif \w0 == 14
    addq    ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
  .elseif \w1 == 15
    addq    $_II_, %\reg1               #(injection counter)
  .endif
 .else                                  #here to do looping key injection
  .if (\w0 == 0)
    movq    %rdi, X_stk+8*\w0(%rsp)     #if so, store N0 so we can use reg as index
    movq    rIdx_offs(%rsp),%rdi        #get the injection counter index into rdi
  .else
    addq    ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0   #even key injection
  .endif
  .if \w1 == 13                         #tweak injection
    addq    ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
  .elseif \w0 == 14
    addq    ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
  .elseif \w1 == 15
    addReg  \reg1,rdi,,,1               #(injection counter)
  .endif
    addq    ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1   #odd key injection
 .endif
.endif
    # insert the op provided, .if any
    \op1
.endm
#################
# MACRO: four rounds for 1024-bit blocks
#
.macro r1024_FourRounds _RR_            #RR = base round number (0 mod 4)
    # should be here with X4 set properly, X6 stored on stack
_Rn_ = (\_RR_) + 0
    r1024_Mix  0, 1,rdi,rsi,_Rn_,0
    r1024_Mix  2, 3,rbp,rax,_Rn_,1
    r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
    r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
    r1024_Mix 10,11,r10,r11,_Rn_,5
    r1024_Mix 12,13,r12,r13,_Rn_,6
    r1024_Mix  6, 7,rcx,rdx,_Rn_,3
    r1024_Mix 14,15,r14,r15,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif
_Rn_ = (\_RR_) + 1
    r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
    r1024_Mix  2,13,rbp,r13,_Rn_,1
    r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
    r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
    r1024_Mix 12, 3,r12,rax,_Rn_,5
    r1024_Mix 14, 5,r14,rbx,_Rn_,6
    r1024_Mix  4,15,rcx,r15,_Rn_,3
    r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif
_Rn_ = (\_RR_) + 2
    r1024_Mix  0, 7,rdi,rdx,_Rn_,0
    r1024_Mix  2, 5,rbp,rbx,_Rn_,1
    r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
    r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
    r1024_Mix 14,13,r14,r13,_Rn_,5
    r1024_Mix  8,11,r8 ,r11,_Rn_,6
    r1024_Mix  6, 1,rcx,rsi,_Rn_,3
    r1024_Mix 10, 9,r10,r9 ,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif
_Rn_ = (\_RR_) + 3
    r1024_Mix  0,15,rdi,r15,_Rn_,0
    r1024_Mix  2,11,rbp,r11,_Rn_,1
    r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
    r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
    r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
    r1024_Mix 10, 3,r10,rax,_Rn_,6
    r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
    r1024_Mix 12, 7,r12,rdx,_Rn_,7
  .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
  .endif

  .if ((SKEIN_ASM_UNROLL) & 1024) == 0  #here with rdi == rIdx, X0 on stack
    #"rotate" the key schedule on the stack
i8 = o1K_r8
i0 = o1K_rdi
    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
    movq    ksKey+8* 0(%rsp,%rdi,8),%r8         #get key word
    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
    movq    ksTwk+8* 0(%rsp,%rdi,8),%r8         #get tweak word
    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
    movq    X_stk+8*i8(%rsp)       ,%r8         #get the reg back
    incq    %rdi                                #bump the index
    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
    movq    ksKey+8*i0(%rsp,%rdi,8),%rdi        #get the key schedule word for X0 back
    addq    X_stk+8*i0(%rsp)       ,%rdi        #perform the X0 key injection
  .endif
    #show the result of the key injection
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
.endm #r1024_FourRounds
#
################
# code
#
C_label Skein1024_Process_Block
#
    Setup_Stack 1024,ROUNDS_1024/8,WCNT
    movq    TWEAK+ 8(%rdi),%r9
    jmp     Skein1024_block_loop
    # main hash loop for Skein1024
    .p2align 4
Skein1024_block_loop:
    # general register usage:
    #   RSP              = stack pointer
    #   RAX..RDX,RSI,RDI = X1, X3..X7 (state words)
    #   R8 ..R15         = X8..X15 (state words)
    #   RBP              = temp (used for X0 and X2)
    #
.if ((SKEIN_ASM_UNROLL) & 1024) == 0
    xorq    %rax,%rax                   #init loop index on the stack
    movq    
%rax,rIdx_offs(%rsp)1102.endif1103movq TWEAK+ 0(%rdi),%r81104addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T01105movq %r9 ,%r101106xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule1107movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]1108movq %r8 ,ksTwk+ 0+F_O(%rbp)1109movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below1110movq %r10,ksTwk+16+F_O(%rbp)1111.if _SKEIN_DEBUG1112movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block1113.endif1114movq blkPtr +F_O(%rbp),%rsi # rsi --> input block1115movq $KW_PARITY ,%rax #overall key schedule parity11161117# the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]1118.irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps1119movq X_VARS+8*\_rN_(%rdi),%r14 #get state word1120movq 8*\_rN_(%rsi),%r15 #get msg word1121xorq %r14,%rax #update key schedule overall parity1122movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack1123movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy1124addq %r15,%r14 #do the initial key injection1125movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack1126.endr1127# now process the rest, using the "real" registers1128# (MUST do it in reverse order to inject tweaks r8/r9 first)1129.irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx1130_oo_ = o1K_\_rr_ #offset assocated with the register1131movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context1132movq 8*_oo_(%rsi),%rcx #get next input msg word1133movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack1134xorq %\_rr_, %rax #accumulate key schedule parity1135movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward1136addq %rcx,%\_rr_ #do the initial key injection1137.if _oo_ == 13 #do the initial tweak injection1138addReg \_rr_,r8 # (only in words 13/14)1139.elseif _oo_ == 141140addReg \_rr_,r91141.endif1142.endr1143movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity1144.if 
_SKEIN_DEBUG1145Skein_Debug_Block 1024 #initial debug dump1146.endif1147addq $8*WCNT,%rsi #bump the msg ptr1148movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr1149# re-load words 0..4 from stack, enter the main loop1150.irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)1151movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!1152.endr1153.if _SKEIN_DEBUG1154Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection1155.endif1156#1157#################1158# now the key schedule is computed. Start the rounds1159#1160.if (SKEIN_ASM_UNROLL) & 10241161_UNROLL_CNT = ROUNDS_1024/81162.else1163_UNROLL_CNT = SKEIN_UNROLL_10241164.if ((ROUNDS_1024/8) % _UNROLL_CNT)1165.error "Invalid SKEIN_UNROLL_1024"1166.endif1167Skein1024_round_loop:1168.endif1169#1170_Rbase_ = 01171.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time1172r1024_FourRounds %(4*_Rbase_+00)1173_Rbase_ = _Rbase_+11174.endr #rept _UNROLL_CNT1175#1176.if ((SKEIN_ASM_UNROLL) & 1024) == 01177cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done1178jb Skein1024_round_loop1179.endif1180# end of rounds1181#################1182#1183# feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}1184movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. 
x6 already on stack1185movq ctxPtr(%rsp),%rdx11861187.irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x71188_oo_ = o1K_\_rr_1189xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR1190movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context1191.if (_oo_ == 9)1192movq $FIRST_MASK64 ,%r91193.endif1194.if (_oo_ == 14)1195andq TWEAK+ 8(%rdx),%r91196.endif1197.endr1198#1199movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)1200movq X_stk +8*7(%rsp),%rbx1201xorq Wcopy +8*6(%rsp),%rax1202xorq Wcopy +8*7(%rsp),%rbx1203movq %rax,X_VARS+8*6(%rdx)1204decq blkCnt(%rsp) #set zero flag iff done1205movq %rbx,X_VARS+8*7(%rdx)12061207Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>1208# go back for more blocks, if needed1209movq ctxPtr(%rsp),%rdi #don't muck with the flags here!1210lea FRAME_OFFS(%rsp),%rbp1211jnz Skein1024_block_loop1212movq %r9 ,TWEAK+ 8(%rdx)1213Reset_Stack1214ret1215#1216Skein1024_Process_Block_End:1217#1218.if _SKEIN_DEBUG1219Skein_Debug_Round_1024:1220# call here with rdx = "round number",1221_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr1222#1223#save rest of X[] state on stack so debug routines can access it1224.irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r151225movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)1226.endr1227# Figure out what to do with x0 (rdi). 
When rdx == 0 mod 4, it's already on stack1228cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save1229jae save_x01230testq $3,%rdx #otherwise only if rdx != 0 mod 41231jz save_x0_not1232save_x0:1233movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)1234save_x0_not:1235#figure out the x4/x6 swapping state and save the correct one!1236cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x41237jae save_x41238testq $1,%rdx #and even ones have r4 as well1239jz save_x41240movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp)1241jmp debug_1024_go1242save_x4:1243movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)1244debug_1024_go:1245#now all is saved in Xstk[] except for rdx1246push %rsi #save two regs for BLK_BITS-specific parms1247push %rdi1248_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)12491250movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)1251movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]12521253movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr1254movq $1024,%rdi #rdi = block size1255jmp Skein_Debug_Round_Common1256.endif1257#1258.if _SKEIN_CODE_SIZE1259C_label Skein1024_Process_Block_CodeSize1260movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax1261ret1262#1263C_label Skein1024_Unroll_Cnt1264.if _UNROLL_CNT <> (ROUNDS_1024/8)1265movq $_UNROLL_CNT,%rax1266.else1267xorq %rax,%rax1268.endif1269ret1270.endif1271#1272.endif # _USE_ASM_ and 10241273#1274.if _SKEIN_DEBUG1275#----------------------------------------------------------------1276#local debug routine to set up for calls to:1277# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)1278# [ rdi rsi rdx rcx]1279#1280# here with %rdx = round number1281# %rsi = ctx_hdr_ptr1282# %rdi = block size (256/512/1024)1283# on stack: saved rdi, saved rsi, retAddr, saved rdx1284#1285Skein_Debug_Round_Common:1286_SP_OFFS_ = 32 #account for four words on stack already1287.irp 
_rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs1288pushq %\_rr_1289_SP_OFFS_ = _SP_OFFS_+81290.endr1291.if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here1292.error "Debug_Round_Common: stack alignment"1293.endif1294# compute %rcx = ptr to the X[] array on the stack (final parameter to call)1295leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address1296cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?1297jnz _got_rcxA1298leaq X_VARS(%rsi),%rcx1299_got_rcxA:1300.if _USE_ASM_ & 10241301# special handling for 1024-bit case1302# (for rounds right before with key injection:1303# use xDebug_1024[] instead of X_stk[])1304cmpq $SKEIN_RND_SPECIAL,%rdx1305jae _got_rcxB #must be a normal round1306orq %rdx,%rdx1307jz _got_rcxB #just before key injection1308test $3,%rdx1309jne _got_rcxB1310cmp $1024,%rdi #only 1024-bit(s) for now1311jne _got_rcxB1312leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx1313_got_rcxB:1314.endif1315call Skein_Show_Round #call external debug handler13161317.irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs1318popq %\_rr_1319_SP_OFFS_ = _SP_OFFS_-81320.endr1321.if _SP_OFFS_ - 321322.error "Debug_Round_Common: push/pop misalignment!"1323.endif1324popq %rdi1325popq %rsi1326ret1327.endif1328#----------------------------------------------------------------1329.section .note.GNU-stack,"",@progbits13301331.end133213331334