CoCalc -- skein_block

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/skein/amd64/skein_block_asm.S
¹⁰⁸⁶⁹⁶ views
1
#
2
#----------------------------------------------------------------
3
# 64-bit x86 assembler code (gnu as) for Skein block functions
4
#
5
# Author: Doug Whiting, Hifn/Exar
6
#
7
# This code is released to the public domain.
8
#----------------------------------------------------------------
9
#
10
    .text
11
    .altmacro
12
#ifndef __clang__
13
    .psize 0,128                            #list file has no page boundaries
14
#endif
15
#
16
_MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
17
_MAX_FRAME_ =  240
18
#
19
#################
20
#ifndef SKEIN_USE_ASM
21
_USE_ASM_         = _MASK_ALL_
22
#else
23
_USE_ASM_         = SKEIN_USE_ASM
24
#endif
25
#################
26
#configure loop unrolling
27
# _SKEIN_LOOP packs one decimal digit per block size:
#   hundreds digit -> Skein-256, tens digit -> Skein-512, ones digit -> Skein-1024.
#   A digit of 0 selects a fully unrolled round loop for that size; the
#   default value 2 therefore gives digits 0,0,2.
#ifndef SKEIN_LOOP
28
_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
29
#else
30
_SKEIN_LOOP       = SKEIN_LOOP
31
  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
32
#.print  "+++ SKEIN_LOOP = \_NN_"
33
  .endr
34
#endif
35
# the unroll counts (0 --> fully unrolled)
36
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
37
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
38
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
39
#
40
# SKEIN_ASM_UNROLL becomes the sum of the block sizes (256, 512, 1024) whose
# unroll digit is 0, so later code can test it as a bit mask.
SKEIN_ASM_UNROLL  = 0
41
  .irp _NN_,256,512,1024
42
    .if (SKEIN_UNROLL_\_NN_) == 0
43
SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
44
    .endif
45
  .endr
46
#################
47
#
48
# Round counts per block size.  Without SKEIN_ROUNDS the values are 72/72/80.
# With SKEIN_ROUNDS, one decimal digit d per size (hundreds = 256, tens = 512,
# ones = 1024) selects 8*(((d+5) % 10)+5) rounds: d=5..9 gives 40..72 and
# d=0..4 gives 80..112.
.ifndef SKEIN_ROUNDS
49
ROUNDS_256  =   72
50
ROUNDS_512  =   72
51
ROUNDS_1024 =   80
52
.else
53
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
54
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
55
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
56
# only display rounds if default size is changed on command line
57
# (one message per block size that is enabled in _USE_ASM_)
.irp _NN_,256,512,1024
58
  .if _USE_ASM_ & \_NN_
59
    .irp _RR_,%(ROUNDS_\_NN_)
60
      .if _NN_ < 1024
61
.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
62
      .else
63
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
64
      .endif
65
    .endr
66
  .endif
67
.endr
68
.endif
69
#################
70
#
71
.ifdef SKEIN_CODE_SIZE
72
_SKEIN_CODE_SIZE = (1)
73
.else
74
.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
75
_SKEIN_CODE_SIZE = (1)
76
.else
77
_SKEIN_CODE_SIZE = (0)
78
.endif
79
.endif
80
#
81
#################
82
#
83
.ifndef SKEIN_DEBUG
84
_SKEIN_DEBUG      = 0
85
.else
86
_SKEIN_DEBUG      = 1
87
.endif
88
#################
89
#
90
# define offsets of fields in hash context structure
91
#
92
HASH_BITS   =   0                   #bits of hash output
93
BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
94
TWEAK       =   8 + BCNT            #tweak values[0..1]
95
X_VARS      =  16 + TWEAK           #chaining vars
96
#
97
#(Note: buffer[] in context structure is NOT needed here :-)
98
#
99
KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
100
FIRST_MASK  =   ~ (1 <<  6)
101
FIRST_MASK64=   ~ (1 << 62)
102
#
103
# rotation constants for Skein
104
#
105
RC_256_0_0  = 14
106
RC_256_0_1  = 16
107

108
RC_256_1_0  = 52
109
RC_256_1_1  = 57
110

111
RC_256_2_0  = 23
112
RC_256_2_1  = 40
113

114
RC_256_3_0  =  5
115
RC_256_3_1  = 37
116

117
RC_256_4_0  = 25
118
RC_256_4_1  = 33
119

120
RC_256_5_0  = 46
121
RC_256_5_1  = 12
122

123
RC_256_6_0  = 58
124
RC_256_6_1  = 22
125

126
RC_256_7_0  = 32
127
RC_256_7_1  = 32
128

129
RC_512_0_0  = 46
130
RC_512_0_1  = 36
131
RC_512_0_2  = 19
132
RC_512_0_3  = 37
133

134
RC_512_1_0  = 33
135
RC_512_1_1  = 27
136
RC_512_1_2  = 14
137
RC_512_1_3  = 42
138

139
RC_512_2_0  = 17
140
RC_512_2_1  = 49
141
RC_512_2_2  = 36
142
RC_512_2_3  = 39
143

144
RC_512_3_0  = 44
145
RC_512_3_1  =  9
146
RC_512_3_2  = 54
147
RC_512_3_3  = 56
148

149
RC_512_4_0  = 39
150
RC_512_4_1  = 30
151
RC_512_4_2  = 34
152
RC_512_4_3  = 24
153

154
RC_512_5_0  = 13
155
RC_512_5_1  = 50
156
RC_512_5_2  = 10
157
RC_512_5_3  = 17
158

159
RC_512_6_0  = 25
160
RC_512_6_1  = 29
161
RC_512_6_2  = 39
162
RC_512_6_3  = 43
163

164
RC_512_7_0  =  8
165
RC_512_7_1  = 35
166
RC_512_7_2  = 56
167
RC_512_7_3  = 22
168

169
RC_1024_0_0 = 24
170
RC_1024_0_1 = 13
171
RC_1024_0_2 =  8
172
RC_1024_0_3 = 47
173
RC_1024_0_4 =  8
174
RC_1024_0_5 = 17
175
RC_1024_0_6 = 22
176
RC_1024_0_7 = 37
177

178
RC_1024_1_0 = 38
179
RC_1024_1_1 = 19
180
RC_1024_1_2 = 10
181
RC_1024_1_3 = 55
182
RC_1024_1_4 = 49
183
RC_1024_1_5 = 18
184
RC_1024_1_6 = 23
185
RC_1024_1_7 = 52
186

187
RC_1024_2_0 = 33
188
RC_1024_2_1 =  4
189
RC_1024_2_2 = 51
190
RC_1024_2_3 = 13
191
RC_1024_2_4 = 34
192
RC_1024_2_5 = 41
193
RC_1024_2_6 = 59
194
RC_1024_2_7 = 17
195

196
RC_1024_3_0 =  5
197
RC_1024_3_1 = 20
198
RC_1024_3_2 = 48
199
RC_1024_3_3 = 41
200
RC_1024_3_4 = 47
201
RC_1024_3_5 = 28
202
RC_1024_3_6 = 16
203
RC_1024_3_7 = 25
204

205
RC_1024_4_0 = 41
206
RC_1024_4_1 =  9
207
RC_1024_4_2 = 37
208
RC_1024_4_3 = 31
209
RC_1024_4_4 = 12
210
RC_1024_4_5 = 47
211
RC_1024_4_6 = 44
212
RC_1024_4_7 = 30
213

214
RC_1024_5_0 = 16
215
RC_1024_5_1 = 34
216
RC_1024_5_2 = 56
217
RC_1024_5_3 = 51
218
RC_1024_5_4 =  4
219
RC_1024_5_5 = 53
220
RC_1024_5_6 = 42
221
RC_1024_5_7 = 41
222

223
RC_1024_6_0 = 31
224
RC_1024_6_1 = 44
225
RC_1024_6_2 = 47
226
RC_1024_6_3 = 46
227
RC_1024_6_4 = 19
228
RC_1024_6_5 = 42
229
RC_1024_6_6 = 44
230
RC_1024_6_7 = 25
231

232
RC_1024_7_0 =  9
233
RC_1024_7_1 = 48
234
RC_1024_7_2 = 35
235
RC_1024_7_3 = 52
236
RC_1024_7_4 = 23
237
RC_1024_7_5 = 31
238
RC_1024_7_6 = 37
239
RC_1024_7_7 = 20
240
#
241
#  Input:  reg
242
# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
243
#
244
# RotL64: rotate a 64-bit register left by a table constant.
#   reg        register name without the percent sign
#   BLK_SIZE   256, 512 or 1024
#   ROUND_NUM  round index 0..7 within the rotation table
#   MIX_NUM    index of the mix operation within the round
# The rotate count is the symbol RC_BLK_SIZE_ROUND_NUM_MIX_NUM.
# Nothing is emitted when that constant is zero.
.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
245
  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM  #is there anything to do?
246
    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
247
  .endif
248
.endm
249
#
250
#----------------------------------------------------------------
251
#
252
# MACROS: define local vars and configure stack
253
#
254
#----------------------------------------------------------------
255
# declare allocated space on the stack
256
# StackVar: assign the next free stack offset to a symbol.
#   localName  symbol that receives the current value of _STK_OFFS_
#   localSize  number of bytes to reserve; _STK_OFFS_ advances by this amount
# _STK_OFFS_ must already be defined by the caller (Setup_Stack sets it to 0).
.macro StackVar localName,localSize
257
\localName  =   _STK_OFFS_
258
_STK_OFFS_  =   _STK_OFFS_+(\localSize)
259
.endm #StackVar
260
#
261
#----------------------------------------------------------------
262
#
263
# MACRO: Configure stack frame, allocate local vars
264
#
265
# Setup_Stack: function prologue shared by the block functions.
#   BLK_BITS  block size in bits (256, 512 or 1024); WCNT = BLK_BITS/64
#   KS_CNT    number of 16-byte slots reserved in ksRot; only used when
#             BLK_BITS is not fully unrolled
#   debugCnt  optional; number of 8-byte debug slots (only with _SKEIN_DEBUG)
# Emits: pushes of rbp,rbx,r12..r15, a subq of LOCAL_SIZE from rsp, rbp set to
# rsp+FRAME_OFFS, and stores of rdi,rsi,rdx,rcx into ctxPtr,blkPtr,blkCnt,bitAdd.
# Defines the offset symbols X_stk, ksTwk, ksKey, Wcopy, ... relative to the
# new rsp, plus LOCAL_SIZE, FRAME_OFFS and F_O (= -FRAME_OFFS), so a local is
# addressed as name+F_O(%rbp) or name(%rsp).
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
266
    WCNT    =    (\BLK_BITS)/64
267
#
268
_PushCnt_   =   0                   #save nonvolatile regs on stack
269
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
270
       pushq    %\_reg_
271
_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
272
  .endr
273
#
274
_STK_OFFS_  =   0                   #starting offset from rsp
275
    #---- local  variables         #<-- rsp
276
    StackVar    X_stk  ,8*(WCNT)    #local context vars
277
    StackVar    ksTwk  ,8*3         #key schedule: tweak words
278
    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
279
  .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
280
    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
281
  .endif
282
    StackVar    Wcopy  ,8*(WCNT)    #copy of input block    
283
  .if _SKEIN_DEBUG
284
  .if \debugCnt + 0                 #temp location for debug X[] info
285
    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
286
  .endif
287
  .endif
288
  # NOTE(review): every size added above is a multiple of 8, so this test is
  # always true and align16 / tmpStk_BLK_BITS are always defined.
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
289
    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
290
tmpStk_\BLK_BITS = align16          #use this
291
  .endif
292
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
293
    StackVar    ctxPtr ,8           #context ptr
294
    StackVar    blkPtr ,8           #pointer to block data
295
    StackVar    blkCnt ,8           #number of full blocks to process
296
    StackVar    bitAdd ,8           #bit count to add to tweak
297
LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
298
    #---- 
299
    StackVar    savRegs,8*_PushCnt_ #saved registers
300
    StackVar    retAddr,8           #return address
301
    #---- caller's stack frame (aligned mod 16)
302
#
303
# set up the stack frame pointer (rbp)
304
#
305
FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
306
  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
307
FRAME_OFFS  =      _STK_OFFS_
308
  .endif
309
F_O         =   -FRAME_OFFS
310
#
311
  #put some useful defines in the .lst file (for grep)
312
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
313
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
314
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
315
#
316
# Notes on stack frame setup:
317
#   * the most frequently used variable is X_stk[], based at [rsp+0]
318
#   * the next most used is the key schedule arrays, ksKey and ksTwk
319
#       so rbp is "centered" there, allowing short offsets to the key 
320
#       schedule even in 1024-bit Skein case
321
#   * the Wcopy variables are infrequently accessed, but they have long 
322
#       offsets from both rsp and rbp only in the 1024-bit case.
323
#   * all other local vars and calling parameters can be accessed 
324
#       with short offsets, except in the 1024-bit case
325
#
326
    subq    $LOCAL_SIZE,%rsp        #make room for the locals
327
    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
328
    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
329
    movq    %rsi, blkPtr+F_O(%rbp)
330
    movq    %rdx, blkCnt+F_O(%rbp)
331
    movq    %rcx, bitAdd+F_O(%rbp)
332
#
333
.endm #Setup_Stack
334
#
335
#----------------------------------------------------------------
336
#
337
# Reset_Stack: function epilogue matching Setup_Stack (the ret is not included).
# Releases LOCAL_SIZE bytes and pops r15..rbp in the reverse of the push order.
# _PushCnt_ is counted back down and an assembly error is raised when the
# number of pops does not match the number of pushes.
# NOTE(review): the locals are released without being cleared.
.macro Reset_Stack
338
    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
339
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
340
    popq    %\_reg_                 #restore caller's regs
341
_PushCnt_ = _PushCnt_ - 1
342
  .endr
343
  .if _PushCnt_
344
    .error  "Mismatched push/pops?"
345
  .endif
346
.endm # Reset_Stack
347
#
348
#----------------------------------------------------------------
349
# macros to help debug internals
350
#
351
.if _SKEIN_DEBUG
352
    .extern  Skein_Show_Block     #calls to C routines
353
    .extern  Skein_Show_Round
354
#
355
SKEIN_RND_SPECIAL       =   1000
356
SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
357
SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
358
SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
359
#
360
# Skein_Debug_Block: call the external C routine Skein_Show_Block.
#   BLK_BITS  block size in bits, passed as the first C argument
# All nine volatile registers are saved around the call and restored after it.
# The first six arguments go in rdi,rsi,rdx,rcx,r8,r9 and the seventh (the
# tweak pointer) is pushed on the stack.  Expects rbp/F_O and the stack
# symbols to be as left by Setup_Stack.
.macro Skein_Debug_Block BLK_BITS
361
#
362
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
363
#                     const u08b_t *blkPtr, const u64b_t *wPtr, 
364
#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
365
#
366
_NN_ = 0
367
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
368
    pushq   %\_reg_                 #save all volatile regs on stack before the call
369
_NN_ = _NN_ + 1
370
  .endr
371
    # get and push call parameters
372
    movq    $\BLK_BITS      ,%rdi   #bits
373
    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
374
    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
375
    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
376
    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
377
    leaq    ksKey +F_O(%rbp),%r9    #key pointer
378
    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
379
    pushq   %rax                    #   (pass on the stack)
380
    call    Skein_Show_Block        #call external debug handler
381
    addq    $8*1,%rsp               #discard parameters on stack
382
  # saved regs plus the one stack argument must total an even number of
  # pushes, so the number of saved regs has to be odd
  .if (_NN_ % 2 ) == 0              #check stack alignment
383
    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
384
  .endif
385
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
386
    popq    %\_reg_                 #restore regs
387
_NN_ = _NN_ - 1
388
  .endr
389
  .if _NN_
390
    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
391
  .endif
392
.endm # Skein_Debug_Block
393
#
394
# the macro to "call" to debug a round
395
#
396
# Skein_Debug_Round: call the local routine Skein_Debug_Round_BLK_BITS with the
# round number in rdx; rdx is saved before and restored after the call.
#   BLK_BITS  256, 512 or 1024
#   R         round number, or one of the SKEIN_RND_* special codes
#   RDI_OFFS  optional constant added to the computed round number
#   afterOp   optional statement emitted after the call
# The round number is the constant R when BLK_BITS is fully unrolled or R is a
# special code.  Otherwise it is computed at run time as
# 1 + ((R-1) & 3) + RDI_OFFS + 4*counter, where counter is rdi, or for 1024 the
# value stored at rIdx_offs on the stack.
# NOTE(review): afterOp is written without a backslash; presumably this relies
# on .altmacro substituting bare parameter names -- confirm.
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
397
    # call the appropriate (local) debug "function"
398
    pushq   %rdx                    #save rdx, so we can use it for round "number"
399
  .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
400
    movq    $\R,%rdx
401
  .else                             #compute round number using rdi
402
_rOffs_ = \RDI_OFFS + 0
403
   .if \BLK_BITS == 1024
404
    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
405
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
406
   .else
407
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
408
   .endif
409
  .endif
410
    call    Skein_Debug_Round_\BLK_BITS
411
    popq    %rdx                    #restore original rdx value
412
#
413
    afterOp
414
.endm  #  Skein_Debug_Round
415
.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
416
# Empty stand-ins used when _SKEIN_DEBUG is 0: same names and parameter lists
# as the debug macros, but they emit nothing.
.macro Skein_Debug_Block BLK_BITS
417
.endm
418
#
419
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
420
.endm
421
#
422
.endif # _SKEIN_DEBUG
423
#
424
#----------------------------------------------------------------
425
#
426
# addReg: dstReg = dstReg + src [+ immOffs], operands in Intel-style order.
#   dstReg     destination register name, no percent sign
#   srcReg_A   source register name, or its prefix
#   srcReg_B   optional suffix pasted onto srcReg_A (e.g. r and 8 give r8)
#   useAddOp   optional; nonzero forces addq when immOffs is zero
#   immOffs    optional constant added as well; nonzero always uses leaq
# With no immOffs and no useAddOp, leaq is used unless ASM_NO_LEA is defined.
# leaq leaves the flags untouched while addq sets them, so callers should not
# depend on the flags after this macro.
.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
427
  .if \immOffs + 0
428
       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
429
  .elseif ((\useAddOp + 0) == 0)
430
    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
431
       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
432
    .else
433
       addq    %\srcReg_A\srcReg_B,%\dstReg
434
    .endif
435
  .else
436
       addq    %\srcReg_A\srcReg_B,%\dstReg
437
  .endif
438
.endm
439

440
# keep Intel-style ordering here, to match addReg
441
# xorReg: dstReg = dstReg XOR src.  Register names are given without the
# percent sign; srcReg_B is an optional suffix pasted onto srcReg_A.
.macro  xorReg dstReg,srcReg_A,srcReg_B
442
        xorq   %\srcReg_A\srcReg_B,%\dstReg
443
.endm
444
#
445
#----------------------------------------------------------------
446
#
447
# C_label: define an entry point callable from C.
#   lName  symbol name
# Defines both lName and _lName at the current location and exports both.
.macro C_label lName
448
 \lName:        #use both "genders" to work across linkage conventions
449
_\lName:
450
    .global  \lName
451
    .global _\lName
452
.endm
453
#
454
#=================================== Skein_256 =============================================
455
#
456
.if _USE_ASM_ & 256
457
#
458
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
459
#
460
#################
461
#
462
# code
463
#
464
# Skein_256_Process_Block entry and per-block load.
# In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd (saved to the
#      stack by Setup_Stack).
# r14 carries tweak word T[1] into Skein_256_block_loop: it is loaded from the
# context here, and the feedforward code reloads it before looping back.
# Each pass through the loop: T[0] += bitAdd (written back to the context),
# r15 = T[0] ^ T[1], r8..r11 = chaining vars, r12 = KW_PARITY ^ r8 ^ .. ^ r11,
# rax..rdx = the four input words (also copied to Wcopy), then the initial
# key/tweak injection is applied.
C_label Skein_256_Process_Block
465
    Setup_Stack 256,((ROUNDS_256/8)+1)
466
    movq    TWEAK+8(%rdi),%r14
467
    jmp     Skein_256_block_loop
468
    .p2align 4
469
    # main hash loop for Skein_256
470
Skein_256_block_loop:
471
    #
472
    # general register usage:
473
    #   RAX..RDX        = X0..X3    
474
    #   R08..R12        = ks[0..4]
475
    #   R13..R15        = ts[0..2]
476
    #   RSP, RBP        = stack/frame pointers
477
    #   RDI             = round counter or context pointer
478
    #   RSI             = temp
479
    #
480
    movq    TWEAK+0(%rdi)     ,%r13
481
    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
482
    movq    %r14              ,%r15
483
    xorq    %r13              ,%r15  #now %r13.%r15 is set as the tweak 
484

485
    movq    $KW_PARITY        ,%r12
486
    movq       X_VARS+ 0(%rdi),%r8
487
    movq       X_VARS+ 8(%rdi),%r9 
488
    movq       X_VARS+16(%rdi),%r10
489
    movq       X_VARS+24(%rdi),%r11
490
    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
491
    xorq    %r8               ,%r12  #start accumulating overall parity
492

493
    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
494
    xorq    %r9               ,%r12
495
    movq     0(%rsi)          ,%rax  #get X[0..3]
496
    xorq    %r10              ,%r12
497
    movq     8(%rsi)          ,%rbx
498
    xorq    %r11              ,%r12
499
    movq    16(%rsi)          ,%rcx
500
    movq    24(%rsi)          ,%rdx
501

502
    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
503
    movq    %rbx,Wcopy+ 8+F_O(%rbp)    
504
    movq    %rcx,Wcopy+16+F_O(%rbp)    
505
    movq    %rdx,Wcopy+24+F_O(%rbp)    
506

507
    addq    %r8 ,%rax                #initial key injection
508
    addq    %r9 ,%rbx 
509
    addq    %r10,%rcx
510
    addq    %r11,%rdx
511
    addq    %r13,%rbx                #X1 += ts[0]
512
    addq    %r14,%rcx                #X2 += ts[1]
513

514
# Debug builds: store the key schedule, tweak schedule and X[] on the stack in
# natural order, then dump them through the debug macros.
.if _SKEIN_DEBUG
515
    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
516
    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
517
    movq    %r9 ,ksKey+ 8+F_O(%rbp)    
518
    movq    %r10,ksKey+16+F_O(%rbp)    
519
    movq    %r11,ksKey+24+F_O(%rbp)    
520
    movq    %r12,ksKey+32+F_O(%rbp)    
521
                                       
522
    movq    %r13,ksTwk+ 0+F_O(%rbp)    
523
    movq    %r14,ksTwk+ 8+F_O(%rbp)    
524
    movq    %r15,ksTwk+16+F_O(%rbp)    
525
                                       
526
    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
527
    movq    %rbx,X_stk + 8(%rsp)       
528
    movq    %rcx,X_stk +16(%rsp)       
529
    movq    %rdx,X_stk +24(%rsp)       
530

531
    Skein_Debug_Block 256            #debug dump
532
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
533
.endif
534
#
535
# Looping (not fully unrolled) builds: store the schedules already rotated by
# one position.  ks[0] goes to slot 5 (ksKey+40) and ts[0] goes to slot 3
# (ksTwk+24), so the round loop can index them with rdi.  ksTwk is three words
# long and ksKey follows it in Setup_Stack, so ksTwk+24 is the same address as
# ksKey+0.
.if (((SKEIN_ASM_UNROLL) & 256) == 0)
536
    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
537
    movq    %r9 ,ksKey+ 8+F_O(%rbp)    
538
    movq    %r10,ksKey+16+F_O(%rbp)    
539
    movq    %r11,ksKey+24+F_O(%rbp)    
540
    movq    %r12,ksKey+32+F_O(%rbp)    
541
                                       
542
    movq    %r13,ksTwk+24+F_O(%rbp)    
543
    movq    %r14,ksTwk+ 8+F_O(%rbp)    
544
    movq    %r15,ksTwk+16+F_O(%rbp)    
545
.endif
546
    addq    $WCNT*8,%rsi             #skip the block
547
    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
548
    #
549
    # now the key schedule is computed. Start the rounds
550
    #
551
# Skein-256 rounds.  Each pass of the .rept body is four rounds followed by one
# key injection; the body is repeated _UNROLL_CNT*2 times.
#   Fully unrolled: key words stay in r8..r12 and tweak words in r13..r15;
#     rsi and rdi are temps holding precomputed key+tweak sums.
#   Looping: rdi counts key injections; key and tweak words are reloaded from
#     ksKey/ksTwk indexed by rdi, and each word is stored again further up the
#     stack (the rotate) for use by later injections.
.if (SKEIN_ASM_UNROLL) & 256
552
_UNROLL_CNT =   ROUNDS_256/8
553
.else
554
_UNROLL_CNT =   SKEIN_UNROLL_256
555
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
556
    .error "Invalid SKEIN_UNROLL_256"
557
  .endif
558
    xorq    %rdi,%rdi                #rdi = iteration count
559
Skein_256_round_loop:
560
.endif
561
_Rbase_ = 0
562
.rept _UNROLL_CNT*2
563
    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
564
    # round 4*_RBase_ + 0
565
    addReg  rax, rbx
566
    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
567
    addReg  rcx, rdx
568
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
569
                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
570
                .endif
571
    xorReg  rbx, rax
572
    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
573
    xorReg  rdx, rcx
574
  .if (SKEIN_ASM_UNROLL) & 256
575
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
576
    .irp _r1_,%(13+(_Rbase_+2) % 3)
577
      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
578
    .endr
579
    .endr
580
  .endif
581
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
582
                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
583
                .endif
584
    Skein_Debug_Round 256,%(4*_Rbase_+1)
585

586
    # round 4*_Rbase_ + 1
587
    addReg  rax, rdx
588
    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
589
    xorReg  rdx, rax
590
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
591
                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
592
                .endif
593
    addReg  rcx, rbx
594
    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
595
    xorReg  rbx, rcx
596
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
597
                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
598
                .endif
599
    Skein_Debug_Round 256,%(4*_Rbase_+2)
600
 .if (SKEIN_ASM_UNROLL) & 256
601
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
602
    .irp _r1_,%(13+(_Rbase_+1) % 3)
603
      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
604
    .endr
605
    .endr
606
 .endif
607
    # round 4*_Rbase_ + 2
608
    addReg  rax, rbx
609
    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
610
    addReg  rcx, rdx
611
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
612
                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
613
                .endif
614
    xorReg  rbx, rax
615
    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
616
    xorReg  rdx, rcx
617
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
618
                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
619
                    leaq 1(%r11,%rdi),%r11               #precompute key + injection number (rdi+1)
620
                .endif
621
    Skein_Debug_Round 256,%(4*_Rbase_+3)
622
    # round 4*_Rbase_ + 3
623
    addReg  rax, rdx
624
    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
625
    addReg  rcx, rbx
626
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
627
                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
628
                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
629
                .endif
630
    xorReg  rdx, rax
631
    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
632
    xorReg  rbx, rcx
633
    Skein_Debug_Round 256,%(4*_Rbase_+4)
634
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
635
                    addReg r9 ,r13           #precompute key+tweak
636
                .endif
637
      #inject key schedule words
638
_Rbase_ = _Rbase_+1
639
  .if (SKEIN_ASM_UNROLL) & 256
640
    addReg    rax,r,%(8+((_Rbase_+0) % 5))
641
    addReg    rbx,rsi
642
    addReg    rcx,rdi
643
    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
644
  .else
645
    incq      %rdi
646
    addReg    rax,r8 
647
    addReg    rcx,r10
648
    addReg    rbx,r9 
649
    addReg    rdx,r11
650
  .endif
651
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
652
.endr #rept _UNROLL_CNT*2
653
#
654
.if ((SKEIN_ASM_UNROLL) & 256) == 0
655
    cmpq    $2*(ROUNDS_256/8),%rdi   #one injection per four rounds
656
    jb      Skein_256_round_loop
657
.endif # (SKEIN_ASM_UNROLL & 256) == 0
658
    # Feedforward and loop control for Skein_256.
    # rdi is reloaded in every build because both round-loop variants
    # overwrite it.  r14 becomes T[1] with the FIRST_MASK64 bit cleared; it
    # feeds the next block and is written back to the context only after the
    # last block.
    # NOTE(review): blkCnt is decremented after a block has been processed,
    # so a count of zero on entry would wrap around -- confirm that callers
    # never pass 0.
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
659

660
    #----------------------------
661
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
662
    movq    $FIRST_MASK64 ,%r14
663
    xorq    Wcopy + 0+F_O (%rbp),%rax
664
    xorq    Wcopy + 8+F_O (%rbp),%rbx
665
    xorq    Wcopy +16+F_O (%rbp),%rcx
666
    xorq    Wcopy +24+F_O (%rbp),%rdx
667
    andq    TWEAK + 8     (%rdi),%r14
668
    movq    %rax,X_VARS+ 0(%rdi)             #store final result
669
    movq    %rbx,X_VARS+ 8(%rdi)        
670
    movq    %rcx,X_VARS+16(%rdi)        
671
    movq    %rdx,X_VARS+24(%rdi)        
672

673
    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
674

675
    # go back for more blocks, if needed
676
    decq    blkCnt+F_O(%rbp)
677
    jnz     Skein_256_block_loop
678
    movq    %r14,TWEAK + 8(%rdi)             #store T[1] with the first-block bit cleared
679
    Reset_Stack
680
    ret
681
Skein_256_Process_Block_End:
682

683
  # Skein_Debug_Round_256: reached by call from the Skein_Debug_Round macro,
  # which has already pushed the original rdx and placed the round number in
  # rdx.  Copies X0..X3 to X_stk and sets rdi = 256 and rsi = context pointer
  # before jumping to Skein_Debug_Round_Common (not visible in this part of
  # the file; presumably it restores rsi/rdi and returns -- confirm).
  .if _SKEIN_DEBUG
684
Skein_Debug_Round_256:               #here with rdx == round "number" from macro
685
    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
686
    pushq   %rdi
687
    # stack here: 0=rdi, 8=rsi, 16=return address, 24=rdx pushed by the macro
    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
688
    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
689
    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(address via rbp since rsp has changed!)
690
    movq    %rcx,X_stk+16+F_O(%rbp)
691
    movq    %rdi,X_stk+24+F_O(%rbp)
692

693
    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
694
    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
695
    jmp     Skein_Debug_Round_Common
696
  .endif
697
#
698
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is nonzero.
#   Skein_256_Process_Block_CodeSize returns in rax the byte distance from
#     Skein_256_Process_Block to Skein_256_Process_Block_End.
#   Skein_256_Unroll_Cnt returns in rax the unroll count, or 0 when the
#     round loop is fully unrolled.
.if _SKEIN_CODE_SIZE
699
C_label  Skein_256_Process_Block_CodeSize
700
    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
701
    ret
702
#
703
C_label Skein_256_Unroll_Cnt
704
  .if _UNROLL_CNT <> ROUNDS_256/8
705
    movq    $_UNROLL_CNT,%rax
706
  .else
707
    xorq    %rax,%rax
708
  .endif
709
    ret
710
.endif
711
#
712
.endif #_USE_ASM_ & 256
713
#
714
#=================================== Skein_512 =============================================
715
#
716
.if _USE_ASM_ & 512
717
#
718
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
719
#
720
# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
721
#
722
#################
723
# MACRO: one round for 512-bit blocks
724
#
725
# R_512_OneRound: one round (four mix operations) on eight state registers.
#   rn0..rn7  register numbers; the words are r<rn0>..r<rn7>, mixed in the
#             pairs (rn0,rn1) (rn2,rn3) (rn4,rn5) (rn6,rn7)
#   _Rn_      round number; taken mod 8 to select the rotation constants
#   op1..op4  optional statements emitted after mix 1..4, used by the caller
#             to interleave key schedule loads and stores with the mixes
# Each mix is: even += odd, odd = rotate-left(odd), odd ^= even.
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
726
#
727
    addReg      r\rn0, r\rn1
728
    RotL64      r\rn1, 512,%((\_Rn_) % 8),0
729
    xorReg      r\rn1, r\rn0
730
            \op1
731
    addReg      r\rn2, r\rn3
732
    RotL64      r\rn3, 512,%((\_Rn_) % 8),1
733
    xorReg      r\rn3, r\rn2
734
            \op2
735
    addReg      r\rn4, r\rn5
736
    RotL64      r\rn5, 512,%((\_Rn_) % 8),2
737
    xorReg      r\rn5, r\rn4
738
            \op3
739
    addReg      r\rn6, r\rn7
740
    RotL64      r\rn7, 512,%((\_Rn_) % 8),3
741
    xorReg      r\rn7, r\rn6
742
            \op4
743
    Skein_Debug_Round 512,%(\_Rn_+1),-4
744
#
745
.endm #R_512_OneRound
746
#
747
#################
748
# MACRO: four rounds plus one key injection for 512-bit blocks
749
#   _RR_ is the number of the first of the four rounds.
#   The word permutation is done by renaming: each R_512_OneRound call lists
#   the registers in permuted order, and after four rounds the words are back
#   in r8..r15 order.
#   Fully unrolled: the injection number _II_ is an assembly-time constant and
#     key/tweak words are read from fixed ksKey/ksTwk slots into rax..rsi.
#   Looping: rdi is the injection number; key/tweak words are read at
#     rdi-relative slots and stored again further up the stack (the rotate).
#
750
.macro R_512_FourRounds _RR_    #RR = base round number (0 % 8)
751
  .if ((SKEIN_ASM_UNROLL) & 512)
752
    # here for fully unrolled case.
753
    _II_ = ((\_RR_)/4) + 1       #key injection counter
754
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
755
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
756
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
757
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
758
    # inject the key schedule
759
    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
760
    addReg   r11, rax
761
    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
762
    addReg   r12, rbx
763
    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
764
    addReg   r13, rcx
765
    addReg   r14, rdx
766
    addReg   r15, rsi,,,(_II_)
767
  .else
768
    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
769
    incq    %rdi                 #bump key injection counter
770
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
771
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
772
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
773
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
774
    # inject the key schedule
775
    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
776
    addReg   r11, rax
777
    addReg   r12, rbx
778
    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
779
    addReg   r13, rcx
780
    addReg   r14, rdx
781
    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
782
    addReg   r15, rsi
783
    addReg   r15, rdi              #inject the round number
784
  .endif
785

786
    #show the result of the key injection
787
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
788
.endm #R_512_FourRounds
789
#
790
#################
791
# instantiated code
792
#
793
# Skein_512_Process_Block entry and per-block key/tweak schedule setup.
# In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd (saved to the
#      stack by Setup_Stack).
# rbx carries tweak word T[1] into Skein_512_block_loop: it is loaded from the
# context here, and the feedforward code reloads it before looping back.
# Each pass: T[0] += bitAdd (written back to the context); ksTwk[0..2] =
# T[0], T[1], T[0]^T[1]; r8..r15 = chaining vars, also stored to ksKey[0..7];
# ksKey[8] = KW_PARITY XOR all eight chaining vars.
C_label Skein_512_Process_Block
794
    Setup_Stack 512,ROUNDS_512/8
795
    movq    TWEAK+ 8(%rdi),%rbx
796
    jmp     Skein_512_block_loop
797
    .p2align 4
798
    # main hash loop for Skein_512
799
Skein_512_block_loop:
800
    # general register usage:
801
    #   RAX..RDX       = temps for key schedule pre-loads
802
    #   R8 ..R15       = X0..X7
803
    #   RSP, RBP       = stack/frame pointers
804
    #   RDI            = round counter or context pointer
805
    #   RSI            = temp
806
    #
807
    movq    TWEAK +  0(%rdi),%rax
808
    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
809
    movq    %rbx,%rcx
810
    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
811
    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
812
    movq    %rax,ksTwk+ 0+F_O(%rbp)
813
    movq    $KW_PARITY,%rdx
814
    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
815
    movq    %rbx,ksTwk+ 8+F_O(%rbp)
816
    movq    %rcx,ksTwk+16+F_O(%rbp)
817
    .irp _Rn_,8,9,10,11,12,13,14,15
818
      movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
819
      xorq  %r\_Rn_,%rdx              #compute overall parity
820
      movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
821
    .endr                             #load state into %r8 ..%r15, compute parity
822
      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
823

824
    # Initial key injection for Skein_512.
    # On entry r8..r15 hold the key words, rax = T[0], rbx = T[1] and rsi
    # points at the input block.  The tweak words are added to X5/X6 first
    # because rax and rbx are reused for message words right after.  The raw
    # message words are kept in Wcopy for the feedforward; addReg leaves its
    # source register unchanged.
    addReg   r13,rax                  #precompute key injection for tweak
825
    addReg   r14, rbx
826
.if _SKEIN_DEBUG
827
    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
828
.endif
829
    movq     0(%rsi),%rax             #load input block
830
    movq     8(%rsi),%rbx 
831
    movq    16(%rsi),%rcx 
832
    movq    24(%rsi),%rdx 
833
    addReg   r8 , rax                 #do initial key injection
834
    addReg   r9 , rbx
835
    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
836
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
837
    addReg   r10, rcx
838
    addReg   r11, rdx
839
    movq    %rcx,Wcopy+16+F_O(%rbp)
840
    movq    %rdx,Wcopy+24+F_O(%rbp)
841

842
    movq    32(%rsi),%rax
843
    movq    40(%rsi),%rbx 
844
    movq    48(%rsi),%rcx 
845
    movq    56(%rsi),%rdx
846
    addReg   r12, rax
847
    addReg   r13, rbx
848
    addReg   r14, rcx
849
    addReg   r15, rdx
850
    movq    %rax,Wcopy+32+F_O(%rbp)    
851
    movq    %rbx,Wcopy+40+F_O(%rbp)    
852
    movq    %rcx,Wcopy+48+F_O(%rbp)    
853
    movq    %rdx,Wcopy+56+F_O(%rbp)    
854

855
.if _SKEIN_DEBUG
856
    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
857
      movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
858
    .endr
859

860
    Skein_Debug_Block 512             #debug dump
861
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
862
.endif
863
    addq    $8*WCNT,%rsi              #skip the block
864
    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
865
    #
866
    #################
867
    # now the key schedule is computed. Start the rounds
868
    #
869
# Skein-512 rounds: R_512_FourRounds is expanded _UNROLL_CNT*2 times.
# Looping builds zero rdi, use it as the key injection counter and repeat
# until it reaches 2*(ROUNDS_512/8), then reload rdi with the context pointer.
# Fully unrolled builds do not use rdi in the rounds, so no reload is emitted.
.if (SKEIN_ASM_UNROLL) & 512
870
_UNROLL_CNT =   ROUNDS_512/8
871
.else
872
_UNROLL_CNT =   SKEIN_UNROLL_512
873
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
874
    .error "Invalid SKEIN_UNROLL_512"
875
  .endif
876
    xorq    %rdi,%rdi                 #rdi = round counter
877
Skein_512_round_loop:
878
.endif
879
#
880
_Rbase_ = 0
881
.rept _UNROLL_CNT*2
882
      R_512_FourRounds %(4*_Rbase_+00)
883
_Rbase_ = _Rbase_+1
884
.endr #rept _UNROLL_CNT*2
885
#
886
.if ((SKEIN_ASM_UNROLL) & 512) == 0
887
    cmpq    $2*(ROUNDS_512/8),%rdi
888
    jb      Skein_512_round_loop
889
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
890
.endif
891
    # Feedforward and loop control for Skein_512; rdi is the context pointer.
    # rbx becomes T[1] with the FIRST_MASK64 bit cleared; it feeds the next
    # block and is written back to the context only after the last block.
    # NOTE(review): blkCnt is decremented after a block has been processed,
    # so a count of zero on entry would wrap around -- confirm that callers
    # never pass 0.
    # end of rounds
892
    #################
893
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
894
    .irp _Rn_,8,9,10,11,12,13,14,15
895
  .if (\_Rn_ == 8)
896
    movq    $FIRST_MASK64,%rbx
897
  .endif
898
      xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
899
      movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)     #and store result
900
  .if (\_Rn_ == 14)
901
    andq    TWEAK+ 8(%rdi),%rbx
902
  .endif
903
    .endr
904
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
905

906
    # go back for more blocks, if needed
907
    decq    blkCnt+F_O(%rbp)
908
    jnz     Skein_512_block_loop
909
    movq    %rbx,TWEAK + 8(%rdi)                 #store T[1] with the first-block bit cleared
910

911
    Reset_Stack
912
    ret
913
Skein_512_Process_Block_End:
914
#
915
  .if _SKEIN_DEBUG
#----------------------------------------------------------------
# Skein_Debug_Round_512: debug-only trampoline for the 512-bit code.
#
# Entry:  rdx = "round number"; r8..r15 = the eight state words;
#         rbp = frame pointer (locals are addressed as name+F_O(%rbp)).
# Action: pushes rsi and rdi (Skein_Debug_Round_Common pops them again
#         just before it returns), spills r8..r15 into X_stk[0..7], then
#         jumps to the common code with rdi = 512 and rsi = ctxPtr.
#----------------------------------------------------------------
Skein_Debug_Round_512:
    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
    pushq   %rdi
  .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
  .endr
    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
    movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common #tail jump: the common code returns to our caller
  .endif
927
#
928
.if _SKEIN_CODE_SIZE
#----------------------------------------------------------------
# Query helpers for the 512-bit code, assembled only when
# _SKEIN_CODE_SIZE is non-zero.  Both take no arguments and
# return their result in rax.
#----------------------------------------------------------------
#
# Returns the byte distance between the labels Skein_512_Process_Block
# and Skein_512_Process_Block_End.
C_label Skein_512_Process_Block_CodeSize
    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
    ret
#
# Returns the _UNROLL_CNT value used for the 512-bit round loop, or 0 when
# it equals ROUNDS_512/8 (as it does when the rounds are fully unrolled).
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT <> (ROUNDS_512/8)
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
941
#
942
.endif # _USE_ASM_ & 512
943
#
944
#=================================== Skein1024 =============================================
945
.if _USE_ASM_ & 1024
946
#
947
# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
948
#
949
#################
950
# use details of permutation to make register assignments
951
# 
952
# o1K_<reg> = index in X[] of the state word that <reg> carries during the
# 1024-bit rounds.  The same index selects the matching ksKey[], Wcopy[] and
# X_stk[] slot.  There is no entry for word 6: it shares rcx with word 4.
o1K_rdi =  0        #offsets in X[] associated with each register
o1K_rsi =  1
o1K_rbp =  2
o1K_rax =  3
o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
o1K_rbx =  5
o1K_rdx =  7
o1K_r8  =  8
o1K_r9  =  9
o1K_r10 = 10
o1K_r11 = 11
o1K_r12 = 12
o1K_r13 = 13
o1K_r14 = 14
o1K_r15 = 15
#
# stack slot holding the key-injection index in the looping (not fully
# unrolled) 1024-bit build
rIdx_offs = tmpStk_1024
969
#
970
#----------------------------------------------------------------
# MACRO: one MIX step (addReg, RotL64, xorReg) on a pair of 1024-bit state
#        words, followed by inline key injection on every fourth round.
#
#   w0,w1      indices in X[] of the two words being mixed
#   reg0,reg1  names (without the percent sign) of the registers holding them
#   _RN0_      round number: (_RN0_ mod 8) is passed to RotL64, and key
#              injection is emitted when (_RN0_ & 3) == 3
#   _Rn1_      forwarded unchanged as the last RotL64 argument
#              (presumably the pair index within the round -- confirm
#              against the RotL64 definition)
#   op1        optional extra instruction, emitted after everything else
#
# Key injection has two forms:
#   - fully unrolled: the injection number _II_ is an assembly-time
#     constant, so every ksKey/ksTwk operand is a fixed frame offset;
#   - looping: the injection index lives in the rIdx_offs stack slot.  The
#     w0 == 0 invocation spills X0 (rdi) to X_stk and loads that index into
#     rdi, so every later pair of the same round can use rdi as an index
#     register.  X0 itself is injected at the end of r1024_FourRounds.
#----------------------------------------------------------------
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
    addReg      \reg0 , \reg1                      #perform the MIX
    RotL64      \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
    xorReg      \reg1 , \reg0
.if ((\_RN0_) & 3) == 3        #time to do key injection?
 .if _SKEIN_DEBUG
    movq       %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
    movq       %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
 .endif
_II_ = ((\_RN0_)/4)+1           #injection count
 .if (SKEIN_ASM_UNROLL) & 1024   #here to do fully unrolled key injection
    addq        ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
    addq        ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
  .if     \w1 == 13                                #tweak injection
    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
  .elseif \w0 == 14
    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
  .elseif \w1 == 15
    addq        $_II_, %\reg1                      #(injection counter)
  .endif
 .else                          #here to do looping  key injection
  .if  (\w0 == 0)
    movq        %rdi, X_stk+8*\w0(%rsp)            #word 0: store X0 so we can use its reg as index
    movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
  .else
    addq         ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
  .endif
  .if     \w1 == 13                                #tweak injection
    addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
  .elseif \w0 == 14
    addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
  .elseif \w1 == 15
    addReg      \reg1,rdi,,,1                      #(injection counter)
  .endif
    addq         ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
 .endif
.endif
    # insert the op provided, .if any
    \op1
.endm
1010
#################
1011
# MACRO: four rounds for 1024-bit blocks
1012
#
1013
#----------------------------------------------------------------
# r1024_FourRounds _RR_ : emit rounds _RR_ .. _RR_+3 of the 1024-bit code.
#
# Precondition:  rcx holds X4 and X6 is parked in X_stk[6].
# Each round issues eight r1024_Mix invocations.  rcx alternately carries
# X4 and X6: the third Mix of a round parks the word rcx currently holds,
# and the fourth Mix reloads the other one (both via the op1 argument).
# The fourth round (_RR_+3) also performs the inline key injection.
#
# In the looping build the macro finishes by rotating the key schedule on
# the stack: ksKey[idx] is copied to ksKey[idx+17] and ksTwk[idx] to
# ksTwk[idx+3], the index is incremented and saved in rIdx_offs, and the
# deferred key injection for X0 is applied.
#----------------------------------------------------------------
.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
    # should be here with X4 set properly, X6 stored on stack
_Rn_ = (\_RR_) + 0
        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
        r1024_Mix  2, 3,rbp,rax,_Rn_,1
        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
        r1024_Mix 10,11,r10,r11,_Rn_,5
        r1024_Mix 12,13,r12,r13,_Rn_,6
        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
        r1024_Mix 14,15,r14,r15,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 1
        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
        r1024_Mix  2,13,rbp,r13,_Rn_,1
        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
        r1024_Mix 12, 3,r12,rax,_Rn_,5
        r1024_Mix 14, 5,r14,rbx,_Rn_,6
        r1024_Mix  4,15,rcx,r15,_Rn_,3
        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 2
        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
        r1024_Mix 14,13,r14,r13,_Rn_,5
        r1024_Mix  8,11,r8 ,r11,_Rn_,6
        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 3
        r1024_Mix  0,15,rdi,r15,_Rn_,0
        r1024_Mix  2,11,rbp,r11,_Rn_,1
        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
        r1024_Mix 10, 3,r10,rax,_Rn_,6
        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
        r1024_Mix 12, 7,r12,rdx,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif

  .if ((SKEIN_ASM_UNROLL) & 1024) == 0           #here with rdi == rIdx, X0 on stack
    #"rotate" the key schedule on the stack
i8 = o1K_r8
i0 = o1K_rdi
    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
    movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
    movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
    incq    %rdi                                #bump the index
    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
    addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection
  .endif
    #show the result of the key injection
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
.endm #r1024_FourRounds
1082
#
1083
################
1084
# code
1085
#
1086
#----------------------------------------------------------------
# Skein1024_Process_Block: hash blkCnt 1024-bit message blocks into *ctx.
#
# On entry rdi = ctx.  Setup_Stack (defined earlier in this file) builds the
# stack frame; the code below expects it to provide the ctxPtr, blkPtr,
# blkCnt and bitAdd slots plus the ksKey/ksTwk/Wcopy/X_stk work areas, and
# to leave rbp pointing at the frame so that name+F_O(%rbp) and name(%rsp)
# address the same slot.
#
# Per block: T0 += bitAdd, build the key/tweak schedule, run the rounds,
# then store X[i] ^ msg[i] back into ctx->X[].  T1 is carried between
# blocks in r9 (ANDed with FIRST_MASK64 after each block) and written
# back to the context only after the last block.
#----------------------------------------------------------------
C_label Skein1024_Process_Block
#
    Setup_Stack 1024,ROUNDS_1024/8,WCNT
    movq    TWEAK+ 8(%rdi),%r9             #r9 = T1, kept live across block iterations
    jmp     Skein1024_block_loop
    # main hash loop for Skein1024
    .p2align 4
Skein1024_block_loop:
    # register usage at loop entry:
    #   RDI              = ctx pointer
    #   RBP              = frame pointer (locals at name+F_O(RBP))
    #   R9               = tweak word T1
    # register usage during the rounds (see the o1K_ table):
    #   RSP              = stack pointer
    #   RDI,RSI,RBP,RAX  = X0..X3     (state words)
    #   RCX              = X4 or X6   (they alternate; the other is in X_stk[])
    #   RBX,RDX          = X5,X7      (state words)
    #   R8 ..R15         = X8..X15    (state words)
    #
  .if ((SKEIN_ASM_UNROLL) & 1024) == 0
    xorq    %rax,%rax                      #init loop index on the stack
    movq    %rax,rIdx_offs(%rsp)
  .endif
    movq         TWEAK+     0(%rdi),%r8
    addq         bitAdd+  F_O(%rbp),%r8    #computed updated tweak value T0
    movq    %r9 ,%r10
    xorq    %r8 ,%r10                      #r8/r9/r10 = tweak schedule (T0, T1, T0^T1)
    movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
    movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
    movq    %r10,ksTwk+16+F_O(%rbp)
  .if _SKEIN_DEBUG
    movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
  .endif
    movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
    movq        $KW_PARITY        ,%rax    #overall key schedule parity

    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
    # (those four registers are still busy as ctx ptr, msg ptr, frame ptr and
    #  parity accumulator, so words 0..4 and 6 are staged in X_stk[] for now)
    .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
      movq       X_VARS+8*\_rN_(%rdi),%r14 #get state word
      movq              8*\_rN_(%rsi),%r15 #get msg   word
      xorq  %r14,%rax                      #update key schedule overall parity
      movq  %r14,ksKey +8*\_rN_+F_O(%rbp)  #save key schedule word on stack
      movq  %r15,Wcopy +8*\_rN_+F_O(%rbp)  #save local msg Wcopy
      addq  %r15,%r14                      #do the initial key injection
      movq  %r14,X_stk +8*\_rN_    (%rsp)  #save initial state var on stack
    .endr
    # now process the rest, using the "real" registers
    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
_oo_ = o1K_\_rr_                           #offset associated with the register
      movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
      movq         8*_oo_(%rsi),%rcx       #get next input msg word
      movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
      xorq  %\_rr_, %rax                   #accumulate key schedule parity
      movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
      addq  %rcx,%\_rr_                    #do the initial  key  injection
      .if    _oo_ == 13                    #do the initial tweak injection
        addReg \_rr_,r8                    #          (only in words 13/14)
      .elseif _oo_ == 14
        addReg \_rr_,r9
      .endif
    .endr
    movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
.if _SKEIN_DEBUG
    Skein_Debug_Block 1024                 #initial debug dump
.endif
    addq     $8*WCNT,%rsi                  #bump the msg ptr
    movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
    # re-load words 0..4 from stack, enter the main loop
    # (from here on rdi and rbp hold state words: address the frame via rsp only)
    .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
      movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
    .endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
.endif
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 1024
_UNROLL_CNT =   ROUNDS_1024/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_1024
  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_1024"
  .endif
Skein1024_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
      r1024_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 1024) == 0
    cmpq    $2*(ROUNDS_1024/8),rIdx_offs(%rsp)   #all key injections done yet?
    jb      Skein1024_round_loop
.endif
    # end of rounds
    #################
    #
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
    movq    %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
    movq       ctxPtr(%rsp),%rdx       #rdx = ctx pointer for the stores below

    .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
_oo_ = o1K_\_rr_
      xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
      movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
      .if (_oo_ ==  9)
        movq   $FIRST_MASK64 ,%r9      #X9 is stored, so r9 is free: start next T1
      .endif
      .if (_oo_ == 14)
        andq   TWEAK+ 8(%rdx),%r9      #r9 = T1 & FIRST_MASK64
      .endif
    .endr
    #
    movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
    movq         X_stk +8*7(%rsp),%rbx
    xorq         Wcopy +8*6(%rsp),%rax
    xorq         Wcopy +8*7(%rsp),%rbx
    movq    %rax,X_VARS+8*6(%rdx)
    decq             blkCnt(%rsp)      #set zero flag iff done
    movq    %rbx,X_VARS+8*7(%rdx)

    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
    # go back for more blocks, if needed
    movq             ctxPtr(%rsp),%rdi #do not disturb the flags here: jnz below uses them
    lea          FRAME_OFFS(%rsp),%rbp #restore the frame pointer (rbp was X2)
    jnz     Skein1024_block_loop
    movq    %r9 ,TWEAK+   8(%rdx)      #last block: write the masked T1 back
    Reset_Stack
    ret
1216
#
1217
Skein1024_Process_Block_End:
1218
#
1219
.if _SKEIN_DEBUG
#----------------------------------------------------------------
# Skein_Debug_Round_1024: debug-only trampoline for the 1024-bit code.
#
# Entry:  rdx = "round number"; the caller-side macro has already pushed the
#         original rdx (state word X7), so the stack holds <retAddr, saved rdx>.
# Action: copies the live state registers into X_stk[], working out from the
#         round number whether rdi (X0) must be saved and whether rcx holds
#         X4 or X6, then jumps to Skein_Debug_Round_Common with rdi = 1024
#         and rsi = ctxPtr.  rsp-relative offsets are biased by _SP_OFFS_ to
#         account for what has been pushed since the frame was set up.
#----------------------------------------------------------------
Skein_Debug_Round_1024:
    # call here with rdx  = "round number",
_SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
    #
  #save rest of X[] state on stack so debug routines can access it
  .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
    movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
  .endr
    # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it is already on stack
    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
    jae     save_x0
    testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
    jz      save_x0_not
save_x0:
    movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
save_x0_not:
    #figure out the x4/x6 swapping state and save the correct one!
    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
    jae     save_x4
    testq   $1,%rdx                  #and even ones have x4 in rcx as well
    jz      save_x4
    movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)  #odd round number: rcx holds x6
    jmp     debug_1024_go
save_x4:
    movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
debug_1024_go:
    #now all is saved in Xstk[] except for rdx
    pushq   %rsi                    #save two regs for BLK_BITS-specific parms
    pushq   %rdi
_SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)

    movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
    movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]

    movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
    movq    $1024,%rdi                   #rdi = block size
    jmp     Skein_Debug_Round_Common
.endif
1258
#
1259
.if _SKEIN_CODE_SIZE
#----------------------------------------------------------------
# Query helpers for the 1024-bit code, assembled only when
# _SKEIN_CODE_SIZE is non-zero.  Both take no arguments and
# return their result in rax.
#----------------------------------------------------------------
#
# Returns the byte distance between the labels Skein1024_Process_Block
# and Skein1024_Process_Block_End.
C_label Skein1024_Process_Block_CodeSize
    movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
    ret
#
# Returns the _UNROLL_CNT value used for the 1024-bit round loop, or 0 when
# it equals ROUNDS_1024/8 (as it does when the rounds are fully unrolled).
C_label Skein1024_Unroll_Cnt
  .if _UNROLL_CNT <> (ROUNDS_1024/8)
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
1272
#
1273
.endif # _USE_ASM_ and 1024
1274
#
1275
.if _SKEIN_DEBUG
#----------------------------------------------------------------
#local debug routine to set up for calls to:
#  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
#                       [       rdi                        rsi   rdx              rcx]
#
# here with %rdx = round number
#           %rsi = ctx_hdr_ptr
#           %rdi = block size (256/512/1024)
# on stack: saved rdi, saved rsi, retAddr, saved rdx
#
# Reached by a jump from the per-size trampolines.  Saves the remaining
# registers, selects the X[] pointer to show, calls Skein_Show_Round, then
# restores everything it pushed plus the saved rdi/rsi and returns through
# retAddr.  The saved rdx is left on the stack for the caller to remove.
#
Skein_Debug_Round_Common:
_SP_OFFS_ = 32                        #account for four words on stack already
  .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
    pushq %\_rr_
_SP_OFFS_ = _SP_OFFS_+8
  .endr
  .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
    .error  "Debug_Round_Common: stack alignment"
  .endif
    # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
    leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
    cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
    jnz     _got_rcxA
    leaq    X_VARS(%rsi),%rcx          #feedforward: show ctx->X[] itself
_got_rcxA:
  .if _USE_ASM_ & 1024
    # special handling for 1024-bit case
    #    (for rounds right before key injection:
    #        use xDebug_1024[] instead of X_stk[])
    cmpq    $SKEIN_RND_SPECIAL,%rdx
    jae     _got_rcxB               #special "round": keep rcx as computed above
    testq   %rdx,%rdx
    jz      _got_rcxB               #round 0: keep rcx
    testq   $3,%rdx
    jne     _got_rcxB               #not a multiple of 4: no key injection follows
    cmpq    $1024,%rdi              #only 1024-bit(s) for now
    jne     _got_rcxB
    leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx  #normal round just before key injection
_got_rcxB:
  .endif
    call    Skein_Show_Round        #call external debug handler

  .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
    popq  %\_rr_
_SP_OFFS_ = _SP_OFFS_-8
  .endr
  .if _SP_OFFS_ - 32
    .error   "Debug_Round_Common: push/pop misalignment!"
  .endif
    popq    %rdi
    popq    %rsi
    ret
.endif
1329
#----------------------------------------------------------------
1330
    .section .note.GNU-stack,"",@progbits
1331

1332
    .end
1333

1334
Product

Resources

Company