GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/sha1-avx2-asm.S
/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <[email protected]>
 * Maxim Locktyukhin <[email protected]>
 * Ronen Zohar <[email protected]>
 * Chandramouli Narayanan <[email protected]>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release; see
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * void sha1_transform_avx2(struct sha1_block_state *state,
 *                          const u8 *data, size_t nblocks);
 */

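/*
 * Calling convention note: *state holds the five 32-bit SHA-1 chaining
 * words (loaded from offsets 0..16 of HASH_PTR below), data points to
 * nblocks consecutive 64-byte message blocks, and state is updated in
 * place. Padding of a final partial block is the caller's responsibility.
 */
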
#include <linux/linkage.h>

#define CTX %rdi /* arg1 */
#define BUF %rsi /* arg2 */
#define CNT %rdx /* arg3 */

#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %eax
#define REG_E %edx
#define REG_TB %ebx
#define REG_TA %r12d
#define REG_RA %rcx
#define REG_RB %rsi
#define REG_RC %rdi
#define REG_RD %rax
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3

.macro REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set TB, REG_TB
        .set TA, REG_TA

        .set RA, REG_RA
        .set RB, REG_RB
        .set RC, REG_RC
        .set RD, REG_RD
        .set RE, REG_RE

        .set RTA, REG_RTA
        .set RTB, REG_RTB

        .set T1, REG_T1
.endm

#define HASH_PTR %r9
#define BLOCKS_CTR %r8
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13

#define PRECALC_BUF %r14
#define WK_BUF %r15

#define W_TMP %xmm0
#define WY_TMP %ymm0
#define WY_TMP2 %ymm9

# AVX2 variables
#define WY0 %ymm3
#define WY4 %ymm5
#define WY08 %ymm7
#define WY12 %ymm8
#define WY16 %ymm12
#define WY20 %ymm13
#define WY24 %ymm14
#define WY28 %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 * - 80 DWORDs per iteration * 2
 */
#define W_SIZE (80*2*2 +16)
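/*
 * W_SIZE is counted in 32-bit words (the +16 is slack); SHA1_VECTOR_ASM
 * below reserves W_SIZE*4 bytes of stack for it, split into the two
 * precalc/consume halves addressed through PRECALC_BUF and WK_BUF.
 */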

#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
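/*
 * WK_BUF layout: each group of four rounds occupies 32 bytes, the low 16
 * bytes holding W+K for the first block (t < 80) and the high 16 bytes
 * for the second block (t >= 80). For example, WK(5) resolves to offset
 * (5/4)*32 + (5%4)*4 = 36 and WK(85) to 36 + 16. PRECALC_WK(i&~7) stores
 * each freshly computed 32-byte group at the matching offset of the half
 * that is currently being filled.
 */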

.macro UPDATE_HASH hash, val
        add \hash, \val
        mov \val, \hash
.endm

.macro PRECALC_RESET_WY
        .set WY_00, WY0
        .set WY_04, WY4
        .set WY_08, WY08
        .set WY_12, WY12
        .set WY_16, WY16
        .set WY_20, WY20
        .set WY_24, WY24
        .set WY_28, WY28
        .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
        /* Rotate macros */
        .set WY_32, WY_28
        .set WY_28, WY_24
        .set WY_24, WY_20
        .set WY_20, WY_16
        .set WY_16, WY_12
        .set WY_12, WY_08
        .set WY_08, WY_04
        .set WY_04, WY_00
        .set WY_00, WY_32

        /* Define register aliases */
        .set WY, WY_00
        .set WY_minus_04, WY_04
        .set WY_minus_08, WY_08
        .set WY_minus_12, WY_12
        .set WY_minus_16, WY_16
        .set WY_minus_20, WY_20
        .set WY_minus_24, WY_24
        .set WY_minus_28, WY_28
        .set WY_minus_32, WY
.endm

.macro PRECALC_00_15
        .if (i == 0) # Initialize and rotate registers
                PRECALC_RESET_WY
                PRECALC_ROTATE_WY
        .endif

        /* message scheduling pre-compute for rounds 0-15 */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                        WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_16_31
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 XMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                /* w[i-14] */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor WY_minus_08, WY, WY
                vpxor WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpxor WY_TMP, WY, WY
                vpslldq $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld $1, WY, WY_TMP
                vpsrld $31, WY, WY
        .elseif ((i & 7) == 4)
                vpor WY, WY_TMP, WY_TMP
                vpslld $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld $30, WY_TMP2, WY_TMP2
                vpxor WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor WY_TMP2, WY_TMP, WY
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_32_79
        /*
         * in SHA-1 specification:
         * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
         * instead we do equal:
         * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * allows more efficient vectorization
         * since w[i]=>w[i-3] dependency is broken
         */

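        /*
         * The two recurrences agree for i >= 32: expanding each term of
         * the rol-1 form once more, the w[i-11], w[i-17], w[i-19],
         * w[i-22], w[i-24] and w[i-30] terms cancel in pairs, leaving
         * w[i] = ((w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 1) rol 1.
         */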
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
                vpxor WY_TMP, WY, WY
        .elseif ((i & 7) == 4)
                vpslld $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
                vpsrld $30, WY, WY
                vpor WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

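/*
 * PRECALC is invoked with i = 0..159: every group of eight i values
 * produces four W+K words for each of the two pipelined blocks, so i
 * runs at twice the schedule index. The thresholds below (32/64/160 and
 * 40/80/120) therefore correspond to schedule words 16/32/80 and to the
 * K1..K4 switches at words 20/40/60.
 */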
.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i<32)
                PRECALC_00_15 \s
        .elseif (i<64)
                PRECALC_16_31 \s
        .elseif (i < 160)
                PRECALC_32_79 \s
        .endif
.endm

.macro ROTATE_STATE
        .set T_REG, E
        .set E, D
        .set D, C
        .set C, B
        .set B, TB
        .set TB, A
        .set A, T_REG

        .set T_REG, RE
        .set RE, RD
        .set RD, RC
        .set RC, RB
        .set RB, RTB
        .set RTB, RA
        .set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
        .if (\f == RND_F1)
                ROUND_F1 \r
        .elseif (\f == RND_F2)
                ROUND_F2 \r
        .elseif (\f == RND_F3)
                ROUND_F3 \r
        .endif
.endm

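/*
 * RR processes two rounds, \r and \r+1. Each ROUND_Fx consumes the F
 * value computed by the previous round and precomputes F for the next
 * one, which is why ROUND_FUNC switches at round_id 18/38/58 rather
 * than at the 20/40/60 boundaries of the SHA-1 round functions.
 */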
.macro RR r
        .set round_id, (\r % 80)

        .if (round_id == 0) /* Precalculate F for first round */
                .set ROUND_FUNC, RND_F1
                mov B, TB

                rorx $(32-30), B, B /* b>>>2 */
                andn D, TB, T1
                and C, TB
                xor T1, TB
        .endif

        RND_FUN ROUND_FUNC, \r
        ROTATE_STATE

        .if (round_id == 18)
                .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
                .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
                .set ROUND_FUNC, RND_F2
        .endif

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        ROTATE_STATE
.endm

.macro ROUND_F1 r
        add WK(\r), E

        andn C, A, T1 /* ~b&d */
        lea (RE,RTB), E /* Add F from the previous round */

        rorx $(32-5), A, TA /* T2 = A >>> 5 */
        rorx $(32-30), A, TB /* b>>>2 for next round */

        PRECALC (\r) /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round
         * (b & c) ^ andn[b, d]
         */
        and B, A /* b&c */
        xor T1, A /* F1 = (b&c) ^ (~b&d) */

        lea (RE,RTA), E /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
        add WK(\r), E
        lea (RE,RTB), E /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx $(32-5), A, TA /* T2 = A >>> 5 */
        .if ((round_id) < 79)
                rorx $(32-30), A, TB /* b>>>2 for next round */
        .endif
        PRECALC (\r) /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)
                xor B, A
        .endif

        add TA, E /* E += A >>> 5 */

        .if ((round_id) < 79)
                xor C, A
        .endif
.endm

.macro ROUND_F3 r
        add WK(\r), E
        PRECALC (\r) /* msg scheduling for next 2 blocks */

        lea (RE,RTB), E /* Add F from the previous round */

        mov B, T1
        or A, T1

        rorx $(32-5), A, TA /* T2 = A >>> 5 */
        rorx $(32-30), A, TB /* b>>>2 for next round */

        /*
         * Calculate F for the next round
         * (b and c) or (d and (b or c))
         */
        and C, T1
        and B, A
        or T1, A

        add TA, E /* E += A >>> 5 */

.endm

/*
 * Add a constant only if the count condition is met (uses RTA as a temp):
 * \a += (\b >= \c) ? \d : 0
 */
.macro ADD_IF_GE a, b, c, d
        mov \a, RTA
        add $\d, RTA
        cmp $\c, \b
        cmovge RTA, \a
.endm
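/*
 * For example, "ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128" advances
 * BUFFER_PTR by two 64-byte blocks only while at least four blocks
 * remain to be processed.
 */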

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

        REGALLOC

        mov (HASH_PTR), A
        mov 4(HASH_PTR), B
        mov 8(HASH_PTR), C
        mov 12(HASH_PTR), D
        mov 16(HASH_PTR), E

        mov %rsp, PRECALC_BUF
        lea (2*4*80+32)(%rsp), WK_BUF
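        /*
         * The reserved stack area is split into two halves 2*4*80+32
         * bytes apart: the rounds read W+K through WK_BUF from one half
         * while PRECALC fills the other through PRECALC_BUF, and the
         * xchg below swaps their roles for each new pair of blocks.
         */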

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept 160
                PRECALC i
                .set i, i + 1
        .endr

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg WK_BUF, PRECALC_BUF

        .align 32
.L_loop:
        /*
         * The code loops over all remaining blocks; BLOCKS_CTR reaching
         * zero signals that the last block has been processed.
         */
        test BLOCKS_CTR, BLOCKS_CTR
        jnz .L_begin
        .align 32
        jmp .L_end
        .align 32
.L_begin:

        /*
         * Do first block
         * rounds: 0,2,4,6,8
         */
        .set j, 0
        .rept 5
                RR j
                .set j, j+2
        .endr

        /*
         * rounds:
         * 10,12,14,16,18
         * 20,22,24,26,28
         * 30,32,34,36,38
         * 40,42,44,46,48
         * 50,52,54,56,58
         */
        .rept 25
                RR j
                .set j, j+2
        .endr

        /* Update Counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        /*
         * rounds
         * 60,62,64,66,68
         * 70,72,74,76,78
         */
        .rept 10
                RR j
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        test BLOCKS_CTR, BLOCKS_CTR
        jz .L_loop

        mov TB, B

        /* Process second block */
        /*
         * rounds
         * 0+80, 2+80, 4+80, 6+80, 8+80
         * 10+80,12+80,14+80,16+80,18+80
         */

        .set j, 0
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /*
         * rounds
         * 20+80,22+80,24+80,26+80,28+80
         * 30+80,32+80,34+80,36+80,38+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /*
         * rounds
         * 40+80,42+80,44+80,46+80,48+80
         * 50+80,52+80,54+80,56+80,58+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /* update counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        /*
         * rounds
         * 60+80,62+80,64+80,66+80,68+80
         * 70+80,72+80,74+80,76+80,78+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */
        mov A, TA
        mov TB, A
        mov C, TB
        mov E, C
        mov D, B
        mov TA, D

        REGALLOC

        xchg WK_BUF, PRECALC_BUF

        jmp .L_loop

        .align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
SYM_FUNC_START(\name)

        push %rbx
        push %r12
        push %r13
        push %r14
        push %r15

        RESERVE_STACK = (W_SIZE*4 + 8+24)

        /* Align stack */
        push %rbp
        mov %rsp, %rbp
        and $~(0x20-1), %rsp
        sub $RESERVE_STACK, %rsp

        avx2_zeroupper

        /* Setup initial values */
        mov CTX, HASH_PTR
        mov BUF, BUFFER_PTR

        mov BUF, BUFFER_PTR2
        mov CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        mov %rbp, %rsp
        pop %rbp

        pop %r15
        pop %r14
        pop %r13
        pop %r12
        pop %rbx

        RET

SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
        .long K1, K1, K1, K1
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4
        .long K4, K4, K4, K4
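/*
 * The four SHA-1 round constants, each replicated eight times so that a
 * single 256-bit vpaddd adds the constant to four schedule words of both
 * pipelined blocks at once; K_XMM selects one of the 32-byte groups above.
 */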

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
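/*
 * vpshufb control that reverses the bytes within each 32-bit word,
 * converting the big-endian message words to host order; the pattern is
 * repeated so both 128-bit lanes are shuffled identically.
 */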
.text

SHA1_VECTOR_ASM sha1_transform_avx2