GitHub Repository: torvalds/linux
Path: lib/crypto/x86/sha1-ssse3-and-avx.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <[email protected]>
 *            Ronen Zohar <[email protected]>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <[email protected]>
 */

#include <linux/linkage.h>

#define CTX %rdi // arg1
#define BUF %rsi // arg2
#define CNT %rdx // arg3

#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %r12d
#define REG_E %edx

#define REG_T1 %eax
#define REG_T2 %ebx

#define K_BASE %r8
#define HASH_PTR %r9
#define BUFFER_PTR %r10
#define BUFFER_END %r11

#define W_TMP1 %xmm0
#define W_TMP2 %xmm9

#define W0 %xmm1
#define W4 %xmm2
#define W8 %xmm3
#define W12 %xmm4
#define W16 %xmm5
#define W20 %xmm6
#define W24 %xmm7
#define W28 %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep a 64-byte window of 16 pre-calculated w[i]+K values in a circular buffer */
#define WK(t) (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16
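
/*
 * For illustration only (never assembled): a minimal C sketch of the
 * circular w[i]+K window kept in the 64-byte stack workspace. The names
 * wk_store() and W_AHEAD are invented for this sketch.
 *
 *      #include <stdint.h>
 *
 *      #define W_AHEAD 16      // mirrors W_PRECALC_AHEAD
 *
 *      // wk[] plays the role of the 64-byte area addressed by WK(t): slot
 *      // (t & 15) is refilled W_AHEAD rounds before round t consumes it.
 *      static void wk_store(uint32_t wk[16], uint32_t w_t, uint32_t k, int t)
 *      {
 *              wk[t & 15] = w_t + k;   // each slot is reused every 16 rounds
 *      }
 */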

/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
SYM_FUNC_START(\name)

        push %rbx
        push %r12
        push %rbp
        mov %rsp, %rbp

        sub $64, %rsp # allocate workspace
        and $~15, %rsp # align stack

        mov CTX, HASH_PTR
        mov BUF, BUFFER_PTR

        shl $6, CNT # multiply by 64
        add BUF, CNT
        mov CNT, BUFFER_END

        lea K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace
        mov $8, %ecx
        mov %rsp, %rdi
        xor %eax, %eax
        rep stosq

        mov %rbp, %rsp # deallocate workspace
        pop %rbp
        pop %r12
        pop %rbx
        RET

SYM_FUNC_END(\name)
.endm
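
/*
 * For illustration only: a C sketch of what a function emitted by
 * SHA1_VECTOR_ASM computes, matching the prototype
 * void sha1_transform_*(struct sha1_block_state *state, const u8 *data,
 * size_t nblocks). The names sha1_state_sketch and sha1_block_sketch()
 * are invented stand-ins for this sketch.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      struct sha1_state_sketch { uint32_t h[5]; };
 *
 *      // invented stand-in for the 80-round body plus feed-forward
 *      void sha1_block_sketch(uint32_t h[5], const uint8_t block[64]);
 *
 *      void sha1_transform_sketch(struct sha1_state_sketch *state,
 *                                 const uint8_t *data, size_t nblocks)
 *      {
 *              while (nblocks--) {
 *                      sha1_block_sketch(state->h, data);
 *                      data += 64;     // advance to the next 64-byte block
 *              }
 *      }
 */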

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov (HASH_PTR), A
        mov 4(HASH_PTR), B
        mov 8(HASH_PTR), C
        mov 12(HASH_PTR), D
        mov 16(HASH_PTR), E

        .set i, 0
        .rept W_PRECALC_AHEAD
        W_PRECALC i
        .set i, (i+1)
        .endr

        .align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add $64, BUFFER_PTR # move to the next 64-byte block
        cmp BUFFER_END, BUFFER_PTR # if this is the last block, use a
        cmovae K_BASE, BUFFER_PTR # dummy source to avoid overrunning the buffer

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), B
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp K_BASE, BUFFER_PTR # BUFFER_PTR == K_BASE means we reached the end
        jne 1b
.endm
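
/*
 * For illustration only: a scalar C equivalent of the round sequence above,
 * with the round function and K constant chosen per 20-round group exactly
 * as F1/F2/F3/F4 and K_XMM_AR are used here. The function name and the
 * pre-expanded w[80] schedule are assumptions of the sketch.
 *
 *      #include <stdint.h>
 *
 *      static inline uint32_t rol32(uint32_t x, int n)
 *      {
 *              return (x << n) | (x >> (32 - n));
 *      }
 *
 *      static void sha1_rounds_sketch(uint32_t h[5], const uint32_t w[80])
 *      {
 *              uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
 *              int i;
 *
 *              for (i = 0; i < 80; i++) {
 *                      uint32_t f, k;
 *
 *                      if (i < 20)      { f = (b & c) | (~b & d);          k = 0x5a827999; }
 *                      else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; }
 *                      else if (i < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc; }
 *                      else             { f = b ^ c ^ d;                   k = 0xca62c1d6; }
 *
 *                      f += rol32(a, 5) + e + w[i] + k;
 *                      e = d; d = c; c = rol32(b, 30); b = a; a = f;
 *              }
 *
 *              // feed-forward, i.e. what the UPDATE_HASH macro does
 *              h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
 *      }
 */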

.macro INIT_REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set T1, REG_T1
        .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov B, REG_B
        mov D, REG_D
        mov A, REG_A
        mov E, REG_E
.endm

.macro SWAP_REG_NAMES a, b
        .set _T, \a
        .set \a, \b
        .set \b, _T
.endm

.macro F1 b, c, d
        mov \c, T1
        SWAP_REG_NAMES \c, T1
        xor \d, T1
        and \b, T1
        xor \d, T1
.endm

.macro F2 b, c, d
        mov \d, T1
        SWAP_REG_NAMES \d, T1
        xor \c, T1
        xor \b, T1
.endm

.macro F3 b, c, d
        mov \c, T1
        SWAP_REG_NAMES \c, T1
        mov \b, T2
        or \b, T1
        and \c, T2
        and \d, T1
        or T2, T1
.endm

.macro F4 b, c, d
        F2 \b, \c, \d
.endm
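
/*
 * For illustration only: C equivalents of the F1-F4 macros above, using the
 * same algebraic forms the macros compute. The lower-case names are
 * invented for the sketch.
 *
 *      #include <stdint.h>
 *
 *      static uint32_t f1(uint32_t b, uint32_t c, uint32_t d)
 *      {
 *              return d ^ (b & (c ^ d));       // "choose": (b & c) | (~b & d)
 *      }
 *
 *      static uint32_t f2(uint32_t b, uint32_t c, uint32_t d)
 *      {
 *              return b ^ c ^ d;               // "parity"
 *      }
 *
 *      static uint32_t f3(uint32_t b, uint32_t c, uint32_t d)
 *      {
 *              return ((b | c) & d) | (b & c); // "majority"
 *      }
 *
 *      static uint32_t f4(uint32_t b, uint32_t c, uint32_t d)
 *      {
 *              return f2(b, c, d);             // rounds 60-79 reuse parity
 *      }
 */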

.macro UPDATE_HASH hash, val
        add \hash, \val
        mov \val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
        add WK(\round), \e
        \F \b, \c, \d # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol $30, \b
        add T1, \e
        add WK(\round + 1), \d

        \F \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol $5, \a
        add \a, \e
        add T1, \d
        ror $7, \a # ((a <<r 5) >>r 7) => (a <<r 30)

        mov \e, T1
        SWAP_REG_NAMES \e, T1

        rol $5, T1
        add T1, \d

        # write: \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
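
/*
 * For illustration only: the two unrolled rounds performed by RR, written
 * as C with the register renaming spelled out as variable updates. The
 * function name is invented; rol32() and the wk[] window are assumed from
 * the earlier sketches; f is whichever of f1..f4 applies to the round.
 *
 *      static void rr_sketch(uint32_t *a, uint32_t *b, uint32_t *c,
 *                            uint32_t *d, uint32_t *e,
 *                            uint32_t (*f)(uint32_t, uint32_t, uint32_t),
 *                            const uint32_t wk[16], int t)
 *      {
 *              *e += wk[t & 15] + f(*b, *c, *d) + rol32(*a, 5);
 *              *b = rol32(*b, 30);
 *              *d += wk[(t + 1) & 15] + f(*a, *b, *c) + rol32(*e, 5);
 *              *a = rol32(*a, 30);
 *              // the caller then renames (a,b,c,d,e) <- (d,e,a,b,c) instead
 *              // of moving data, which is what the "rotate:" note describes
 *      }
 */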

.macro W_PRECALC r
        .set i, \r

        .if (i < 20)
        .set K_XMM, 0
        .elseif (i < 40)
        .set K_XMM, 16
        .elseif (i < 60)
        .set K_XMM, 32
        .elseif (i < 80)
        .set K_XMM, 48
        .endif

        .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
        .set i, ((\r) % 80) # pre-compute for the next iteration
        .if (i == 0)
        W_PRECALC_RESET
        .endif
        W_PRECALC_00_15
        .elseif (i < 32)
        W_PRECALC_16_31
        .elseif (i < 80) // rounds 32-79
        W_PRECALC_32_79
        .endif
.endm

.macro W_PRECALC_RESET
        .set W, W0
        .set W_minus_04, W4
        .set W_minus_08, W8
        .set W_minus_12, W12
        .set W_minus_16, W16
        .set W_minus_20, W20
        .set W_minus_24, W24
        .set W_minus_28, W28
        .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
        .set W_minus_32, W_minus_28
        .set W_minus_28, W_minus_24
        .set W_minus_24, W_minus_20
        .set W_minus_20, W_minus_16
        .set W_minus_16, W_minus_12
        .set W_minus_12, W_minus_08
        .set W_minus_08, W_minus_04
        .set W_minus_04, W
        .set W, W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
        .if ((i & 3) == 0)
        movdqu (i*4)(BUFFER_PTR), W_TMP1
        .elseif ((i & 3) == 1)
        pshufb XMM_SHUFB_BSWAP, W_TMP1
        movdqa W_TMP1, W
        .elseif ((i & 3) == 2)
        paddd (K_BASE), W_TMP1
        .elseif ((i & 3) == 3)
        movdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
        .endif
.endm
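
/*
 * For illustration only: the per-word computation performed by the four
 * steps above, as C. The function name and the k1 parameter (the round
 * constant for rounds 0-19) are invented for the sketch.
 *
 *      #include <stdint.h>
 *
 *      static void w_precalc_00_15_sketch(uint32_t w[16], uint32_t wk[16],
 *                                         const uint8_t *block, uint32_t k1, int i)
 *      {
 *              // unaligned load + pshufb byte swap == a big-endian 32-bit load
 *              w[i] = ((uint32_t)block[4*i] << 24) | ((uint32_t)block[4*i + 1] << 16) |
 *                     ((uint32_t)block[4*i + 2] << 8) | block[4*i + 3];
 *              wk[i] = w[i] + k1;      // stored for the scalar "add WK(t)" later
 *      }
 */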

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculate the last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store them to memory, for later load by
 *   an ALU add instruction
 *
 * vectorization of rounds 16-31 needs some "heavy lifting" due to the
 * w[i] -> w[i-3] dependency, but it improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
        # blended scheduling of vector and scalar instruction streams, one 4-wide
        # vector iteration / 4 scalar rounds
        .if ((i & 3) == 0)
        movdqa W_minus_12, W
        palignr $8, W_minus_16, W # w[i-14]
        movdqa W_minus_04, W_TMP1
        psrldq $4, W_TMP1 # w[i-3]
        pxor W_minus_08, W
        .elseif ((i & 3) == 1)
        pxor W_minus_16, W_TMP1
        pxor W_TMP1, W
        movdqa W, W_TMP2
        movdqa W, W_TMP1
        pslldq $12, W_TMP2
        .elseif ((i & 3) == 2)
        psrld $31, W
        pslld $1, W_TMP1
        por W, W_TMP1
        movdqa W_TMP2, W
        psrld $30, W_TMP2
        pslld $2, W
        .elseif ((i & 3) == 3)
        pxor W, W_TMP1
        pxor W_TMP2, W_TMP1
        movdqa W_TMP1, W
        paddd K_XMM(K_BASE), W_TMP1
        movdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
        .endif
.endm
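
/*
 * For illustration only: the values the vector code above produces are the
 * standard FIPS 180 recurrence; the w[i] -> w[i-3] dependency is what forces
 * the extra shift/xor juggling when four w values are built per iteration.
 * The function name is invented; rol32() is as in the earlier round sketch.
 *
 *      #include <stdint.h>
 *
 *      static void w_precalc_16_31_sketch(uint32_t w[80], uint32_t wk[16],
 *                                         const uint32_t K[4], int i)
 *      {
 *              // for i = 16..31
 *              w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *              wk[i & 15] = w[i] + K[i / 20];
 *      }
 */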

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i] => w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
        .if ((i & 3) == 0)
        movdqa W_minus_04, W_TMP1
        pxor W_minus_28, W # W is W_minus_32 before xor
        palignr $8, W_minus_08, W_TMP1
        .elseif ((i & 3) == 1)
        pxor W_minus_16, W
        pxor W_TMP1, W
        movdqa W, W_TMP1
        .elseif ((i & 3) == 2)
        psrld $30, W
        pslld $2, W_TMP1
        por W, W_TMP1
        .elseif ((i & 3) == 3)
        movdqa W_TMP1, W
        paddd K_XMM(K_BASE), W_TMP1
        movdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
        .endif
.endm
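
/*
 * For illustration only: the equivalent recurrence used above for i >= 32,
 * as stated in the comment and derived in the Intel article referenced in
 * the file header. Because w[i] now depends only on values at least six
 * positions back, four consecutive w values fit in one XMM register. The
 * function name is invented; rol32() is as in the earlier round sketch.
 *
 *      #include <stdint.h>
 *
 *      static void w_precalc_32_79_sketch(uint32_t w[80], uint32_t wk[16],
 *                                         const uint32_t K[4], int i)
 *      {
 *              // for i = 32..79
 *              w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *              wk[i & 15] = w[i] + K[i / 20];
 *      }
 */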

.endm // W_PRECALC_SSSE3


#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
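
/*
 * For illustration only: the pshufb/vpshufb control above reverses the
 * bytes within each 32-bit lane, i.e. it performs four big-endian loads at
 * once. Scalar C equivalent for one word (the function name is invented):
 *
 *      #include <stdint.h>
 *
 *      static uint32_t bswap32_sketch(uint32_t x)
 *      {
 *              return (x >> 24) | ((x >> 8) & 0x0000ff00) |
 *                     ((x << 8) & 0x00ff0000) | (x << 24);
 *      }
 */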


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu \a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * void sha1_transform_ssse3(struct sha1_block_state *state,
 *                           const u8 *data, size_t nblocks);
 */
SHA1_VECTOR_ASM sha1_transform_ssse3
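
/*
 * For illustration only: a sketch of how a caller might use the function
 * above; the glue does not live in this file. kernel_fpu_begin()/
 * kernel_fpu_end() and boot_cpu_has() are the usual kernel interfaces for
 * using XMM registers and testing CPU features.
 *
 *      // if (boot_cpu_has(X86_FEATURE_SSSE3)) {
 *      //         kernel_fpu_begin();
 *      //         sha1_transform_ssse3(state, data, nblocks);
 *      //         kernel_fpu_end();
 *      // } else {
 *      //         ... fall back to a generic C implementation ...
 *      // }
 */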

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
        W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
        W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79
        W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
        .if ((i & 3) == 0)
        vmovdqu (i*4)(BUFFER_PTR), W_TMP1
        .elseif ((i & 3) == 1)
        vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
        .elseif ((i & 3) == 2)
        vpaddd (K_BASE), W, W_TMP1
        .elseif ((i & 3) == 3)
        vmovdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
        .endif
.endm

.macro W_PRECALC_16_31_AVX
        .if ((i & 3) == 0)
        vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
        vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
        vpxor W_minus_08, W, W
        vpxor W_minus_16, W_TMP1, W_TMP1
        .elseif ((i & 3) == 1)
        vpxor W_TMP1, W, W
        vpslldq $12, W, W_TMP2
        vpslld $1, W, W_TMP1
        .elseif ((i & 3) == 2)
        vpsrld $31, W, W
        vpor W, W_TMP1, W_TMP1
        vpslld $2, W_TMP2, W
        vpsrld $30, W_TMP2, W_TMP2
        .elseif ((i & 3) == 3)
        vpxor W, W_TMP1, W_TMP1
        vpxor W_TMP2, W_TMP1, W
        vpaddd K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
        .endif
.endm

.macro W_PRECALC_32_79_AVX
        .if ((i & 3) == 0)
        vpalignr $8, W_minus_08, W_minus_04, W_TMP1
        vpxor W_minus_28, W, W # W is W_minus_32 before xor
        .elseif ((i & 3) == 1)
        vpxor W_minus_16, W_TMP1, W_TMP1
        vpxor W_TMP1, W, W
        .elseif ((i & 3) == 2)
        vpslld $2, W, W_TMP1
        vpsrld $30, W, W
        vpor W, W_TMP1, W
        .elseif ((i & 3) == 3)
        vpaddd K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
        .endif
.endm

.endm // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 * void sha1_transform_avx(struct sha1_block_state *state,
 *                         const u8 *data, size_t nblocks);
 */
SHA1_VECTOR_ASM sha1_transform_avx
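
/*
 * For illustration only: the AVX build differs from the SSSE3 one mainly in
 * using VEX-encoded three-operand instructions, which removes most of the
 * register-copy movdqa instructions from the W pre-calculation. A caller
 * would typically prefer it when the AVX feature bit is set, e.g.:
 *
 *      // if (boot_cpu_has(X86_FEATURE_AVX))
 *      //         sha1_transform_avx(state, data, nblocks);
 *      // else
 *      //         sha1_transform_ssse3(state, data, nblocks);
 *      // (either way inside kernel_fpu_begin()/kernel_fpu_end())
 */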