########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
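
# Note: the "4 lanes" above are the four 32-bit dwords of an XMM register;
# each FOUR_ROUNDS_AND_SCHED pass below computes four message-schedule
# words of W[] at once while four scalar rounds run interleaved with it.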

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
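
# For reference, "addm (4*0)(CTX), a" (as used in the epilogue below)
# expands roughly to:
#	add	(4*0)(CTX), %eax	# a += mem
#	mov	%eax, (4*0)(CTX)	# mem = a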

.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm
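
# MY_ROR n, reg rotates reg right by n bits: "shld $(32-n), r, r" shifts r
# left by (32-n) while shifting in r's own high bits, i.e. rotl(r, 32-n),
# which for 32-bit values equals rotr(r, n).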

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1
	vpshufb	\p3, \p1, \p1
.endm
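
# For reference, "COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK" (as used
# below) expands roughly to an unaligned 16-byte load followed by a per-dword
# byte swap (SHA-256 message words are big-endian):
#	vmovdqu	0*16(INP), X0
#	vpshufb	BYTE_FLIP_MASK, X0, X0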

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER  = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg

SRND = %rsi			# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END  = 0
_INP      = _INP_END  + _INP_END_SIZE
_XFER     = _INP      + _INP_SIZE
_XMM_SAVE = _XFER     + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
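
# Resulting stack frame layout (relative to the aligned %rsp):
#	[%rsp + _INP_END]  8 bytes:  pointer to the end of the input data
#	[%rsp + _INP]      8 bytes:  saved input pointer for the current block
#	[%rsp + _XFER]    16 bytes:  current K[t] + W[t] values for 4 rounds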

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
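
# Both macros above rename assembler symbols rather than move data: after
# ROTATE_ARGS, the symbol "h" refers to the register that previously held
# "g", and so on, so each unrolled round can be written with the same a..h
# names while the values stay in place.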
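
# FOUR_ROUNDS_AND_SCHED performs four SHA-256 rounds while computing the
# next four message-schedule words.  For reference (FIPS 180-4):
#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
#	sigma0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
#	sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)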
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	#
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	#
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	#
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
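
# Each scalar round above and in DO_ROUND below implements (FIPS 180-4):
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
#	T2 = Sigma0(a) + Maj(a,b,c)
# with Ch(e,f,g) computed as ((f^g)&e)^g and Maj(a,b,c) as ((a|c)&b)|(a&c),
# both equivalent to the textbook forms.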
## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_block_state *state,
##			     const u8 *data, size_t nblocks);
########################################################################
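
# Note: callers are assumed to have set up a SIMD context before invoking
# this routine (e.g. kernel_fpu_begin()/kernel_fpu_end() in the C glue
# code), since AVX registers are used; that is not enforced here.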
.text
SYM_FUNC_START(sha256_transform_avx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

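	## Rounds 48-63: the final 16 rounds reuse the schedule words already
	## sitting in X0..X3, so no further message scheduling is needed.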
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

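# SHA-256 round constants K[0..63] (FIPS 180-4): the first 32 bits of the
# fractional parts of the cube roots of the first 64 prime numbers.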
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF