GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/sha256-avx2-asm.S
########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
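########################################################################
# Reference note (added for readability; not part of the original Intel
# code): the per-instruction comments below abbreviate the standard
# SHA-256 round functions.  For a 32-bit word x, with ror() a right
# rotate and >> a logical shift:
#
#   S0(a)      = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
#   S1(e)      = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
#   CH(e,f,g)  = (e & f) ^ (~e & g)        # computed here as ((f^g)&e)^g
#   MAJ(a,b,c) = (a&b) ^ (a&c) ^ (b&c)     # computed here as ((a|c)&b)|(a&c)
#
# and the message schedule expands W[16..63] with
#
#   s0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
#   s1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
#   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
########################################################################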

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
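# For example, "addm (4*0)(CTX),a" near the end of the function adds the
# first state word into register a and writes the sum back to memory
# (illustrative reading of the macro; added note, no new code).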

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10              # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12              # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13       # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx                 # 3rd arg
INP      = %rsi                 # 2nd arg
CTX      = %rdi                 # 1st arg
c        = %ecx
d        = %r8d
e        = %edx                 # clobbers NUM_BLKS
y3       = %esi                 # clobbers INP

SRND     = CTX                  # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
STACK_SIZE      = _CTX      + _CTX_SIZE
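# Resulting frame layout relative to the 32-byte-aligned %rsp (added note,
# for reference only):
#   _XFER    = 0 .. 511   pre-added K+W values (2 blocks x 64 rounds x 4 bytes)
#   _INP_END = 512        pointer to the last input block
#   _INP     = 520        current input pointer
#   _CTX     = 528        saved state pointer
#   STACK_SIZE = 536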

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm
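# Added note: rotate_Xs and ROTATE_ARGS only re-bind assembler symbols;
# they emit no instructions.  Renaming the registers this way gives the
# per-round rotation of the working variables a..h (and of the schedule
# registers X0..X3) without any register-to-register moves at run time.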

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3                   # y3 = a                        # MAJA
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B

        addl    \disp(%rsp, SRND), h    # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA
        vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
        mov     f, y2                   # y2 = f                        # CH
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        xor     g, y2                   # y2 = f^g                      # CH
        vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1

        and     e, y2                   # y2 = (f^g)&e                  # CH
        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        add     h, d                    # d = k + w + h + d             # --

        and     b, y3                   # y3 = (a|c)&b                  # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0

        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     c, T1                   # T1 = a&c                      # MAJB

        add     y0, y2                  # y2 = S1 + CH                  # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add     y3, h                   # h = t1 + S0 + MAJ             # --


        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3                   # y3 = a                        # MAJA
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA


        vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
        mov     f, y2                   # y2 = f                        # CH
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        xor     g, y2                   # y2 = f^g                      # CH


        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        and     e, y2                   # y2 = (f^g)&e                  # CH
        add     h, d                    # d = k + w + h + d             # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3                   # y3 = (a|c)&b                  # MAJA
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     c, T1                   # T1 = a&c                      # MAJB
        add     y0, y2                  # y2 = S1 + CH                  # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add     y3, h                   # h = t1 + S0 + MAJ             # --

        vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}


        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3                   # y3 = a                        # MAJA
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --

        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        or      c, y3                   # y3 = a|c                      # MAJA
        mov     f, y2                   # y2 = f                        # CH
        xor     g, y2                   # y2 = f^g                      # CH

        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2                   # y2 = (f^g)&e                  # CH

        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d                    # d = k + w + h + d             # --
        and     b, y3                   # y3 = (a|c)&b                  # MAJA

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     c, T1                   # T1 = a&c                      # MAJB
        add     y0, y2                  # y2 = S1 + CH                  # --
        vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}

        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --
        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

        add     y3, h                   # h = t1 + S0 + MAJ             # --


        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3                   # y3 = a                        # MAJA
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA


        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2                   # y2 = f                        # CH
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        xor     g, y2                   # y2 = f^g                      # CH


        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        and     e, y2                   # y2 = (f^g)&e                  # CH
        add     h, d                    # d = k + w + h + d             # --
        and     b, y3                   # y3 = (a|c)&b                  # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        add     y0, y2                  # y2 = S1 + CH                  # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --

        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     c, T1                   # T1 = a&c                      # MAJB
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ

        add     y1, h                   # h = k + w + h + S0            # --
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add     y3, h                   # h = t1 + S0 + MAJ             # --

        ROTATE_ARGS
        rotate_Xs
.endm

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2                   # y2 = f                        # CH
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        xor     g, y2                   # y2 = f^g                      # CH

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        and     e, y2                   # y2 = (f^g)&e                  # CH

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        mov     a, y3                   # y3 = a                        # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        addl    \disp(%rsp, SRND), h    # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     b, y3                   # y3 = (a|c)&b                  # MAJA
        and     c, T1                   # T1 = a&c                      # MAJB
        add     y0, y2                  # y2 = S1 + CH                  # --


        add     h, d                    # d = k + w + h + d             # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --
        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h               # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov     f, y2                   # y2 = f                        # CH
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        xor     g, y2                   # y2 = f^g                      # CH

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        and     e, y2                   # y2 = (f^g)&e                  # CH
        add     y3, old_h               # h = t1 + S0 + MAJ             # --

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        mov     a, y3                   # y3 = a                        # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     b, y3                   # y3 = (a|c)&b                  # MAJA
        and     c, T1                   # T1 = a&c                      # MAJB
        add     y0, y2                  # y2 = S1 + CH                  # --


        add     h, d                    # d = k + w + h + d             # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h               # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov     f, y2                   # y2 = f                        # CH
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        xor     g, y2                   # y2 = f^g                      # CH

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        and     e, y2                   # y2 = (f^g)&e                  # CH
        add     y3, old_h               # h = t1 + S0 + MAJ             # --

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        mov     a, y3                   # y3 = a                        # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     b, y3                   # y3 = (a|c)&b                  # MAJA
        and     c, T1                   # T1 = a&c                      # MAJB
        add     y0, y2                  # y2 = S1 + CH                  # --


        add     h, d                    # d = k + w + h + d             # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h               # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov     f, y2                   # y2 = f                        # CH
        rorx    $25, e, y0              # y0 = e >> 25                  # S1A
        rorx    $11, e, y1              # y1 = e >> 11                  # S1B
        xor     g, y2                   # y2 = f^g                      # CH

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11)        # S1
        rorx    $6, e, y1               # y1 = (e >> 6)                 # S1
        and     e, y2                   # y2 = (f^g)&e                  # CH
        add     y3, old_h               # h = t1 + S0 + MAJ             # --

        xor     y1, y0                  # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
        rorx    $13, a, T1              # T1 = a >> 13                  # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g         # CH
        rorx    $22, a, y1              # y1 = a >> 22                  # S0A
        mov     a, y3                   # y3 = a                        # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13)        # S0
        rorx    $2, a, T1               # T1 = (a >> 2)                 # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3                   # y3 = a|c                      # MAJA

        xor     T1, y1                  # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
        mov     a, T1                   # T1 = a                        # MAJB
        and     b, y3                   # y3 = (a|c)&b                  # MAJA
        and     c, T1                   # T1 = a&c                      # MAJB
        add     y0, y2                  # y2 = S1 + CH                  # --


        add     h, d                    # d = k + w + h + d             # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)    # MAJ
        add     y1, h                   # h = k + w + h + S0            # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1 # --


        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

        add     y3, h                   # h = t1 + S0 + MAJ             # --

        ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_block_state *state,
##                            const u8 *data, size_t nblocks);
########################################################################
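# Added note: as the code below loads them, the arguments arrive in the
# x86-64 SysV registers (state in %rdi = CTX, data in %rsi = INP, nblocks
# in %rdx = NUM_BLKS); the state is read and written as eight consecutive
# 32-bit words, and each input block is 64 bytes of big-endian data.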
.text
SYM_FUNC_START(sha256_transform_rorx)
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        push    %rbp
        mov     %rsp, %rbp

        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp              # align rsp to 32 byte boundary

        shl     $6, NUM_BLKS            # convert to bytes
        lea     -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      .Lonly_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

.Lloop0:
        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP),XTMP0
        VMOVDQ  1*32(INP),XTMP1
        VMOVDQ  2*32(INP),XTMP2
        VMOVDQ  3*32(INP),XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128      $0x20, XTMP2, XTMP0, X0
        vperm2i128      $0x31, XTMP2, XTMP0, X1
        vperm2i128      $0x20, XTMP3, XTMP1, X2
        vperm2i128      $0x31, XTMP3, XTMP1, X3
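        ## Added note: after the transpose, the low 128 bits of X0..X3 hold
        ## W[0..15] of block 1 and the high 128 bits hold W[0..15] of block 2,
        ## so each pass over the schedule below covers both blocks at once.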

.Llast_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, in 3 loop iterations of 16 rounds each
        xor     SRND, SRND

.align 16
.Lloop1:
        leaq    K256+0*32(%rip), INP            ## reuse INP as scratch reg
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 0*32)

        leaq    K256+1*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 1*32)

        leaq    K256+2*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 2*32)

        leaq    K256+3*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 3*32)

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      .Lloop1

.Lloop2:
        ## Do last 16 rounds with no scheduling
        leaq    K256+0*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      (_XFER + 0*32)

        leaq    K256+1*32(%rip), INP
        vpaddd  (INP, SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      (_XFER + 1*32)
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      .Lloop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        ja      .Ldone_hash

        #### Do second block using previously scheduled results
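        ## Added note: each 32-byte XFER slot written above holds block 1's
        ## K+W values in its low 16 bytes and block 2's in its high 16 bytes,
        ## so the "+ 16" displacement below replays the rounds for block 2
        ## without recomputing the message schedule.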
        xor     SRND, SRND
.align 16
.Lloop3:
        DO_4ROUNDS      (_XFER + 0*32 + 16)
        DO_4ROUNDS      (_XFER + 1*32 + 16)
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      .Lloop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        jb      .Lloop0
        ja      .Ldone_hash

.Ldo_last_block:
        VMOVDQ  0*16(INP),XWORD0
        VMOVDQ  1*16(INP),XWORD1
        VMOVDQ  2*16(INP),XWORD2
        VMOVDQ  3*16(INP),XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     .Llast_block_enter

.Lonly_one_block:

        ## load initial digest
        mov     (4*0)(CTX),a
        mov     (4*1)(CTX),b
        mov     (4*2)(CTX),c
        mov     (4*3)(CTX),d
        mov     (4*4)(CTX),e
        mov     (4*5)(CTX),f
        mov     (4*6)(CTX),g
        mov     (4*7)(CTX),h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     .Ldo_last_block

.Ldone_hash:

        mov     %rbp, %rsp
        pop     %rbp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        vzeroupper
        RET
SYM_FUNC_END(sha256_transform_rorx)

.section        .rodata.cst512.K256, "aM", @progbits, 512
.align 64
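# Added note: each group of four round constants is stored twice per
# 32-byte row so that a single vpaddd adds the same K values to both
# 128-bit lanes (block 1 and block 2).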
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section        .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section        .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF