GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/sha256-ssse3-asm.S
########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
# James Guilford <[email protected]>
# Kirk Yap <[email protected]>
# Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define MOVDQ movdqu
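# (MOVDQ is movdqu so that all data loads work on unaligned input;
# movdqa would fault on a buffer that is not 16-byte aligned.)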

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add \p1, \p2
	mov \p2, \p1
.endm
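# (e.g. "addm (4*0)(CTX), a" adds the dword at CTX into a and writes the
# sum back to memory; used at the end of the transform to fold the
# working variables back into the digest state.)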

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ \p2, \p1
	pshufb \p3, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
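# Resulting frame layout (relative to the aligned %rsp):
#   [%rsp +  0 ..  7]  _INP_END - pointer to the end of the input data
#   [%rsp +  8 .. 15]  _INP     - saved pointer to the current block
#   [%rsp + 16 .. 31]  _XFER    - 16-byte scratch holding K[t] + W[t]
# (_XMM_SAVE is zero-sized and unused here, so STACK_SIZE is 32 bytes.)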
122
123
# rotate_Xs
124
# Rotate values of symbols X0...X3
125
.macro rotate_Xs
126
X_ = X0
127
X0 = X1
128
X1 = X2
129
X2 = X3
130
X3 = X_
131
.endm
132
133
# ROTATE_ARGS
134
# Rotate values of symbols a...h
135
.macro ROTATE_ARGS
136
TMP_ = h
137
h = g
138
g = f
139
f = e
140
e = d
141
d = c
142
c = b
143
b = a
144
a = TMP_
145
.endm
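# (Renaming the symbols instead of moving data performs the per-round
# rotation of the working variables, h <- g, g <- f, ..., b <- a, at
# assembly time, so every round body is written in terms of a..h with
# no extra register moves at run time.)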

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
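	## message schedule computed here (FIPS 180-4):
	##   s0   = (W[-15] ror 7) ^ (W[-15] ror 18) ^ (W[-15] >> 3)
	##   s1   = (W[-2] ror 17) ^ (W[-2] ror 19) ^ (W[-2] >> 10)
	##   W[0] = W[-16] + s0 + W[-7] + s1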
	movdqa X3, XTMP0
	mov e, y0 # y0 = e
	ror $(25-11), y0 # y0 = e >> (25-11)
	mov a, y1 # y1 = a
	palignr $4, X2, XTMP0 # XTMP0 = W[-7]
	ror $(22-13), y1 # y1 = a >> (22-13)
	xor e, y0 # y0 = e ^ (e >> (25-11))
	mov f, y2 # y2 = f
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa X1, XTMP1
	xor a, y1 # y1 = a ^ (a >> (22-13))
	xor g, y2 # y2 = f^g
	paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr $4, X0, XTMP1 # XTMP1 = W[-15]
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2 # y2 = S1 + CH
	add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
	movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pslld $(32-7), XTMP1 #
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	psrld $7, XTMP2 #
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
	#
	ROTATE_ARGS #
	movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
	mov e, y0 # y0 = e
	mov a, y1 # y1 = a
	movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
	ror $(25-11), y0 # y0 = e >> (25-11)
	xor e, y0 # y0 = e ^ (e >> (25-11))
	mov f, y2 # y2 = f
	ror $(22-13), y1 # y1 = a >> (22-13)
	pslld $(32-18), XTMP3 #
	xor a, y1 # y1 = a ^ (a >> (22-13))
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2 # y2 = f^g
	psrld $18, XTMP2 #
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor XTMP3, XTMP1
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
	add y0, y2 # y2 = S1 + CH
	add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pxor XTMP4, XTMP1 # XTMP1 = s0
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	## compute low s1
	pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
	mov e, y0 # y0 = e
	mov a, y1 # y1 = a
	ror $(25-11), y0 # y0 = e >> (25-11)
	movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
	xor e, y0 # y0 = e ^ (e >> (25-11))
	ror $(22-13), y1 # y1 = a >> (22-13)
	mov f, y2 # y2 = f
	xor a, y1 # y1 = a ^ (a >> (22-13))
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
	xor g, y2 # y2 = f^g
	psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor XTMP3, XTMP2
	add y0, y2 # y2 = S1 + CH
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
	#
	ROTATE_ARGS #
	movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
	mov e, y0 # y0 = e
	ror $(25-11), y0 # y0 = e >> (25-11)
	mov a, y1 # y1 = a
	movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
	ror $(22-13), y1 # y1 = a >> (22-13)
	xor e, y0 # y0 = e ^ (e >> (25-11))
	mov f, y2 # y2 = f
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
	xor a, y1 # y1 = a ^ (a >> (22-13))
	xor g, y2 # y2 = f^g
	psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	pxor XTMP3, XTMP2 #
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2 # y2 = S1 + CH
	add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor XTMP2, X0 # X0 = s1 {xDxC}
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
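## per-round computation (Maj written in the or/and form used below):
##   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
##   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
##   CH  = ((f ^ g) & e) ^ g
##   MAJ = ((a | c) & b) | (a & c)
##   T1  = h + S1 + CH + K[t] + W[t] ;  d += T1 ;  h = T1 + S0 + MAJ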
.macro DO_ROUND round
	mov e, y0 # y0 = e
	ror $(25-11), y0 # y0 = e >> (25-11)
	mov a, y1 # y1 = a
	xor e, y0 # y0 = e ^ (e >> (25-11))
	ror $(22-13), y1 # y1 = a >> (22-13)
	mov f, y2 # y2 = f
	xor a, y1 # y1 = a ^ (a >> (22-13))
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2 # y2 = f^g
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and e, y2 # y2 = (f^g)&e
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	add y0, y2 # y2 = S1 + CH
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
offset = \round * 4 + _XFER
	add offset(%rsp), y2 # y2 = k + w + S1 + CH
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_block_state *state,
##                             const u8 *data, size_t nblocks);
########################################################################
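# Per the x86-64 SysV calling convention:
#   %rdi (CTX)      - pointer to the eight 32-bit state words
#   %rsi (INP)      - pointer to the input data
#   %rdx (NUM_BLKS) - number of 64-byte blocks to process
# The state is updated in place; the data buffer is only read.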
.text
SYM_FUNC_START(sha256_transform_ssse3)
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushq %rbp
	mov %rsp, %rbp

	subq $STACK_SIZE, %rsp
	and $~15, %rsp

	shl $6, NUM_BLKS # convert to bytes
	add INP, NUM_BLKS
	mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov 4*0(CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa _SHUF_00BA(%rip), SHUF_00BA
	movdqa _SHUF_DC00(%rip), SHUF_DC00

.Lloop0:
	lea K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
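	## (each .Lloop1 iteration runs FOUR_ROUNDS_AND_SCHED four times,
	## i.e. 16 rounds, so three iterations cover rounds 0-47 while
	## computing the message schedule words W[16..63])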
	mov $3, SRND
.align 16
.Lloop1:
	movdqa (TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa 1*16(TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa 2*16(TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa 3*16(TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	add $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub $1, SRND
	jne .Lloop1

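	## final 16 rounds: W[48..63] already sit in X0..X3, so just add
	## the round constants and run DO_ROUND; each .Lloop2 pass covers
	## 8 rounds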
	mov $2, SRND
.Lloop2:
	paddd (TBL), X0
	movdqa X0, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3
	paddd 1*16(TBL), X1
	movdqa X1, _XFER(%rsp)
	add $2*16, TBL
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	movdqa X2, X0
	movdqa X3, X1

	sub $1, SRND
	jne .Lloop2

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	mov _INP(%rsp), INP
	add $64, INP
	cmp _INP_END(%rsp), INP
	jne .Lloop0

	mov %rbp, %rsp
	popq %rbp
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
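# the 64 SHA-256 round constants of FIPS 180-4 (first 32 bits of the
# fractional parts of the cube roots of the first 64 primes)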
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
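# pshufb mask that reverses the byte order within each dword (converts
# the big-endian message words to the CPU's little-endian order)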
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF