Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/blake2s-core.S
26292 views
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
6
7
#include <linux/linkage.h>
8
9
/*
 * Read-only constants shared by the two compression routines in this file.
 * Each table lives in its own mergeable ("aM") section whose entry size
 * equals the size of the table.
 */

/*
 * IV: eight 32-bit words stored as two 16-byte halves.  .octa is emitted
 * little-endian, so the 32-bit word at the lowest address is 0x6A09E667.
 * The first half is loaded as the third state row; the second half is XORed
 * with the 16 bytes at ctx+0x20 to form the fourth state row.
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/*
 * ROT16: pshufb control mask.  Within every 32-bit lane, destination bytes
 * 0,1,2,3 take source bytes 2,3,0,1, i.e. each lane is rotated by 16 bits.
 */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/*
 * ROR328: pshufb control mask.  Within every 32-bit lane, destination bytes
 * 0,1,2,3 take source bytes 1,2,3,0, i.e. each lane is rotated right by
 * 8 bits.
 */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * SIGMA: message-word schedule for blake2s_compress_ssse3.  One 16-byte row
 * per round, 10 rows (160 = 0xa0 bytes).  Every byte is the index of a
 * 32-bit word inside the current 64-byte message block, listed in the order
 * the round loop consumes them (four indices per quarter of a round).
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
	.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
	.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
	.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
	.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
	.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
	.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
	.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
	.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
	.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
	.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12

/*
 * SIGMA2: message-word schedule for blake2s_compress_avx512.  One row of
 * sixteen 32-bit indices (64 bytes) per round, 10 rows (640 bytes).  Each row
 * is used as the index operand of two vpermi2d instructions.  That routine
 * keeps the permuted message in registers from round to round, so every row
 * after the first indexes the message as arranged by the previous round, not
 * the original block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
	.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
	.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
	.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
	.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
	.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
	.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
	.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
	.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
	.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
	.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
45
46
.text
47
/*
 * blake2s_compress_ssse3 - BLAKE2s compression of whole blocks, SSSE3 version.
 *
 * ABI: System V AMD64, leaf function, no stack usage.
 *
 * In:
 *   %rdi  ctx     state pointer.  Bytes 0x00-0x1f hold the eight 32-bit
 *                 chaining words; bytes 0x20-0x2f hold 16 bytes whose low
 *                 64 bits are advanced by inc for every block (presumably
 *                 the t[] counter followed by the f[] flags - confirm
 *                 against the C struct).
 *   %rsi  data    message, consumed 0x40 bytes per block
 *   %rdx  nblocks number of 64-byte blocks; 0 returns without touching ctx
 *   %ecx  inc     amount added to the counter per block
 *
 * inc is read from %ecx only.  A 32-bit integer argument leaves bits 63:32
 * of %rcx unspecified under the SysV ABI, so reading the full %rcx could add
 * garbage to the upper half of the counter.
 * NOTE(review): the C prototype is not visible in this file; this assumes
 * inc is declared with a 32-bit type - confirm against the header.
 *
 * Register roles inside the loops:
 *   %xmm0, %xmm1    state rows a, b (start as the chaining value)
 *   %xmm2, %xmm3    state rows c, d
 *   %xmm4-%xmm8     scratch (gathered message words, rotate temporaries)
 *   %xmm10, %xmm11  chaining value saved at the top of each block
 *   %xmm12, %xmm13  ROT16 / ROR328 pshufb masks
 *   %xmm14          the 16 bytes from ctx+0x20 (counter in the low qword)
 *   %xmm15          inc, zero-extended to 128 bits
 *   %rcx            pointer to the current SIGMA row (after inc is copied)
 *   %r8             end of the SIGMA table
 *   %eax            scratch message-word index
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15, flags.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop			// nothing to do for 0 blocks
	movdqu		(%rdi),%xmm0			// chaining words 0..3
	movdqu		0x10(%rdi),%xmm1		// chaining words 4..7
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14		// counter (+ following 8 bytes)
	movd		%ecx,%xmm15			// inc: low 32 bits only, rest zeroed
	leaq		SIGMA+0xa0(%rip),%r8		// one past the 10th SIGMA row
	jmp		.Lbeginofloop			// hop over the alignment padding
	.align		32
.Lbeginofloop:
	// Per-block setup: save the chaining value, bump the counter and
	// build rows c and d.
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14			// counter += inc (high qword += 0)
	movdqa		IV(%rip),%xmm2			// c = IV[0..3]
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3		// d = ctx[0x20..0x2f] ^ IV[4..7]
	leaq		SIGMA(%rip),%rcx		// %rcx no longer holds inc
.Lroundloop:
	// --- column step, first half: message words SIGMA[0..3] ---
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4			// xmm4 = four gathered words
	paddd		%xmm4,%xmm0			// a += m
	paddd		%xmm1,%xmm0			// a += b
	pxor		%xmm0,%xmm3			// d ^= a
	pshufb		%xmm12,%xmm3			// d = ror32(d, 16)
	paddd		%xmm3,%xmm2			// c += d
	pxor		%xmm2,%xmm1			// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1			// b = ror32(b, 12)

	// --- column step, second half: message words SIGMA[4..7] ---
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0			// a += m
	paddd		%xmm1,%xmm0			// a += b
	pxor		%xmm0,%xmm3			// d ^= a
	pshufb		%xmm13,%xmm3			// d = ror32(d, 8)
	paddd		%xmm3,%xmm2			// c += d
	pxor		%xmm2,%xmm1			// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1			// b = ror32(b, 7)

	// Rotate the lanes of rows a, d and c (row b stays put) so that the
	// next two halves operate on the diagonals.
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2

	// --- diagonal step, first half: message words SIGMA[8..11] ---
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0			// a += m
	paddd		%xmm1,%xmm0			// a += b
	pxor		%xmm0,%xmm3			// d ^= a
	pshufb		%xmm12,%xmm3			// d = ror32(d, 16)
	paddd		%xmm3,%xmm2			// c += d
	pxor		%xmm2,%xmm1			// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1			// b = ror32(b, 12)

	// --- diagonal step, second half: message words SIGMA[12..15] ---
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0			// a += m
	paddd		%xmm1,%xmm0			// a += b
	pxor		%xmm0,%xmm3			// d ^= a
	pshufb		%xmm13,%xmm3			// d = ror32(d, 8)
	paddd		%xmm3,%xmm2			// c += d
	pxor		%xmm2,%xmm1			// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1			// b = ror32(b, 7)

	// Undo the lane rotation applied before the diagonal step.
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2

	addq		$0x10,%rcx			// next SIGMA row
	cmpq		%r8,%rcx
	jnz		.Lroundloop			// 10 rows => 10 rounds

	// Feed-forward: new chaining value = old ^ (a, b) ^ (c, d).
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi			// advance to the next message block
	decq		%rdx
	jnz		.Lbeginofloop

	// Write the state back once, after the last block.
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
174
175
/*
 * blake2s_compress_avx512 - BLAKE2s compression of whole blocks using the
 * EVEX-encoded vprord and vpermi2d instructions on xmm/ymm registers.
 *
 * ABI: System V AMD64, leaf function, no stack usage.
 *
 * In:
 *   %rdi  ctx     state pointer.  Bytes 0x00-0x1f hold the eight 32-bit
 *                 chaining words; bytes 0x20-0x2f hold 16 bytes whose low
 *                 64 bits are advanced by inc for every block (presumably
 *                 the t[] counter followed by the f[] flags - confirm
 *                 against the C struct).
 *   %rsi  data    message, consumed 0x40 bytes per block
 *   %rdx  nblocks number of 64-byte blocks; 0 returns without touching ctx
 *   %ecx  inc     amount added to the counter per block
 *
 * inc is read from %ecx only.  A 32-bit integer argument leaves bits 63:32
 * of %rcx unspecified under the SysV ABI, so reading the full %rcx could add
 * garbage to the upper half of the counter.
 * NOTE(review): the C prototype is not visible in this file; this assumes
 * inc is declared with a 32-bit type - confirm against the header.
 *
 * The nblocks == 0 early exit matches blake2s_compress_ssse3.  Without it
 * the decq/jne at the bottom of the block loop would wrap around and keep
 * reading past the end of the message.
 *
 * Register roles inside the loops:
 *   %xmm0, %xmm1    state rows a, b (start as the chaining value)
 *   %xmm2, %xmm3    state rows c, d
 *   %xmm4           the 16 bytes from ctx+0x20 (counter in the low qword)
 *   %xmm5           inc, zero-extended to 128 bits
 *   %ymm6, %ymm7    the 16 message words, re-permuted in place every round
 *   %ymm8, %ymm9    SIGMA2 indices, overwritten with the selected words
 *   %xmm10, %xmm11  chaining value saved at the top of each block
 *   %xmm14, %xmm15  IV[0..3] and IV[4..7]
 *   %rax            pointer into SIGMA2
 *   %cl             round counter (inc has already been copied out of %ecx)
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %xmm0-%xmm11, %xmm14, %xmm15, flags.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_done	// nothing to do for 0 blocks
	vmovdqu		(%rdi),%xmm0			// chaining words 0..3
	vmovdqu		0x10(%rdi),%xmm1		// chaining words 4..7
	vmovdqu		0x20(%rdi),%xmm4		// counter (+ following 8 bytes)
	vmovd		%ecx,%xmm5			// inc: low 32 bits only, rest zeroed
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop	// hop over the padding
	.align		32
.Lblake2s_compress_avx512_mainloop:
	// Per-block setup: save the chaining value, bump the counter, build
	// rows c and d, and load the whole 64-byte message block.
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4		// counter += inc (high qword += 0)
	vmovdqa		%xmm14,%xmm2			// c = IV[0..3]
	vpxor		%xmm15,%xmm4,%xmm3		// d = ctx[0x20..0x2f] ^ IV[4..7]
	vmovdqu		(%rsi),%ymm6			// message words 0..7
	vmovdqu		0x20(%rsi),%ymm7		// message words 8..15
	addq		$0x40,%rsi			// advance to the next message block
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl			// 10 rounds
.Lblake2s_compress_avx512_roundloop:
	// Load this round's 16 indices and use them to pick words out of
	// the 16-word table formed by %ymm6 and %ymm7.  The permuted words
	// are written back to %ymm6/%ymm7, so the next SIGMA2 row is applied
	// on top of this round's arrangement.
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7

	// --- column step, first half: low 4 words of %ymm8 ---
	vpaddd		%xmm8,%xmm0,%xmm0		// a += m
	vpaddd		%xmm1,%xmm0,%xmm0		// a += b
	vpxor		%xmm0,%xmm3,%xmm3		// d ^= a
	vprord		$0x10,%xmm3,%xmm3		// d = ror32(d, 16)
	vpaddd		%xmm3,%xmm2,%xmm2		// c += d
	vpxor		%xmm2,%xmm1,%xmm1		// b ^= c
	vprord		$0xc,%xmm1,%xmm1		// b = ror32(b, 12)

	// --- column step, second half: high 4 words of %ymm8 ---
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0		// a += m
	vpaddd		%xmm1,%xmm0,%xmm0		// a += b
	vpxor		%xmm0,%xmm3,%xmm3		// d ^= a
	vprord		$0x8,%xmm3,%xmm3		// d = ror32(d, 8)
	vpaddd		%xmm3,%xmm2,%xmm2		// c += d
	vpxor		%xmm2,%xmm1,%xmm1		// b ^= c
	vprord		$0x7,%xmm1,%xmm1		// b = ror32(b, 7)

	// Rotate the lanes of rows a, d and c (row b stays put) so that the
	// next two halves operate on the diagonals.
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2

	// --- diagonal step, first half: low 4 words of %ymm9 ---
	vpaddd		%xmm9,%xmm0,%xmm0		// a += m
	vpaddd		%xmm1,%xmm0,%xmm0		// a += b
	vpxor		%xmm0,%xmm3,%xmm3		// d ^= a
	vprord		$0x10,%xmm3,%xmm3		// d = ror32(d, 16)
	vpaddd		%xmm3,%xmm2,%xmm2		// c += d
	vpxor		%xmm2,%xmm1,%xmm1		// b ^= c
	vprord		$0xc,%xmm1,%xmm1		// b = ror32(b, 12)

	// --- diagonal step, second half: high 4 words of %ymm9 ---
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0		// a += m
	vpaddd		%xmm1,%xmm0,%xmm0		// a += b
	vpxor		%xmm0,%xmm3,%xmm3		// d ^= a
	vprord		$0x8,%xmm3,%xmm3		// d = ror32(d, 8)
	vpaddd		%xmm3,%xmm2,%xmm2		// c += d
	vpxor		%xmm2,%xmm1,%xmm1		// b ^= c
	vprord		$0x7,%xmm1,%xmm1		// b = ror32(b, 7)

	// Undo the lane rotation applied before the diagonal step.
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2

	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop

	// Feed-forward: new chaining value = old ^ (a, b) ^ (c, d).
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop

	// Write the state back once, after the last block.
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper					// ymm registers were written above
.Lblake2s_compress_avx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
253
254