// Source: GitHub repository torvalds/linux, path blob/master/lib/crypto/x86/blake2s-core.S
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

// BLAKE2s initialization vector, iv[0..7], as two 16-byte halves.
// .octa is stored little-endian, so the lowest 32 bits of the first value
// (0x6A09E667) are iv[0] and the highest 32 bits of the second value
// (0x5BE0CD19) are iv[7].  Both compress functions read the halves as
// .Liv (iv[0..3]) and .Liv+16 (iv[4..7]) with aligned loads, hence the
// 32-byte alignment.
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
.Liv:
	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

// pshufb byte-shuffle mask used by the SSSE3 code to rotate every 32-bit
// lane right by 16 bits: within each lane the destination bytes 0,1,2,3
// are taken from source bytes 2,3,0,1.
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
.Lror16:
	.octa 0x0D0C0F0E09080B0A0504070601000302

// pshufb byte-shuffle mask used by the SSSE3 code to rotate every 32-bit
// lane right by 8 bits: within each lane the destination bytes 0,1,2,3
// are taken from source bytes 1,2,3,0.
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
.Lror8:
	.octa 0x0C0F0E0D080B0A090407060500030201

// Message-word schedule for blake2s_compress_ssse3: 10 rows (one per round)
// of 16 one-byte indices, 160 bytes in total.  Each index selects a 32-bit
// word of the 64-byte input block.  A row is consumed as four groups of four
// indices, in exactly the order the round loop gathers them (bytes 0-3,
// 4-7, 8-11, 12-15 of the row), so the entries are already arranged for the
// vectorized round rather than listed in natural word order.
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
.Lsigma:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12

// Message-word schedule for blake2s_compress_avx512: 10 rows (one per round)
// of 16 one-byte indices, 160 bytes in total.  The AVX-512 round loop
// permutes the 16 message words with vpermi2d and then keeps the permuted
// words as the input of the next round's permutation, so each row indexes
// into the previous round's arrangement rather than into the original block.
// Row 0 is applied to the block in its natural order and is therefore
// identical to row 0 of .Lsigma.
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
.Lsigma2:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9

// Symbolic names for the four integer argument registers of the compress
// functions, in the order (ctx, data, nblocks, inc) given by their C
// prototypes.
//
// INC is only valid on entry: both functions copy it into a vector register
// first and later reuse %rcx / %cl as a scratch pointer or round counter.
#define CTX		%rdi
#define DATA		%rsi
#define NBLOCKS		%rdx
#define INC		%ecx

.text
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
//			       const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Compresses nblocks consecutive 64-byte blocks starting at data into the
// hash state h, adding inc to the 64-bit counter t before each block.
//
// nblocks must be at least 1: the block loop tests NBLOCKS only after the
// body has run, so nblocks == 0 would wrap the counter instead of returning.
//
// Register use:
//	%xmm0..%xmm3	v[0..3], v[4..7], v[8..11], v[12..15] (working state)
//	%xmm4..%xmm7	message words gathered for the current quarter-step
//	%xmm8		temporary for the shift-based rotates
//	%xmm10,%xmm11	copy of h[0..3], h[4..7] from before the current block
//	%xmm12,%xmm13	pshufb masks for rotate-right by 16 and by 8
//	%xmm14		[t, f];  %xmm15 = inc
//	%rcx		pointer to the current .Lsigma row (overwrites INC)
//	%r8		end of the .Lsigma table (.Lsigma + 160)
//	%rax		zero-extended message word index
// DATA and NBLOCKS are modified.  No stack is used.
//
SYM_FUNC_START(blake2s_compress_ssse3)
	movdqu		(CTX),%xmm0		// Load h[0..3]
	movdqu		16(CTX),%xmm1		// Load h[4..7]
	movdqa		.Lror16(%rip),%xmm12
	movdqa		.Lror8(%rip),%xmm13
	movdqu		32(CTX),%xmm14		// Load t and f
	movd		INC,%xmm15		// Load inc
	leaq		.Lsigma+160(%rip),%r8	// End of the 10-row sigma table
	jmp		.Lssse3_mainloop	// Skip the alignment padding

	.align		32
.Lssse3_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	movdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	movdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	paddq		%xmm15,%xmm14		// t += inc (64-bit addition)
	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
	movdqa		%xmm14,%xmm3
	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	leaq		.Lsigma(%rip),%rcx	// Start at the first sigma row

.Lssse3_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Gather the message words selected by sigma row bytes 0..3.
	movzbl		(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		1(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		2(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		3(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		// Rotate right by 16
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1		// Rotate right by 12 ...
	pslld		$20,%xmm8
	por		%xmm8,%xmm1		// ... as (x >> 12) | (x << 20)

	// Gather the message words selected by sigma row bytes 4..7.
	movzbl		4(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		5(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		6(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		7(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		// Rotate right by 8
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1		// Rotate right by 7 ...
	pslld		$25,%xmm8
	por		%xmm8,%xmm1		// ... as (x >> 7) | (x << 25)

	// Rotate the lanes of rows 0, 3 and 2 (row 1 stays in place) so that
	// the next two quarter-steps operate on the diagonals of the state.
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2

	// Gather the message words selected by sigma row bytes 8..11.
	movzbl		8(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		9(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		10(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		11(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3		// Rotate right by 16
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1		// Rotate right by 12
	pslld		$20,%xmm8
	por		%xmm8,%xmm1

	// Gather the message words selected by sigma row bytes 12..15.
	movzbl		12(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		13(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		14(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		15(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3		// Rotate right by 8
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1		// Rotate right by 7
	pslld		$25,%xmm8
	por		%xmm8,%xmm1

	// Undo the lane rotation so the rows are back in column order.
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2

	addq		$16,%rcx		// Advance to the next sigma row
	cmpq		%r8,%rcx
	jnz		.Lssse3_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$64,DATA
	decq		NBLOCKS
	jnz		.Lssse3_mainloop

	movdqu		%xmm0,(CTX)		// Store new h[0..3]
	movdqu		%xmm1,16(CTX)		// Store new h[4..7]
	movq		%xmm14,32(CTX)		// Store new t (f is unchanged)
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
//				const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Compresses nblocks consecutive 64-byte blocks starting at data into the
// hash state h, adding inc to the 64-bit counter t before each block.
// Uses AVX-512 instructions (vprord, vpermi2d, vpternlogd) on xmm/ymm
// registers.
//
// nblocks must be at least 1: the block loop tests NBLOCKS only after the
// body has run, so nblocks == 0 would wrap the counter instead of returning.
//
// Register use:
//	%xmm0..%xmm3	v[0..3], v[4..7], v[8..11], v[12..15] (working state)
//	%xmm4		[t, f];  %xmm5 = inc
//	%ymm6,%ymm7	the 16 message words, re-permuted every round
//	%ymm8,%ymm9	sigma indices, then the permuted message words
//	%xmm10,%xmm11	copy of h[0..3], h[4..7] from before the current block
//	%xmm14,%xmm15	iv[0..3], iv[4..7]
//	%rax		pointer to the current .Lsigma2 row
//	%cl		rounds remaining (overwrites INC)
// DATA and NBLOCKS are modified.  No stack is used.
//
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(CTX),%xmm0		// Load h[0..3]
	vmovdqu		16(CTX),%xmm1		// Load h[4..7]
	vmovdqu		32(CTX),%xmm4		// Load t and f
	vmovd		INC,%xmm5		// Load inc
	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
	jmp		.Lavx512_mainloop	// Skip the alignment padding

	.align		32
.Lavx512_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	vmovdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	vmovdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
	vmovdqa		%xmm14,%xmm2		// v[8..11] = iv[0..3]
	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	vmovdqu		(DATA),%ymm6		// Load first 8 data words
	vmovdqu		32(DATA),%ymm7		// Load second 8 data words
	addq		$64,DATA
	leaq		.Lsigma2(%rip),%rax
	movb		$10,%cl			// Set num rounds remaining

.Lavx512_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Expand the 16 one-byte indices of this sigma row to dwords, then
	// use them to permute the message words held in %ymm6:%ymm7.
	vpmovzxbd	(%rax),%ymm8
	vpmovzxbd	8(%rax),%ymm9
	addq		$16,%rax
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	// Keep the permuted words: the next row of .Lsigma2 indexes into
	// this arrangement, not into the original block.
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7

	// Quarter-step using permuted words 0..3 (low half of %ymm8).
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$12,%xmm1,%xmm1

	// Quarter-step using permuted words 4..7 (high half of %ymm8).
	vextracti128	$1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1

	// Rotate the lanes of rows 0, 3 and 2 (row 1 stays in place) so that
	// the next two quarter-steps operate on the diagonals of the state.
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2

	// Quarter-step using permuted words 8..11 (low half of %ymm9).
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$16,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$12,%xmm1,%xmm1

	// Quarter-step using permuted words 12..15 (high half of %ymm9).
	vextracti128	$1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$7,%xmm1,%xmm1

	// Undo the lane rotation so the rows are back in column order.
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lavx512_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	// (imm8 0x96 makes vpternlogd a three-input XOR)
	vpternlogd	$0x96,%xmm10,%xmm2,%xmm0
	vpternlogd	$0x96,%xmm11,%xmm3,%xmm1
	decq		NBLOCKS
	jne		.Lavx512_mainloop

	vmovdqu		%xmm0,(CTX)		// Store new h[0..3]
	vmovdqu		%xmm1,16(CTX)		// Store new h[4..7]
	vmovq		%xmm4,32(CTX)		// Store new t (f is unchanged)
	vzeroupper				// ymm registers were used above
	RET
SYM_FUNC_END(blake2s_compress_avx512)