GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/blake2b-neon-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
 * processors that have NEON support but not the ARMv8 Crypto Extensions,
 * typically this BLAKE2b implementation is much faster than the SHA-2 family
 * and slightly faster than SHA-1.
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	CTX		.req	r0
	DATA		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction. This is the most efficient way to implement these
	// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
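	// (Why these tables work, restated here as a sketch rather than taken
	// from the original comment: for a little-endian 64-bit value,
	// ror64(x, 8*k) makes output byte j equal to input byte (j + k) % 8,
	// so the ror24 table lists (j + 3) % 8 and the ror16 table lists
	// (j + 2) % 8 for j = 0..7, which is the index form vtbl.8 consumes.)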
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
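	//
	// For reference (restating the G function from RFC 7693, not part of
	// the original comment), each column gets one G application:
	//	a += b + m[sigma[r][2*i+0]];  d = ror64(d ^ a, 32);  c += d;
	//	b = ror64(b ^ c, 24);
	//	a += b + m[sigma[r][2*i+1]];  d = ror64(d ^ a, 16);  c += d;
	//	b = ror64(b ^ c, 63);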

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7
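	// (vrev64.32 swaps the two 32-bit halves of each 64-bit lane, which
	// is exactly ror64(x, 32) = (x >> 32) | (x << 32), so this rotation
	// needs no shift pair or table.)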

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0. Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
	.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]
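	// (ror64(x, 63) is rol64(x, 1): vshr.u64 #63 puts x >> 63 in the
	// destination, then vsli.u64 #1 shifts x left by one and inserts it,
	// yielding (x << 1) | (x >> 63).)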

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers. We use the
	// latter approach, as it performs much better on Cortex-A7.
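	//
	// (Since q0 = {d0, d1} = v[0..1], q1 = {d2, d3} = v[2..3], and so on,
	// the first diagonal (v[0], v[5], v[10], v[15]) is just
	// (d0, d5, d10, d15), which the d-register operations below address
	// directly.)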

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
	.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
	.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_ctx *ctx,
//			      const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_ctx are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
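// A typical call from C looks roughly like this (an illustrative sketch;
// the exact struct definition and the 128-byte block-size constant come from
// the kernel's BLAKE2b headers, not from this file):
//
//	blake2b_compress_neon(ctx, data, nblocks, BLAKE2B_BLOCK_SIZE);
//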
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip
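	// ('bic ip, ip, #31' clears the low five address bits, rounding the
	// new stack pointer down to a 32-byte boundary.)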

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, CTX
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
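	// (Fast path: the 32-bit increment didn't carry out of the low 32
	// bits of t[0], so only t[0] has to be stored back; the slow path
	// below propagates the carry through all 128 counter bits.)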
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'. Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers. It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed. (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
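	// (The 32-byte alignment is what permits the :256 qualifiers on the
	// stack-buffer accesses below, which promise 256-bit alignment to
	// vld1/vst1.)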
	vld1.8		{q8-q9}, [DATA]!
	veor		q6, q6, q14		// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [DATA]!
	veor		q7, q7, q15		// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [DATA]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, CTX
	vld1.8		{q14-q15}, [DATA]!

	// Execute the rounds. Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, \
			final=1
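	// (BLAKE2b always runs 12 rounds, and its sigma message schedule has
	// period 10, so the last two rounds above reuse the message orders of
	// the first two.)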

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, CTX
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	vmov		r9, r10, d29		// Copy t[1] to (r9, r10)
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)