GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/blake2b-neon-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
 * processors that have NEON support but not the ARMv8 Crypto Extensions,
 * typically this BLAKE2b implementation is much faster than the SHA-2 family
 * and slightly faster than SHA-1.
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	CTX		.req	r0
	DATA		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction. This is the most efficient way to implement these
	// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
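	// (Why these tables work, restated here as a sketch rather than taken
	// from the original comment: for a little-endian 64-bit value,
	// ror64(x, 8*k) makes output byte j equal to input byte (j + k) % 8,
	// so the ror24 table lists (j + 3) % 8 and the ror16 table lists
	// (j + 2) % 8 for j = 0..7, which is the index form vtbl.8 consumes.)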
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
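	//
	// For reference (restating the G function from RFC 7693, not part of
	// the original comment), each column gets one G application:
	//	a += b + m[sigma[r][2*i+0]];  d = ror64(d ^ a, 32);  c += d;
	//	b = ror64(b ^ c, 24);
	//	a += b + m[sigma[r][2*i+1]];  d = ror64(d ^ a, 16);  c += d;
	//	b = ror64(b ^ c, 63);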

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7
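	// (vrev64.32 swaps the two 32-bit halves of each 64-bit lane, which
	// is exactly ror64(x, 32) = (x >> 32) | (x << 32), so this rotation
	// needs no shift pair or table.)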

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0. Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
	.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]
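	// (ror64(x, 63) is rol64(x, 1): vshr.u64 #63 puts x >> 63 in the
	// destination, then vsli.u64 #1 shifts x left by one and inserts it,
	// yielding (x << 1) | (x >> 63).)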

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers. We use the
	// latter approach, as it performs much better on Cortex-A7.
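	//
	// (Since q0 = {d0, d1} = v[0..1], q1 = {d2, d3} = v[2..3], and so on,
	// the first diagonal (v[0], v[5], v[10], v[15]) is just
	// (d0, d5, d10, d15), which the d-register operations below address
	// directly.)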

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
	.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
	.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_ctx *ctx,
//			      const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_ctx are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
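// A typical call from C looks roughly like this (an illustrative sketch;
// the exact struct definition and the 128-byte block-size constant come from
// the kernel's BLAKE2b headers, not from this file):
//
//	blake2b_compress_neon(ctx, data, nblocks, BLAKE2B_BLOCK_SIZE);
//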
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip
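	// ('bic ip, ip, #31' clears the low five address bits, rounding the
	// new stack pointer down to a 32-byte boundary.)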

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, CTX
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
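	// (Fast path: the 32-bit increment didn't carry out of the low 32
	// bits of t[0], so only t[0] has to be stored back; the slow path
	// below propagates the carry through all 128 counter bits.)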
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'. Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers. It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed. (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
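	// (The 32-byte alignment is what permits the :256 qualifiers on the
	// stack-buffer accesses below, which promise 256-bit alignment to
	// vld1/vst1.)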
	vld1.8		{q8-q9}, [DATA]!
	veor		q6, q6, q14		// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [DATA]!
	veor		q7, q7, q15		// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [DATA]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, CTX
	vld1.8		{q14-q15}, [DATA]!

	// Execute the rounds. Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, \
			final=1
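	// (BLAKE2b always runs 12 rounds, and its sigma message schedule has
	// period 10, so the last two rounds above reuse the message orders of
	// the first two.)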

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, CTX
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	vmov		r9, r10, d29		// Copy t[1] to (r9, r10)
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)