GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/sha1-armv7-neon.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
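/* Byte offsets of the five 32-bit chaining values h0..h4 within
 * struct sha1_block_state (passed in r0).
 */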


/* Constants */

#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
#define K4 0xCA62C1D6
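/* Each round constant is replicated into all four 32-bit lanes of a
 * q register so that a single vadd.u32 adds K to four schedule words
 * at once.
 */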
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr
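/* RWK walks the on-stack W+K ring buffer; lr is free for this between
 * the prologue push and the final pop.
 */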

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1
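/* W0-W7 hold a sliding window of the message schedule, four 32-bit
 * words per q register.  The window advances by rotating the macro
 * arguments (W, W_m04, ..., W_m28) every four rounds rather than by
 * moving data between registers.
 */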

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...) code
#endif
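/* ARM_LE() emits its argument only on little-endian builds: SHA-1
 * operates on big-endian words, so the input must be byte-swapped
 * with vrev32.8 unless the CPU is already big-endian.
 */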

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

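/* The W[i]+K values are staged in a 16-word ring buffer on the stack;
 * WK_offs(i) maps round index i to its byte offset within the ring.
 *
 * Each round computes
 *	e += rol32(a, 5) + f(b, c, d) + W[i] + K;  b = rol32(b, 30);
 * where f depends on the round group:
 *	F1 (rounds  0-19): Ch(b,c,d)  = (b & c) | (~b & d)
 *	F2 (rounds 20-39): parity     = b ^ c ^ d
 *	F3 (rounds 40-59): Maj(b,c,d) = (b & c) | (b & d) | (c & d)
 *	F4 (rounds 60-79): same as F2 (hence _R_F4 expands to _R_F2)
 * The pre1/pre2/pre3 arguments name message-schedule precalculation
 * steps that are interleaved between the scalar round instructions.
 */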
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
	   W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
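/* R() is a plain round: all three precalc hooks are left empty. */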

#define dummy(...)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

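/* For rounds 0-15, W[i] is simply the input block: load 16 words,
 * byte-swap them on little-endian CPUs, add K1 and store the results
 * into the stack ring buffer.  W_PRECALC_00_15() does this in one go
 * for the first block; the WPRECALC_00_15_n steps below issue the same
 * instructions piecemeal so they can be interleaved with rounds 64-79
 * of the previous block.
 */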
#define W_PRECALC_00_15() \
	add RWK, sp, #(WK_offs(0)); \
	\
	vld1.32 {W0, W7}, [RDATA]!; \
 ARM_LE(vrev32.8 W0, W0; )	/* big => little */ \
	vld1.32 {W6, W5}, [RDATA]!; \
	vadd.u32 tmp0, W0, curK; \
 ARM_LE(vrev32.8 W7, W7; )	/* big => little */ \
 ARM_LE(vrev32.8 W6, W6; )	/* big => little */ \
	vadd.u32 tmp1, W7, curK; \
 ARM_LE(vrev32.8 W5, W5; )	/* big => little */ \
	vadd.u32 tmp2, W6, curK; \
	vst1.32 {tmp0, tmp1}, [RWK]!; \
	vadd.u32 tmp3, W5, curK; \
	vst1.32 {tmp2, tmp3}, [RWK]; \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32 {W0, W7}, [RDATA]!; \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(0)); \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W0, W0; )	/* big => little */ \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32 {W6, W5}, [RDATA]!; \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W0, curK; \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W7, W7; )	/* big => little */ \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W6, W6; )	/* big => little */ \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp1, W7, curK; \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W5, W5; )	/* big => little */ \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp2, W6, curK; \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0, tmp1}, [RWK]!; \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp3, W5, curK; \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp2, tmp3}, [RWK]; \


/********* Precalc macros for rounds 16-31 ************************************/

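/* Rounds 16-31: W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1),
 * computed four words at a time.  W[i-3] for the fourth lane is W[i]
 * itself, produced in the same vector, so that lane is first computed
 * without it and the missing term is folded in afterwards as a
 * rotate-by-2 of the lane-0 result (rol1(rol1(x)) == rol2(x)).
 */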
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0; \
	vext.8 W, W_m16, W_m12, #8; \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i)); \
	vext.8 tmp0, W_m04, tmp0, #4; \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0, W_m16; \
	veor.32 W, W, W_m08; \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp1, tmp1; \
	veor W, W, tmp0; \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp0, W, #1; \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp1, tmp1, W, #(16-12); \
	vshr.u32 W, W, #31; \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr tmp0, tmp0, W; \
	vshr.u32 W, tmp1, #30; \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, tmp1, #2; \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0, W; \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0, tmp1; \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

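/* Rounds 32-79 use the equivalent recurrence
 * W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2),
 * obtained by unrolling the basic one once.  Every source word is then
 * at least a full vector old, so no in-vector fix-up is needed.
 */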
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/*
 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
 *
 * void sha1_transform_neon(struct sha1_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
.align 3
ENTRY(sha1_transform_neon)
	/* input:
	 *	r0: state
	 *	r1: data (64*nblocks bytes)
	 *	r2: nblocks
	 */
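	/* The code below is software-pipelined: rounds 0-63 of the
	 * current block are interleaved with schedule precalculation for
	 * rounds 16-79, and rounds 64-79 are interleaved with the 0-15
	 * precalculation of the next block.
	 */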

	cmp RNBLKS, #0;
	beq .Ldo_nothing;

	push {r4-r12, lr};
	/*vpush {q4-q7};*/
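	/* q4-q7 would be callee-saved under the AAPCS, but in-kernel
	 * NEON code runs between kernel_neon_begin()/kernel_neon_end(),
	 * which preserve the NEON register file, so the save is skipped.
	 */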

	adr RT3, .LK_VEC;

	mov ROLDSTACK, sp;

	/* Align stack. */
	sub RT0, sp, #(16*4);
	and RT0, #(~(16-1));
	mov sp, RT0;
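	/* The 16*4 bytes just reserved hold the W+K ring buffer; the
	 * 16-byte alignment keeps the q-register stores aligned.
	 */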

	vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

	/* Get the values of the chaining variables. */
	ldm RSTATE, {_a-_e};

	vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
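/* curK is re-pointed at qK1..qK4 as the rounds advance, so the shared
 * precalc macros always add the round constant of the group currently
 * being scheduled.
 */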
	/* Precalc 0-15. */
	W_PRECALC_00_15();

.Loop:
	/* Transform 0-15 + Precalc 16-31. */
	_R( _a, _b, _c, _d, _e, F1, 0,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 1,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1, 2,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1, 3,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 16,
	    W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
	_R( _b, _c, _d, _e, _a, F1, 4,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 5,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 6,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1, 7,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 20,
	    W3, W4, W5, W6, W7, _, _, _ );

	_R( _c, _d, _e, _a, _b, F1, 8,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1, 9,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 10,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 11,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 24,
	    W2, W3, W4, W5, W6, _, _, _ );

	_R( _d, _e, _a, _b, _c, F1, 12,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1, 13,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1, 14,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 15,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 28,
	    W1, W2, W3, W4, W5, _, _, _ );

	/* Transform 16-63 + Precalc 32-79. */
	_R( _e, _a, _b, _c, _d, F1, 16,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _d, _e, _a, _b, _c, F1, 17,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _c, _d, _e, _a, _b, F1, 18,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F1, 19,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _a, _b, _c, _d, _e, F2, 20,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _e, _a, _b, _c, _d, F2, 21,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _d, _e, _a, _b, _c, F2, 22,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F2, 23,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
	_R( _b, _c, _d, _e, _a, F2, 24,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _a, _b, _c, _d, _e, F2, 25,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _e, _a, _b, _c, _d, F2, 26,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F2, 27,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);

	_R( _c, _d, _e, _a, _b, F2, 28,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _b, _c, _d, _e, _a, F2, 29,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _a, _b, _c, _d, _e, F2, 30,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F2, 31,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	_R( _d, _e, _a, _b, _c, F2, 32,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _c, _d, _e, _a, _b, F2, 33,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _b, _c, _d, _e, _a, F2, 34,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _a, _b, _c, _d, _e, F2, 35,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);

	_R( _e, _a, _b, _c, _d, F2, 36,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _d, _e, _a, _b, _c, F2, 37,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _c, _d, _e, _a, _b, F2, 38,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _b, _c, _d, _e, _a, F2, 39,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);

	_R( _a, _b, _c, _d, _e, F3, 40,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _e, _a, _b, _c, _d, F3, 41,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _d, _e, _a, _b, _c, F3, 42,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _c, _d, _e, _a, _b, F3, 43,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
	_R( _b, _c, _d, _e, _a, F3, 44,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _a, _b, _c, _d, _e, F3, 45,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _e, _a, _b, _c, _d, F3, 46,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _d, _e, _a, _b, _c, F3, 47,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);

	_R( _c, _d, _e, _a, _b, F3, 48,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F3, 49,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _a, _b, _c, _d, _e, F3, 50,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _e, _a, _b, _c, _d, F3, 51,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _d, _e, _a, _b, _c, F3, 52,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F3, 53,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _b, _c, _d, _e, _a, F3, 54,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _a, _b, _c, _d, _e, F3, 55,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);

	_R( _e, _a, _b, _c, _d, F3, 56,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F3, 57,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _c, _d, _e, _a, _b, F3, 58,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _b, _c, _d, _e, _a, F3, 59,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);

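	/* Decrement the block counter early; no flag-setting instruction
	 * occurs between here and the beq below, so the Z flag survives
	 * rounds 60-63.
	 */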
	subs RNBLKS, #1;

	_R( _a, _b, _c, _d, _e, F4, 60,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F4, 61,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _d, _e, _a, _b, _c, F4, 62,
	    WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _c, _d, _e, _a, _b, F4, 63,
	    WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	beq .Lend;

	/* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
	_R( _b, _c, _d, _e, _a, F4, 64,
	    WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 65,
	    WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 66,
	    WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 67,
	    WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _c, _d, _e, _a, _b, F4, 68,
	    dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 69,
	    dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 70,
	    WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 71,
	    WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _d, _e, _a, _b, _c, F4, 72,
	    dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 73,
	    dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 74,
	    WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 75,
	    WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _e, _a, _b, _c, _d, F4, 76,
	    WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 77,
	    WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 78,
	    WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 79,
	    WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

	/* Update the chaining variables. */
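	/* (Davies-Meyer feed-forward: each saved state word is added
	 * back into the corresponding working variable.) */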
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	add _e, RT0;
	stm RSTATE, {_a-_e};

	b .Loop;

.Lend:
	/* Transform 64-79 */
	R( _b, _c, _d, _e, _a, F4, 64 );
	R( _a, _b, _c, _d, _e, F4, 65 );
	R( _e, _a, _b, _c, _d, F4, 66 );
	R( _d, _e, _a, _b, _c, F4, 67 );
	R( _c, _d, _e, _a, _b, F4, 68 );
	R( _b, _c, _d, _e, _a, F4, 69 );
	R( _a, _b, _c, _d, _e, F4, 70 );
	R( _e, _a, _b, _c, _d, F4, 71 );
	R( _d, _e, _a, _b, _c, F4, 72 );
	R( _c, _d, _e, _a, _b, F4, 73 );
	R( _b, _c, _d, _e, _a, F4, 74 );
	R( _a, _b, _c, _d, _e, F4, 75 );
	R( _e, _a, _b, _c, _d, F4, 76 );
	R( _d, _e, _a, _b, _c, F4, 77 );
	R( _c, _d, _e, _a, _b, F4, 78 );
	R( _b, _c, _d, _e, _a, F4, 79 );

	mov sp, ROLDSTACK;

	/* Update the chaining variables. */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	/*vpop {q4-q7};*/
	add _e, RT0;
	stm RSTATE, {_a-_e};

	pop {r4-r12, pc};

.Ldo_nothing:
	bx lr
ENDPROC(sha1_transform_neon)