/* Source: torvalds/linux — blob/master/lib/crypto/arm64/sm3-neon-core.S */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <[email protected]>
 * Copyright (c) 2022 Tianjia Zhang <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Context structure: byte offsets of the eight 32-bit state words. */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/* Stack structure */

#define STACK_W_SIZE	(32 * 2 * 3)

#define STACK_W		(0)
#define STACK_SIZE	(STACK_W + STACK_W_SIZE)

/* Register macros */

#define RSTATE x0
#define RDATA  x1
#define RNBLKS x2
#define RKPTR  x28
#define RFRAME x29

/* The eight SM3 working variables a..h. */
#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

/* Scratch registers. */
#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

#define k_even w19
#define k_odd w20

#define addr0 x21
#define addr1 x22

#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20

/* Helper macros. */

/* Empty placeholder argument for the IOP/K_LOAD macro slots. */
#define _(...) /*_*/

#define clear_vec(x) \
	movi x.8h, #0;

/* Rotate-left by n, implemented as rotate-right by (32 - n). */
#define rolw(o, a, n) \
	ror o, a, #(32 - n);

/* Round function macros.
 * GG1/FF1 are the XOR-based boolean functions for rounds 0-15;
 * GG2/FF2 are the majority/choice-style functions for rounds 16-63.
 * Each is split into up to three steps so it can be interleaved
 * with other work inside the R macro. */

#define GG1_1(x, y, z, o, t) \
	eor o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor o, o, z;
#define GG1_3(x, y, z, o, t)

#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

#define GG2_1(x, y, z, o, t) \
	bic o, z, x;
#define GG2_2(x, y, z, o, t) \
	and t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor o, o, t;

#define FF2_1(x, y, z, o, t) \
	eor o, x, y;
#define FF2_2(x, y, z, o, t) \
	and t, x, y; \
	and o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor o, o, t;

/* One SM3 round, interleaved with K_LOAD (round-constant load) and up to
 * eight IOP steps (message-schedule or block-load work for later rounds). */
#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round); \
	ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
	rolw(t0, a, 12); /* rol(a, 12) => t0 */ \
	IOP(1, iop_param); \
	FF##i##_1(a, b, c, t1, t2); \
	ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
	add k, k, e; \
	IOP(2, iop_param); \
	GG##i##_1(e, f, g, t3, t4); \
	FF##i##_2(a, b, c, t1, t2); \
	IOP(3, iop_param); \
	add k, k, t0; \
	add h, h, t5; \
	add d, d, t6; /* w1w2 + d => d */ \
	IOP(4, iop_param); \
	rolw(k, k, 7); /* rol (t0 + e + t), 7) => k */ \
	GG##i##_2(e, f, g, t3, t4); \
	add h, h, k; /* h + w1 + k => h */ \
	IOP(5, iop_param); \
	FF##i##_3(a, b, c, t1, t2); \
	eor t0, t0, k; /* k ^ t0 => t0 */ \
	GG##i##_3(e, f, g, t3, t4); \
	add d, d, t1; /* FF(a,b,c) + d => d */ \
	IOP(6, iop_param); \
	add t3, t3, h; /* GG(e,f,g) + h => t3 */ \
	rolw(b, b, 9); /* rol(b, 9) => b */ \
	eor h, t3, t3, ror #(32-9); \
	IOP(7, iop_param); \
	add d, d, t0; /* t0 + d => d */ \
	rolw(f, f, 19); /* rol(f, 19) => f */ \
	IOP(8, iop_param); \
	eor h, h, t3, ror #(32-17); /* P0(t3) => h */

#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/* Load two consecutive round constants from .LKtable. */
#define KL(round) \
	ldp k_even, k_odd, [RKPTR, #(4*(round))];

/* Input expansion macros. */

/* Byte-swapped input address. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/* Expanded input address. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 1-12, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)

/* Rounds 1-12, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)

/* Input block loading.
 * Interleaving within round function needed for in-order CPUs. */
#define LOAD_W_VEC_1_1() \
	add addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1 {W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1 {W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1 {W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1 {W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32 XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32 XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32 XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32 XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1 {XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1 {XTMP4.16b}, [addr0]; \
	add addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1 {XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1 {XTMP5.16b}, [addr1]; \
	ext W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1 {XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1 {XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */

#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()

/* Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs. */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */            \
	/* Load (w[i - 13]) => XTMP5 */            \
	ext XTMP0.16b, w0.16b, w0.16b, #12;        /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP0.16b, XTMP0.16b, w1.16b, #12;     /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */                       \
	/* W3 ^ XTMP0 => XTMP0 */                  \
	eor XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */                       \
	/* rol(XMM5, 15) ^ XTMP0 => XTMP0 */       \
	/* rol(XTMP5, 7) => XTMP1 */               \
	add addr0, sp, #XW_W1_ADDR((round), 0);    \
	shl XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */                       \
	/* W4 ^ XTMP1 => XTMP1 */                  \
	eor XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */              \
	shl XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */             \
	ext XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */                     \
	eor XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1 {XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)

/* Schedule-step wrappers for each rotation of the six W registers. */
#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
/*
 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at 'data'.
 *
 * void sm3_neon_transform(struct sm3_block_state *state,
 *                         const u8 *data, size_t nblocks)
 *
 * In:   x0 (RSTATE)  = state, x1 (RDATA) = data, x2 (RNBLKS) = nblocks
 * Uses: x19-x26, x28 (saved), x29 as frame pointer, v0-v7, v16-v20.
 */
.text
.align 3
SYM_FUNC_START(sm3_neon_transform)
	ldp		ra, rb, [RSTATE, #0]
	ldp		rc, rd, [RSTATE, #8]
	ldp		re, rf, [RSTATE, #16]
	ldp		rg, rh, [RSTATE, #24]

	/* Save callee-saved registers used below. */
	stp		x28, x29, [sp, #-16]!
	stp		x19, x20, [sp, #-16]!
	stp		x21, x22, [sp, #-16]!
	stp		x23, x24, [sp, #-16]!
	stp		x25, x26, [sp, #-16]!
	mov		RFRAME, sp

	/* Carve out the W work area and align sp down to 64 bytes. */
	sub		addr0, sp, #STACK_SIZE
	adr_l		RKPTR, .LKtable
	and		sp, addr0, #(~63)

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

.balign 16
.Loop:
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
	subs		RNBLKS, RNBLKS, #1
	b.eq		.Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _)
	ldp		s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp		s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables (XOR per SM3 Davies-Meyer step).
	 * k_even/k_odd are free here and reused as extra scratch. */
	eor		ra, ra, s0
	eor		rb, rb, s1
	ldp		s0, s1, [RSTATE, #16]
	eor		rc, rc, s2
	ldp		k_even, k_odd, [RSTATE, #24]
	eor		rd, rd, s3
	eor		re, re, s0
	stp		ra, rb, [RSTATE, #0]
	eor		rf, rf, s1
	stp		rc, rd, [RSTATE, #8]
	eor		rg, rg, k_even
	stp		re, rf, [RSTATE, #16]
	eor		rh, rh, k_odd
	stp		rg, rh, [RSTATE, #24]
	b		.Loop

.Lend:
	/* Transform 61-63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _)
	ldp		s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp		s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _)

	/* Update the chaining variables, interleaved with clearing the
	 * vector registers that held message material. */
	eor		ra, ra, s0
	clear_vec(W0)
	eor		rb, rb, s1
	clear_vec(W1)
	ldp		s0, s1, [RSTATE, #16]
	clear_vec(W2)
	eor		rc, rc, s2
	clear_vec(W3)
	ldp		k_even, k_odd, [RSTATE, #24]
	clear_vec(W4)
	eor		rd, rd, s3
	clear_vec(W5)
	eor		re, re, s0
	clear_vec(XTMP0)
	stp		ra, rb, [RSTATE, #0]
	clear_vec(XTMP1)
	eor		rf, rf, s1
	clear_vec(XTMP2)
	stp		rc, rd, [RSTATE, #8]
	clear_vec(XTMP3)
	eor		rg, rg, k_even
	clear_vec(XTMP4)
	stp		re, rf, [RSTATE, #16]
	clear_vec(XTMP5)
	eor		rh, rh, k_odd
	clear_vec(XTMP6)
	stp		rg, rh, [RSTATE, #24]

	/* Clear message expansion area (3 * 64 bytes of zeroes). */
	add		addr0, sp, #STACK_W
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0]

	mov		sp, RFRAME

	/* Restore callee-saved registers and return. */
	ldp		x25, x26, [sp], #16
	ldp		x23, x24, [sp], #16
	ldp		x21, x22, [sp], #16
	ldp		x19, x20, [sp], #16
	ldp		x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)
.section ".rodata", "a"

/* Per-round constants, loaded pairwise via KL(). Within each group of
 * entries, every value is the previous one rotated left by one bit. */
.align 4
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5