GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/crypto/sm3-neon-core.S
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <[email protected]>
 * Copyright (c) 2022 Tianjia Zhang <[email protected]>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28
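
/*
 * These offsets presumably index the eight 32-bit chaining words A..H
 * (state[0..7]) at the start of struct sm3_state; the code below only
 * accesses RSTATE at offsets 0..28, as four ldp/stp word pairs.
 */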

/* Stack structure */

#define STACK_W_SIZE (32 * 2 * 3)

#define STACK_W (0)
#define STACK_SIZE (STACK_W + STACK_W_SIZE)
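
/*
 * The scratch area is 192 bytes, read as three 64-byte slots.  Judging
 * from the IW_/XW_ address macros below, each slot holds a group of
 * message words together with the corresponding W[i] ^ W[i + 4] values
 * that the round function consumes from the stack.
 */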

/* Register macros */

#define RSTATE x0
#define RDATA x1
#define RNBLKS x2
#define RKPTR x28
#define RFRAME x29

#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

#define k_even w19
#define k_odd w20

#define addr0 x21
#define addr1 x22

#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20

/* Helper macros. */

#define _(...) /*_*/

#define clear_vec(x) \
	movi x.8h, #0;

#define rolw(o, a, n) \
	ror o, a, #(32 - n);

/* Round function macros. */

#define GG1_1(x, y, z, o, t) \
	eor o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor o, o, z;
#define GG1_3(x, y, z, o, t)

#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

#define GG2_1(x, y, z, o, t) \
	bic o, z, x;
#define GG2_2(x, y, z, o, t) \
	and t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor o, o, t;

#define FF2_1(x, y, z, o, t) \
	eor o, x, y;
#define FF2_2(x, y, z, o, t) \
	and t, x, y; \
	and o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor o, o, t;
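
/*
 * For reference: FF1/GG1 compute x ^ y ^ z (SM3 rounds 0..15).  The GG2
 * steps compute (~x & z) ^ (x & y), which equals the SM3 choice function
 * (x & y) | (~x & z), and the FF2 steps compute ((x ^ y) & z) ^ (x & y),
 * which equals the majority function (x & y) | (x & z) | (y & z) used in
 * rounds 16..63.
 */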

#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round); \
	ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
	rolw(t0, a, 12); /* rol(a, 12) => t0 */ \
	IOP(1, iop_param); \
	FF##i##_1(a, b, c, t1, t2); \
	ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
	add k, k, e; \
	IOP(2, iop_param); \
	GG##i##_1(e, f, g, t3, t4); \
	FF##i##_2(a, b, c, t1, t2); \
	IOP(3, iop_param); \
	add k, k, t0; \
	add h, h, t5; \
	add d, d, t6; /* w1w2 + d => d */ \
	IOP(4, iop_param); \
	rolw(k, k, 7); /* rol((t0 + e + K[round]), 7) => k */ \
	GG##i##_2(e, f, g, t3, t4); \
	add h, h, k; /* h + w1 + k => h */ \
	IOP(5, iop_param); \
	FF##i##_3(a, b, c, t1, t2); \
	eor t0, t0, k; /* k ^ t0 => t0 */ \
	GG##i##_3(e, f, g, t3, t4); \
	add d, d, t1; /* FF(a,b,c) + d => d */ \
	IOP(6, iop_param); \
	add t3, t3, h; /* GG(e,f,g) + h => t3 */ \
	rolw(b, b, 9); /* rol(b, 9) => b */ \
	eor h, t3, t3, ror #(32-9); \
	IOP(7, iop_param); \
	add d, d, t0; /* t0 + d => d */ \
	rolw(f, f, 19); /* rol(f, 19) => f */ \
	IOP(8, iop_param); \
	eor h, h, t3, ror #(32-17); /* P0(t3) => h */
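
/*
 * One R() invocation is one SM3 compression round.  In terms of the
 * specification it computes (with t0 = rol(a, 12) and k = SS1):
 *
 *   SS1 = rol(rol(a, 12) + e + K[round], 7)
 *   SS2 = SS1 ^ rol(a, 12)
 *   d  += FF(a, b, c) + SS2 + (W[round] ^ W[round + 4])
 *   h   = P0(GG(e, f, g) + h + SS1 + W[round]), P0(x) = x ^ rol(x, 9) ^ rol(x, 17)
 *   b   = rol(b, 9), f = rol(f, 19)
 *
 * The usual A..H register rotation between rounds is done by permuting
 * the register arguments at the call sites instead of by moves.
 */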

#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define KL(round) \
	ldp k_even, k_odd, [RKPTR, #(4*(round))];

/* Input expansion macros. */

/* Byte-swapped input address. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/* Expanded input address. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 1-12, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)

/* Rounds 13-64, expanded input block addresses. */
#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)
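
/*
 * Naming: for a given round, "W1" is the message word W[round] and
 * "W1W2" is W[round] ^ W[round + 4], i.e. the W' word added into d.
 * Rounds 0..11 read byte-swapped input words via the IW_ addresses
 * (one 64-byte slot per four rounds); later rounds read expanded words
 * via the XW_ addresses, which appear to alternate between two slots.
 */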

/* Input block loading.
 * Interleaving within round function needed for in-order CPUs. */
#define LOAD_W_VEC_1_1() \
	add addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1 {W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1 {W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1 {W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1 {W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32 XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32 XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32 XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32 XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1 {XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1 {XTMP4.16b}, [addr0]; \
	add addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext W0.16b, XTMP0.16b, XTMP0.16b, #8; /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov W2.16b, XTMP1.16b; /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1 {XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1 {XTMP5.16b}, [addr1]; \
	ext W1.16b, XTMP0.16b, XTMP0.16b, #4; /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext W4.16b, XTMP2.16b, XTMP3.16b, #8; /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1 {XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1 {XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext W5.16b, XTMP3.16b, XTMP3.16b, #4; /* W5: xx, w15, w14, w13 */

#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()
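
/*
 * Taken together, the LOAD_W_VEC_* steps read one 64-byte input block,
 * byte-swap it to big-endian word order (rev32), store the twelve
 * message words and twelve W[i] ^ W[i + 4] values consumed by rounds
 * 0..11 to the stack, and leave W0..W5 holding the message words three
 * per vector (w0; w1..w3; w4..w6; w7..w9; w10..w12; w13..w15) as the
 * lane comments above note, ready for the scheduler below.
 */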

/* Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs. */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */ \
	/* Load (w[i - 13]) => XTMP5 */ \
	ext XTMP0.16b, w0.16b, w0.16b, #12; /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */ \
	/* W3 ^ XTMP0 => XTMP0 */ \
	eor XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */ \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
	/* rol(XTMP5, 7) => XTMP1 */ \
	add addr0, sp, #XW_W1_ADDR((round), 0); \
	shl XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */ \
	/* W4 ^ XTMP1 => XTMP1 */ \
	eor XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */ \
	shl XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */ \
	ext XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */ \
	eor XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1 {XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
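
/*
 * One full SCHED_W_* pass expands three message words at a time using
 * the SM3 recurrence
 *
 *   W[i] = P1(W[i-16] ^ W[i-9] ^ rol(W[i-3], 15)) ^ rol(W[i-13], 7) ^ W[i-6]
 *   P1(x) = x ^ rol(x, 15) ^ rol(x, 23)
 *
 * and stores W[round..round+2] together with W[round..round+2] ^
 * W[round+4..round+6] to the XW_ stack slot of the target rounds; the
 * freshly expanded words stay in a vector register for later passes.
 */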

#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)


/*
 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
 *
 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
 *                         int blocks)
 */
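
/*
 * On entry (AAPCS64): x0 = sst, x1 = src, x2 = blocks, matching the
 * RSTATE/RDATA/RNBLKS aliases above.  Each 64-byte block is compressed
 * and the result is XORed into the chaining state (SM3's feed-forward
 * uses XOR where SHA-2 uses addition).
 */
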
.text
.align 3
SYM_TYPED_FUNC_START(sm3_neon_transform)
	ldp ra, rb, [RSTATE, #0]
	ldp rc, rd, [RSTATE, #8]
	ldp re, rf, [RSTATE, #16]
	ldp rg, rh, [RSTATE, #24]

	stp x28, x29, [sp, #-16]!
	stp x19, x20, [sp, #-16]!
	stp x21, x22, [sp, #-16]!
	stp x23, x24, [sp, #-16]!
	stp x25, x26, [sp, #-16]!
	mov RFRAME, sp

	sub addr0, sp, #STACK_SIZE
	adr_l RKPTR, .LKtable
	and sp, addr0, #(~63)
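
	/*
	 * sp now points to a 64-byte aligned scratch area of at least
	 * STACK_SIZE bytes below the saved registers; RFRAME keeps the
	 * original stack pointer for the epilogue.
	 */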

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

.balign 16
.Loop:
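	/*
	 * In the 64 rounds below, the IOP argument of each R1/R2 macro also
	 * issues one step of LOAD_W_VEC_* or SCHED_W_*, interleaving the NEON
	 * message expansion with the scalar round function for the benefit of
	 * in-order cores, as the macro comments above describe.
	 */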
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
	subs RNBLKS, RNBLKS, #1
	b.eq .Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _)
	ldp s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables. */
	eor ra, ra, s0
	eor rb, rb, s1
	ldp s0, s1, [RSTATE, #16]
	eor rc, rc, s2
	ldp k_even, k_odd, [RSTATE, #24]
	eor rd, rd, s3
	eor re, re, s0
	stp ra, rb, [RSTATE, #0]
	eor rf, rf, s1
	stp rc, rd, [RSTATE, #8]
	eor rg, rg, k_even
	stp re, rf, [RSTATE, #16]
	eor rh, rh, k_odd
	stp rg, rh, [RSTATE, #24]
	b .Loop

.Lend:
	/* Transform 61-63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _)
	ldp s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _)

	/* Update the chaining variables. */
	eor ra, ra, s0
	clear_vec(W0)
	eor rb, rb, s1
	clear_vec(W1)
	ldp s0, s1, [RSTATE, #16]
	clear_vec(W2)
	eor rc, rc, s2
	clear_vec(W3)
	ldp k_even, k_odd, [RSTATE, #24]
	clear_vec(W4)
	eor rd, rd, s3
	clear_vec(W5)
	eor re, re, s0
	clear_vec(XTMP0)
	stp ra, rb, [RSTATE, #0]
	clear_vec(XTMP1)
	eor rf, rf, s1
	clear_vec(XTMP2)
	stp rc, rd, [RSTATE, #8]
	clear_vec(XTMP3)
	eor rg, rg, k_even
	clear_vec(XTMP4)
	stp re, rf, [RSTATE, #16]
	clear_vec(XTMP5)
	eor rh, rh, k_odd
	clear_vec(XTMP6)
	stp rg, rh, [RSTATE, #24]

	/* Clear message expansion area */
	add addr0, sp, #STACK_W
	st1 {W0.16b-W3.16b}, [addr0], #64
	st1 {W0.16b-W3.16b}, [addr0], #64
	st1 {W0.16b-W3.16b}, [addr0]

	mov sp, RFRAME

	ldp x25, x26, [sp], #16
	ldp x23, x24, [sp], #16
	ldp x21, x22, [sp], #16
	ldp x19, x20, [sp], #16
	ldp x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)


.section ".rodata", "a"

.align 4
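/*
 * Round constant table: K[j] = rol32(T_j, j mod 32), with
 * T_j = 0x79cc4519 for j = 0..15 and T_j = 0x7a879d8a for j = 16..63,
 * stored as 64 words so that KL() can fetch each round's constant pair
 * with a single ldp instead of rotating at run time.
 */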
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5