/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

#define SM4_PREPARE() \
	adr_l	x5, crypto_sm4_sbox; \
	ld1	{v16.16b-v19.16b}, [x5], #64; \
	ld1	{v20.16b-v23.16b}, [x5], #64; \
	ld1	{v24.16b-v27.16b}, [x5], #64; \
	ld1	{v28.16b-v31.16b}, [x5];
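
/*
 * SM4_PREPARE() preloads the 256-byte crypto_sm4_sbox table into v16-v31
 * (16 registers x 16 bytes).  The ROUND macros then perform the byte-wise
 * S-box lookup with one tbl plus three tbx instructions over four 64-byte
 * segments, subtracting 64 from the indices in between: tbl zeroes lanes
 * whose index is out of range, tbx leaves them untouched, so exactly one
 * segment supplies each result byte.  Per byte, the idea is roughly
 * (reference sketch only; seg0-seg3 name the four 64-byte quarters of the
 * S-box and are illustrative):
 *
 *	u8 x = idx, r;
 *	r = (x < 64) ? seg0[x] : 0;		// tbl
 *	x -= 64; if (x < 64) r = seg1[x];	// tbx
 *	x -= 64; if (x < 64) r = seg2[x];	// tbx
 *	x -= 64; if (x < 64) r = seg3[x];	// tbx
 *	// r == crypto_sm4_sbox[idx]
 */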

#define transpose_4x4(s0, s1, s2, s3) \
	zip1	RTMP0.4s, s0.4s, s1.4s; \
	zip1	RTMP1.4s, s2.4s, s3.4s; \
	zip2	RTMP2.4s, s0.4s, s1.4s; \
	zip2	RTMP3.4s, s2.4s, s3.4s; \
	zip1	s0.2d, RTMP0.2d, RTMP1.2d; \
	zip2	s1.2d, RTMP0.2d, RTMP1.2d; \
	zip1	s2.2d, RTMP2.2d, RTMP3.2d; \
	zip2	s3.2d, RTMP2.2d, RTMP3.2d;
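
/*
 * transpose_4x4() treats s0-s3 as the rows of a 4x4 matrix of 32-bit words
 * and transposes it in place, using zip1/zip2 first at .4s and then at .2d
 * granularity:
 *
 *	in:  s0 = {a0 a1 a2 a3}    out: s0 = {a0 b0 c0 d0}
 *	     s1 = {b0 b1 b2 b3}         s1 = {a1 b1 c1 d1}
 *	     s2 = {c0 c1 c2 c3}         s2 = {a2 b2 c2 d2}
 *	     s3 = {d0 d1 d2 d3}         s3 = {a3 b3 c3 d3}
 *
 * With four loaded blocks a-d this turns "one block per register" into
 * "one state-word slot per register", the layout the ROUND macros work on.
 */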

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1	RTMP0.4s, s0.4s, s1.4s; \
	zip1	RTMP1.4s, s2.4s, s3.4s; \
	zip2	RTMP2.4s, s0.4s, s1.4s; \
	zip2	RTMP3.4s, s2.4s, s3.4s; \
	zip1	RTMP4.4s, s4.4s, s5.4s; \
	zip1	RTMP5.4s, s6.4s, s7.4s; \
	zip2	RTMP6.4s, s4.4s, s5.4s; \
	zip2	RTMP7.4s, s6.4s, s7.4s; \
	zip1	s0.2d, RTMP0.2d, RTMP1.2d; \
	zip2	s1.2d, RTMP0.2d, RTMP1.2d; \
	zip1	s2.2d, RTMP2.2d, RTMP3.2d; \
	zip2	s3.2d, RTMP2.2d, RTMP3.2d; \
	zip1	s4.2d, RTMP4.2d, RTMP5.2d; \
	zip2	s5.2d, RTMP4.2d, RTMP5.2d; \
	zip1	s6.2d, RTMP6.2d, RTMP7.2d; \
	zip2	s7.2d, RTMP6.2d, RTMP7.2d;

#define rotate_clockwise_4x4(s0, s1, s2, s3) \
	zip1	RTMP0.4s, s1.4s, s0.4s; \
	zip2	RTMP1.4s, s1.4s, s0.4s; \
	zip1	RTMP2.4s, s3.4s, s2.4s; \
	zip2	RTMP3.4s, s3.4s, s2.4s; \
	zip1	s0.2d, RTMP2.2d, RTMP0.2d; \
	zip2	s1.2d, RTMP2.2d, RTMP0.2d; \
	zip1	s2.2d, RTMP3.2d, RTMP1.2d; \
	zip2	s3.2d, RTMP3.2d, RTMP1.2d;
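
/*
 * rotate_clockwise_4x4() is the rearrangement used on the way out: it
 * transposes back to one block per register while also reversing the word
 * order within each block,
 *
 *	in:  s0 = {a0 a1 a2 a3}    out: s0 = {d0 c0 b0 a0}
 *	     s1 = {b0 b1 b2 b3}         s1 = {d1 c1 b1 a1}
 *	     s2 = {c0 c1 c2 c3}         s2 = {d2 c2 b2 a2}
 *	     s3 = {d0 d1 d2 d3}         s3 = {d3 c3 b3 a3}
 *
 * which gives exactly SM4's reversed output word order (X35, X34, X33, X32)
 * when s0-s3 hold the last four state-word slots after the 32 rounds.
 */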

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1	RTMP0.4s, s1.4s, s0.4s; \
	zip1	RTMP2.4s, s3.4s, s2.4s; \
	zip2	RTMP1.4s, s1.4s, s0.4s; \
	zip2	RTMP3.4s, s3.4s, s2.4s; \
	zip1	RTMP4.4s, s5.4s, s4.4s; \
	zip1	RTMP6.4s, s7.4s, s6.4s; \
	zip2	RTMP5.4s, s5.4s, s4.4s; \
	zip2	RTMP7.4s, s7.4s, s6.4s; \
	zip1	s0.2d, RTMP2.2d, RTMP0.2d; \
	zip2	s1.2d, RTMP2.2d, RTMP0.2d; \
	zip1	s2.2d, RTMP3.2d, RTMP1.2d; \
	zip2	s3.2d, RTMP3.2d, RTMP1.2d; \
	zip1	s4.2d, RTMP6.2d, RTMP4.2d; \
	zip2	s5.2d, RTMP6.2d, RTMP4.2d; \
	zip1	s6.2d, RTMP7.2d, RTMP5.2d; \
	zip2	s7.2d, RTMP7.2d, RTMP5.2d;

#define ROUND4(round, s0, s1, s2, s3) \
	dup	RX0.4s, RKEY.s[round]; \
	/* rk ^ s1 ^ s2 ^ s3 */ \
	eor	RTMP1.16b, s2.16b, s3.16b; \
	eor	RX0.16b, RX0.16b, s1.16b; \
	eor	RX0.16b, RX0.16b, RTMP1.16b; \
	\
	/* sbox, non-linear part */ \
	movi	RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
	sub	RX0.16b, RX0.16b, RTMP3.16b; \
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
	sub	RX0.16b, RX0.16b, RTMP3.16b; \
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
	sub	RX0.16b, RX0.16b, RTMP3.16b; \
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
	\
	/* linear part */ \
	shl	RTMP1.4s, RTMP0.4s, #8; \
	shl	RTMP2.4s, RTMP0.4s, #16; \
	shl	RTMP3.4s, RTMP0.4s, #24; \
	sri	RTMP1.4s, RTMP0.4s, #(32-8); \
	sri	RTMP2.4s, RTMP0.4s, #(32-16); \
	sri	RTMP3.4s, RTMP0.4s, #(32-24); \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
	eor	RTMP1.16b, RTMP1.16b, RTMP0.16b; \
	eor	RTMP1.16b, RTMP1.16b, RTMP2.16b; \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
	eor	RTMP3.16b, RTMP3.16b, RTMP0.16b; \
	shl	RTMP2.4s, RTMP1.4s, #2; \
	sri	RTMP2.4s, RTMP1.4s, #(32-2); \
	eor	RTMP3.16b, RTMP3.16b, RTMP2.16b; \
	/* s0 ^= RTMP3 */ \
	eor	s0.16b, s0.16b, RTMP3.16b;
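
/*
 * ROUND4() is one SM4 round applied to four blocks in parallel, one 32-bit
 * state word per .4s lane:  s0 ^= T(s1 ^ s2 ^ s3 ^ rk), where T is the
 * byte-wise S-box followed by the linear transform
 * L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24).
 * The code evaluates L in the equivalent factored form
 * x ^ rol32(x, 24) ^ rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2).
 * A plain C sketch of one round on one block (reference only, not part of
 * the build; sbox32() is an illustrative helper applying the S-box to each
 * byte of its argument):
 *
 *	u32 x = rk ^ s1 ^ s2 ^ s3;
 *	x = sbox32(x);
 *	x ^= rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *	s0 ^= x;
 */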

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
	mov	x6, 8; \
4: \
	ld1	{RKEY.4s}, [x0], #16; \
	subs	x6, x6, #1; \
	\
	ROUND4(0, b0, b1, b2, b3); \
	ROUND4(1, b1, b2, b3, b0); \
	ROUND4(2, b2, b3, b0, b1); \
	ROUND4(3, b3, b0, b1, b2); \
	\
	bne	4b; \
	\
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	\
	rotate_clockwise_4x4(b0, b1, b2, b3); \
	\
	/* rewind x0 back to the start of the round keys */ \
	sub	x0, x0, #128;
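
/*
 * SM4_CRYPT_BLK4_BE() runs the full 32 rounds as 8 loop iterations of four
 * ROUND4() calls with a rotating register assignment, consuming one 16-byte
 * group of round keys per iteration (8 x 16 = 128 bytes, rewound at the
 * end).  The trailing rev32 converts the native-endian state words back to
 * big-endian bytes, and rotate_clockwise_4x4() regathers one block per
 * register in the reversed word order required by SM4's final transform.
 * Per block this is roughly (reference sketch only, not part of the build):
 *
 *	for (i = 0; i < 32; i += 4) {
 *		x0 ^= T(x1 ^ x2 ^ x3 ^ rk[i + 0]);
 *		x1 ^= T(x2 ^ x3 ^ x0 ^ rk[i + 1]);
 *		x2 ^= T(x3 ^ x0 ^ x1 ^ rk[i + 2]);
 *		x3 ^= T(x0 ^ x1 ^ x2 ^ rk[i + 3]);
 *	}
 *	// output words are (x3, x2, x1, x0), i.e. reversed
 */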

#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \
	/* rk ^ s1 ^ s2 ^ s3 */ \
	dup	RX0.4s, RKEY.s[round]; \
	eor	RTMP0.16b, s2.16b, s3.16b; \
	mov	RX1.16b, RX0.16b; \
	eor	RTMP1.16b, t2.16b, t3.16b; \
	eor	RX0.16b, RX0.16b, s1.16b; \
	eor	RX1.16b, RX1.16b, t1.16b; \
	eor	RX0.16b, RX0.16b, RTMP0.16b; \
	eor	RX1.16b, RX1.16b, RTMP1.16b; \
	\
	/* sbox, non-linear part */ \
	movi	RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
	tbl	RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
	sub	RX0.16b, RX0.16b, RTMP3.16b; \
	sub	RX1.16b, RX1.16b, RTMP3.16b; \
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
	tbx	RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
	sub	RX0.16b, RX0.16b, RTMP3.16b; \
	sub	RX1.16b, RX1.16b, RTMP3.16b; \
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
	tbx	RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
	sub	RX0.16b, RX0.16b, RTMP3.16b; \
	sub	RX1.16b, RX1.16b, RTMP3.16b; \
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
	tbx	RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
	\
	/* linear part */ \
	shl	RX0.4s, RTMP0.4s, #8; \
	shl	RX1.4s, RTMP1.4s, #8; \
	shl	RTMP2.4s, RTMP0.4s, #16; \
	shl	RTMP3.4s, RTMP1.4s, #16; \
	sri	RX0.4s, RTMP0.4s, #(32 - 8); \
	sri	RX1.4s, RTMP1.4s, #(32 - 8); \
	sri	RTMP2.4s, RTMP0.4s, #(32 - 16); \
	sri	RTMP3.4s, RTMP1.4s, #(32 - 16); \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
	eor	RX0.16b, RX0.16b, RTMP0.16b; \
	eor	RX1.16b, RX1.16b, RTMP1.16b; \
	eor	RX0.16b, RX0.16b, RTMP2.16b; \
	eor	RX1.16b, RX1.16b, RTMP3.16b; \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
	shl	RTMP2.4s, RTMP0.4s, #24; \
	shl	RTMP3.4s, RTMP1.4s, #24; \
	sri	RTMP2.4s, RTMP0.4s, #(32 - 24); \
	sri	RTMP3.4s, RTMP1.4s, #(32 - 24); \
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b; \
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b; \
	shl	RTMP2.4s, RX0.4s, #2; \
	shl	RTMP3.4s, RX1.4s, #2; \
	sri	RTMP2.4s, RX0.4s, #(32 - 2); \
	sri	RTMP3.4s, RX1.4s, #(32 - 2); \
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b; \
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b; \
	/* s0/t0 ^= RTMP0/1 */ \
	eor	s0.16b, s0.16b, RTMP0.16b; \
	eor	t0.16b, t0.16b, RTMP1.16b;
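
/*
 * ROUND8() is the same round as ROUND4(), applied to two independent groups
 * of four blocks (s0-s3 and t0-t3) with every step interleaved, so the
 * second group's instructions can issue while the first group's results are
 * still in flight.  This helps hide tbl/tbx and shift latencies; the
 * dataflow per group is identical to the C sketch given after ROUND4().
 */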

#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	rev32	b4.16b, b4.16b; \
	rev32	b5.16b, b5.16b; \
	rev32	b6.16b, b6.16b; \
	rev32	b7.16b, b7.16b; \
	\
	mov	x6, 8; \
8: \
	ld1	{RKEY.4s}, [x0], #16; \
	subs	x6, x6, #1; \
	\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \
	\
	bne	8b; \
	\
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	rev32	b4.16b, b4.16b; \
	rev32	b5.16b, b5.16b; \
	rev32	b6.16b, b6.16b; \
	rev32	b7.16b, b7.16b; \
	\
	/* rewind x0 back to the start of the round keys */ \
	sub	x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7); \


.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub	w3, w3, #8
	tbnz	w3, #31, .Lcrypt_4x

	ld4	{v0.4s-v3.4s}, [x2], #64
	ld4	{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w3, .Lcrypt_end
	b	.Lcrypt_loop_8x

.Lcrypt_4x:
	add	w3, w3, #8
	cmp	w3, #4
	blt	.Lcrypt_tail

	sub	w3, w3, #4

	ld4	{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w3, .Lcrypt_end

.Lcrypt_tail:
	cmp	w3, #2
	ld1	{v0.16b}, [x2], #16
	blt	.Lcrypt_tail_load_done
	ld1	{v1.16b}, [x2], #16
	beq	.Lcrypt_tail_load_done
	ld1	{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp	w3, #2
	st1	{v0.16b}, [x1], #16
	blt	.Lcrypt_end
	st1	{v1.16b}, [x1], #16
	beq	.Lcrypt_end
	st1	{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)
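
/*
 * sm4_neon_crypt() processes nblocks of 16 bytes; encryption and decryption
 * differ only in which round key array is passed in x0.  The main loop
 * handles 8 blocks per iteration (ld4 de-interleaves them straight into the
 * word-sliced layout), a secondary path handles a remaining group of 4, and
 * the tail path loads the last 1-3 blocks, runs a 4-block crypt with the
 * unused lanes as don't-care values, and stores only the valid outputs.
 * Roughly (reference sketch only):
 *
 *	while (n >= 8) { crypt_8_blocks(); n -= 8; }
 *	if (n >= 4)    { crypt_4_blocks(); n -= 4; }
 *	if (n > 0)     { load n blocks; crypt_4_blocks(); store n blocks; }
 */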

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1	{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lcbc_dec_4x

	ld4	{v0.4s-v3.4s}, [x2], #64
	ld4	{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Rotate in two 4x4 steps: the 2x variant would use RTMP4-RTMP7,
	 * which overlap RIV (v15). */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub	x2, x2, #64

	eor	v0.16b, v0.16b, RIV.16b

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1	{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor	v1.16b, v1.16b, RTMP0.16b
	eor	v2.16b, v2.16b, RTMP1.16b
	eor	v3.16b, v3.16b, RTMP2.16b
	eor	v4.16b, v4.16b, RTMP3.16b
	eor	v5.16b, v5.16b, RTMP4.16b
	eor	v6.16b, v6.16b, RTMP5.16b
	eor	v7.16b, v7.16b, RTMP6.16b

	mov	RIV.16b, RTMP7.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcbc_dec_end
	b	.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lcbc_dec_tail

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v4.16b, v0.16b
	rev32	v5.16b, v1.16b
	rev32	v6.16b, v2.16b
	rev32	v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor	v4.16b, v4.16b, RIV.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v6.16b, v6.16b, v1.16b
	eor	v7.16b, v7.16b, v2.16b

	mov	RIV.16b, v3.16b

	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp	w4, #2
	ld1	{v0.16b}, [x2], #16
	blt	.Lcbc_dec_tail_load_done
	ld1	{v1.16b}, [x2], #16
	beq	.Lcbc_dec_tail_load_done
	ld1	{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32	v4.16b, v0.16b
	rev32	v5.16b, v1.16b
	rev32	v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp	w4, #2
	eor	v4.16b, v4.16b, RIV.16b
	mov	RIV.16b, v0.16b
	st1	{v4.16b}, [x1], #16
	blt	.Lcbc_dec_end

	eor	v5.16b, v5.16b, v0.16b
	mov	RIV.16b, v1.16b
	st1	{v5.16b}, [x1], #16
	beq	.Lcbc_dec_end

	eor	v6.16b, v6.16b, v1.16b
	mov	RIV.16b, v2.16b
	st1	{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)
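
/*
 * CBC decryption: each plaintext block is P[i] = D(C[i]) ^ C[i-1], with
 * C[-1] = IV.  RIV tracks the previous ciphertext block across iterations;
 * in the 8x path the source pointer is rewound by 64 bytes so the original
 * ciphertext can be reloaded for the chaining XORs, and the last ciphertext
 * block becomes the new IV written back to [x3].  A plain C reference
 * sketch (illustrative only; helper and buffer names are not the kernel
 * API):
 *
 *	const u8 *iv = ivbuf;		// 16-byte IV loaded from [x3]
 *	for (i = 0; i < nblocks; i++) {
 *		u8 tmp[16];
 *
 *		sm4_decrypt_block(rkey_dec, tmp, src + 16 * i);
 *		xor_blocks_16(dst + 16 * i, tmp, iv);
 *		iv = src + 16 * i;	// previous ciphertext block
 *	}
 *	memcpy(ivbuf, iv, 16);		// write back the new IV
 */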

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp	x7, x8, [x3]
	rev	x7, x7
	rev	x8, x8

.Lctr_crypt_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr) \
	mov	vctr.d[1], x8; \
	mov	vctr.d[0], x7; \
	adds	x8, x8, #1; \
	rev64	vctr.16b, vctr.16b; \
	adc	x7, x7, xzr;
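
/*
 * The 128-bit big-endian counter is kept in native order in x7 (high half)
 * and x8 (low half).  inc_le128() materializes the current counter value
 * back into big-endian byte order in vctr (mov into the two 64-bit lanes,
 * then rev64), and post-increments the counter with an adds/adc carry
 * chain.  Roughly, per block (reference sketch only, not part of the
 * build):
 *
 *	put_unaligned_be64(hi, &block[0]);	// hi = x7
 *	put_unaligned_be64(lo, &block[8]);	// lo = x8
 *	lo++;
 *	if (lo == 0)
 *		hi++;				// carry via adds/adc
 */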

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1	{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor	v0.16b, v0.16b, RTMP0.16b
	eor	v1.16b, v1.16b, RTMP1.16b
	eor	v2.16b, v2.16b, RTMP2.16b
	eor	v3.16b, v3.16b, RTMP3.16b
	eor	v4.16b, v4.16b, RTMP4.16b
	eor	v5.16b, v5.16b, RTMP5.16b
	eor	v6.16b, v6.16b, RTMP6.16b
	eor	v7.16b, v7.16b, RTMP7.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lctr_crypt_end
	b	.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lctr_crypt_tail

	sub	w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1	{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 clobbers the NZCV flags (adds/adc), so re-compare w4
	 * after each use. */
	ld1	{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp	w4, #2
	blt	.Lctr_crypt_tail_load_done

	ld1	{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp	w4, #2
	beq	.Lctr_crypt_tail_load_done

	ld1	{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp	w4, #2

	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x1], #16
	blt	.Lctr_crypt_end

	eor	v1.16b, v1.16b, v5.16b
	st1	{v1.16b}, [x1], #16
	beq	.Lctr_crypt_end

	eor	v2.16b, v2.16b, v6.16b
	st1	{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)
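
/*
 * CTR mode: the keystream for block i is the SM4 encryption of the
 * big-endian counter value ctr + i, and the output is input ^ keystream,
 * so the same routine serves for both encryption and decryption.  The
 * incremented counter is written back to [x3] at the end.  Reference
 * sketch (illustrative only; helper names are not the kernel API):
 *
 *	for (i = 0; i < nblocks; i++) {
 *		u8 ks[16];
 *
 *		put_be128(ks, hi, lo);		// current counter block
 *		sm4_encrypt_block(rkey_enc, ks, ks);
 *		xor_blocks_16(dst + 16 * i, src + 16 * i, ks);
 *		if (++lo == 0)
 *			hi++;
 *	}
 */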