GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/crypto/sm4-ce-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
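
/*
 * sm4e and sm4ekey are the SM4 instructions from the ARMv8 Crypto
 * Extensions.  They are emitted as raw .inst encodings so the file
 * assembles even with toolchains that lack the SM4 mnemonics; the
 * .irp block above defines the .Lv<n>.4s symbols used to splice the
 * vector register numbers into those encodings.
 */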

/* Register macros */

#define RTMP0 v16
#define RTMP1 v17
#define RTMP2 v18
#define RTMP3 v19

#define RIV v20
#define RMAC v20
#define RMASK v21


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 * x0: 128-bit key
	 * x1: rkey_enc
	 * x2: rkey_dec
	 * x3: fk array
	 * x4: ck array
	 */
	ld1 {v0.16b}, [x0];
	rev32 v0.16b, v0.16b;
	ld1 {v1.16b}, [x3];
	/* load ck */
	ld1 {v24.16b-v27.16b}, [x4], #64;
	ld1 {v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor v0.16b, v0.16b, v1.16b;

	sm4ekey v0.4s, v0.4s, v24.4s;
	sm4ekey v1.4s, v0.4s, v25.4s;
	sm4ekey v2.4s, v1.4s, v26.4s;
	sm4ekey v3.4s, v2.4s, v27.4s;
	sm4ekey v4.4s, v3.4s, v28.4s;
	sm4ekey v5.4s, v4.4s, v29.4s;
	sm4ekey v6.4s, v5.4s, v30.4s;
	sm4ekey v7.4s, v6.4s, v31.4s;

	adr_l x5, .Lbswap128_mask
	ld1 {v24.16b}, [x5]

	st1 {v0.16b-v3.16b}, [x1], #64;
	st1 {v4.16b-v7.16b}, [x1];
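
	/*
	 * The decryption round keys are the encryption round keys in
	 * reverse order: the tbl with .Lbswap128_mask reverses the four
	 * 32-bit words inside each vector, and the vectors are stored to
	 * rkey_dec from v7 down to v0.
	 */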
	tbl v16.16b, {v7.16b}, v24.16b
	tbl v17.16b, {v6.16b}, v24.16b
	tbl v18.16b, {v5.16b}, v24.16b
	tbl v19.16b, {v4.16b}, v24.16b
	tbl v20.16b, {v3.16b}, v24.16b
	tbl v21.16b, {v2.16b}, v24.16b
	tbl v22.16b, {v1.16b}, v24.16b
	tbl v23.16b, {v0.16b}, v24.16b

	st1 {v16.16b-v19.16b}, [x2], #64
	st1 {v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 */
	SM4_PREPARE(x0)

	ld1 {v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1 {v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * w3: nblocks
	 */
	SM4_PREPARE(x0)
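
	/*
	 * Bulk loop: handle 8 blocks per iteration while possible, then
	 * fall back to 4 blocks and finally to single blocks.  The
	 * "tbnz ..., #31" tests check whether the block counter went
	 * negative after the subtraction, i.e. fewer blocks remain.
	 */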
.Lcrypt_loop_blk:
	sub w3, w3, #8;
	tbnz w3, #31, .Lcrypt_tail8;

	ld1 {v0.16b-v3.16b}, [x2], #64;
	ld1 {v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1 {v0.16b-v3.16b}, [x1], #64;
	st1 {v4.16b-v7.16b}, [x1], #64;

	cbz w3, .Lcrypt_end;
	b .Lcrypt_loop_blk;

.Lcrypt_tail8:
	add w3, w3, #8;
	cmp w3, #4;
	blt .Lcrypt_tail4;

	sub w3, w3, #4;

	ld1 {v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1 {v0.16b-v3.16b}, [x1], #64;

	cbz w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub w3, w3, #1;

	ld1 {v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1 {v0.16b}, [x1], #16;

	cbnz w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]
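
	/*
	 * CBC encryption is inherently serial: each plaintext block is
	 * XORed with the previous ciphertext block before encryption, so
	 * even the 4x loop below encrypts one block at a time and only
	 * amortizes the loads and stores.
	 */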
.Lcbc_enc_loop_4x:
	cmp w4, #4
	blt .Lcbc_enc_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	eor v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1 {v0.16b-v3.16b}, [x1], #64
	mov RIV.16b, v3.16b

	cbz w4, .Lcbc_enc_end
	b .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	eor RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1 {RIV.16b}, [x1], #16

	cbnz w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]
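
	/*
	 * Unlike encryption, CBC decryption parallelizes: up to 8
	 * ciphertext blocks are decrypted at once and then XORed with the
	 * preceding ciphertext block (or the IV).  The rev32 copies keep
	 * the original ciphertext in v0-v7 for that XOR while the
	 * byte-swapped copies in v8-v15 go through SM4_CRYPT_BLK8_BE.
	 */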
.Lcbc_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lcbc_dec_4x

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64

	rev32 v8.16b, v0.16b
	rev32 v9.16b, v1.16b
	rev32 v10.16b, v2.16b
	rev32 v11.16b, v3.16b
	rev32 v12.16b, v4.16b
	rev32 v13.16b, v5.16b
	rev32 v14.16b, v6.16b
	rev32 v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor v8.16b, v8.16b, RIV.16b
	eor v9.16b, v9.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	eor v11.16b, v11.16b, v2.16b
	eor v12.16b, v12.16b, v3.16b
	eor v13.16b, v13.16b, v4.16b
	eor v14.16b, v14.16b, v5.16b
	eor v15.16b, v15.16b, v6.16b

	st1 {v8.16b-v11.16b}, [x1], #64
	st1 {v12.16b-v15.16b}, [x1], #64

	mov RIV.16b, v7.16b

	cbz w4, .Lcbc_dec_end
	b .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lcbc_dec_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	rev32 v8.16b, v0.16b
	rev32 v9.16b, v1.16b
	rev32 v10.16b, v2.16b
	rev32 v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor v8.16b, v8.16b, RIV.16b
	eor v9.16b, v9.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	eor v11.16b, v11.16b, v2.16b

	st1 {v8.16b-v11.16b}, [x1], #64

	mov RIV.16b, v3.16b

	cbz w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	rev32 v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor v8.16b, v8.16b, RIV.16b
	st1 {v8.16b}, [x1], #16

	mov RIV.16b, v0.16b

	cbnz w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub w5, w4, #16
	uxtw x5, w5
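	/* x5 = Ln = nbytes - 16, the length of the final partial block */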

	ld1 {RIV.16b}, [x3]

	ld1 {v0.16b}, [x2]
	eor RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]
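	/*
	 * v3/v4 are slices of .Lcts_permute_table: with tbl, 0xff indices
	 * yield zero bytes, so these masks select or zero-pad the bytes
	 * of the final partial block for ciphertext stealing.
	 */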

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl v1.16b, {v1.16b}, v4.16b

	eor v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */
	add x5, x1, x5
	st1 {v0.16b}, [x5]
	st1 {v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub w5, w4, #16
	uxtw x5, w5

	ld1 {RIV.16b}, [x3]

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	ld1 {v0.16b}, [x2], x5
	ld1 {v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl v2.16b, {v0.16b}, v3.16b
	eor v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: ctr (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp x7, x8, [x3]
	rev x7, x7
	rev x8, x8

.Lctr_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lctr_4x

#define inc_le128(vctr) \
	mov vctr.d[1], x8; \
	mov vctr.d[0], x7; \
	adds x8, x8, #1; \
	rev64 vctr.16b, vctr.16b; \
	adc x7, x7, xzr;
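
	/*
	 * inc_le128 materializes the current counter into a vector and
	 * post-increments it: x7:x8 hold the 128-bit counter as native
	 * integers (byte-reversed on entry/exit), adds/adc propagate the
	 * carry, and rev64 restores big-endian byte order in the vector.
	 */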

	/* construct CTRs */
	inc_le128(v0) /* +0 */
	inc_le128(v1) /* +1 */
	inc_le128(v2) /* +2 */
	inc_le128(v3) /* +3 */
	inc_le128(v4) /* +4 */
	inc_le128(v5) /* +5 */
	inc_le128(v6) /* +6 */
	inc_le128(v7) /* +7 */

	ld1 {v8.16b-v11.16b}, [x2], #64
	ld1 {v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lctr_end
	b .Lctr_loop_8x

.Lctr_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lctr_loop_1x

	sub w4, w4, #4

	/* construct CTRs */
	inc_le128(v0) /* +0 */
	inc_le128(v1) /* +1 */
	inc_le128(v2) /* +2 */
	inc_le128(v3) /* +3 */

	ld1 {v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	st1 {v0.16b-v3.16b}, [x1], #64

	cbz w4, .Lctr_end

.Lctr_loop_1x:
	sub w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1 {v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	cbnz w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev x7, x7
	rev x8, x8
	stp x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


#define tweak_next(vt, vin, RTMP) \
	sshr RTMP.2d, vin.2d, #63; \
	and RTMP.16b, RTMP.16b, RMASK.16b; \
	add vt.2d, vin.2d, vin.2d; \
	ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
	eor vt.16b, vt.16b, RTMP.16b;
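
/*
 * tweak_next computes the next XTS tweak: multiplication by x in
 * GF(2^128), i.e. a one-bit left shift of the 128-bit tweak with the
 * carry out of the top bit reduced back in via the polynomial
 * constant 0x87 held in RMASK.
 */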

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: tweak (big endian, 128 bit)
	 * w4: nbytes
	 * x5: round key array for IV
	 */
	ld1 {v8.16b}, [x3]

	cbz x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	ands w5, w4, #15
	lsr w4, w4, #4
	sub w6, w4, #1
	csel w4, w4, w6, eq
	uxtw x5, w5

	movi RMASK.2s, #0x1
	movi RTMP0.2s, #0x87
	uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
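	/* RMASK.2d = { 0x1, 0x87 }: the per-lane carry constants used by tweak_next */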

	cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lxts_enc_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b
	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz w4, .Lxts_enc_cts
	b .Lxts_enc_loop_8x

.Lxts_enc_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lxts_enc_loop_1x

	sub w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16
	eor v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1 {v0.16b}, [x2]
	eor v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx v0.16b, {v1.16b}, v4.16b

	eor v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	b .Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1 {v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: tweak (big endian, 128 bit)
	 * w4: nbytes
	 * x5: round key array for IV
	 */
	ld1 {v8.16b}, [x3]

	cbz x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands w5, w4, #15
	lsr w4, w4, #4
	sub w6, w4, #1
	csel w4, w4, w6, eq
	uxtw x5, w5

	movi RMASK.2s, #0x1
	movi RTMP0.2s, #0x87
	uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

	cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lxts_dec_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b
	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz w4, .Lxts_dec_cts
	b .Lxts_dec_loop_8x

.Lxts_dec_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lxts_dec_loop_1x

	sub w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16
	eor v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz x5, .Lxts_dec_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1 {v0.16b}, [x2]
	eor v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx v0.16b, {v1.16b}, v4.16b

	eor v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	b .Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1 {v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 * x0: round key array, CTX
	 * x1: digest
	 * x2: src
	 * w3: nblocks
	 * w4: enc_before
	 * w5: enc_after
	 */
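	/*
	 * The digest is updated CBC-MAC style: RMAC = SM4(RMAC ^ block)
	 * for each block.  If enc_before is set, the incoming digest is
	 * encrypted first; if enc_after is zero, the last block is only
	 * XORed into the digest and its encryption is left to the caller
	 * (e.g. for CMAC finalization).
	 */
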
	SM4_PREPARE(x0)

	ld1 {RMAC.16b}, [x1]

	cbz w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz w3, .Lmac_ret

	sub w6, w3, #1
	cmp w5, wzr
	csel w3, w3, w6, ne

	cbz w3, .Lmac_end

.Lmac_loop_4x:
	cmp w3, #4
	blt .Lmac_loop_1x

	sub w3, w3, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	eor RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz w3, .Lmac_end
	b .Lmac_loop_4x

.Lmac_loop_1x:
	sub w3, w3, #1

	ld1 {v0.16b}, [x2], #16

	eor RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz w3, .Lmac_loop_1x


.Lmac_end:
	cbnz w5, .Lmac_ret

	ld1 {v0.16b}, [x2], #16
	eor RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1 {RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


.section ".rodata", "a"
.align 4
.Lbswap128_mask:
	.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff