/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <[email protected]>
 * Copyright (C) 2022 Tianjia Zhang <[email protected]>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
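
/*
 * The .irp block above assigns a numeric .Lv<n>.4s symbol to each vector
 * register name used by the sm4e macro (v0-v3 for data, v24-v31 for the
 * round keys), and sm4e hand-encodes the SM4E instruction via .inst,
 * presumably so the file assembles even with toolchains whose assembler
 * lacks the SM4 mnemonics. SM4E performs four SM4 rounds using the four
 * round keys held in \vn, so eight invocations over v24-v31 cover the
 * full 32 rounds.
 */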

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH   v21
#define RRCONST v22
#define RZERO   v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1) \
        ext T0.16b, m1.16b, m1.16b, #8; \
        pmull r0.1q, m0.1d, m1.1d; \
        pmull T1.1q, m0.1d, T0.1d; \
        pmull2 T0.1q, m0.2d, T0.2d; \
        pmull2 r1.1q, m0.2d, m1.2d; \
        eor T0.16b, T0.16b, T1.16b; \
        ext T1.16b, RZERO.16b, T0.16b, #8; \
        ext T0.16b, T0.16b, RZERO.16b, #8; \
        eor r0.16b, r0.16b, T1.16b; \
        eor r1.16b, r1.16b, T0.16b;
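
/*
 * PMUL_128x128 is a schoolbook 128x128 -> 256 bit carryless multiply
 * built from four 64x64 pmull products, roughly:
 *
 *     lo  = m0.lo * m1.lo                      (pmull)
 *     hi  = m0.hi * m1.hi                      (pmull2)
 *     mid = m0.lo * m1.hi ^ m0.hi * m1.lo      (pmull/pmull2 on swapped m1)
 *     r0  = lo ^ (mid << 64)
 *     r1  = hi ^ (mid >> 64)
 *
 * where * is multiplication in GF(2)[x] (carryless).
 */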

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1, \
                        r2, r3, m2, m3, T2, T3, \
                        r4, r5, m4, m5, T4, T5, \
                        r6, r7, m6, m7, T6, T7) \
        ext T0.16b, m1.16b, m1.16b, #8; \
        ext T2.16b, m3.16b, m3.16b, #8; \
        ext T4.16b, m5.16b, m5.16b, #8; \
        ext T6.16b, m7.16b, m7.16b, #8; \
        pmull r0.1q, m0.1d, m1.1d; \
        pmull r2.1q, m2.1d, m3.1d; \
        pmull r4.1q, m4.1d, m5.1d; \
        pmull r6.1q, m6.1d, m7.1d; \
        pmull T1.1q, m0.1d, T0.1d; \
        pmull T3.1q, m2.1d, T2.1d; \
        pmull T5.1q, m4.1d, T4.1d; \
        pmull T7.1q, m6.1d, T6.1d; \
        pmull2 T0.1q, m0.2d, T0.2d; \
        pmull2 T2.1q, m2.2d, T2.2d; \
        pmull2 T4.1q, m4.2d, T4.2d; \
        pmull2 T6.1q, m6.2d, T6.2d; \
        pmull2 r1.1q, m0.2d, m1.2d; \
        pmull2 r3.1q, m2.2d, m3.2d; \
        pmull2 r5.1q, m4.2d, m5.2d; \
        pmull2 r7.1q, m6.2d, m7.2d; \
        eor T0.16b, T0.16b, T1.16b; \
        eor T2.16b, T2.16b, T3.16b; \
        eor T4.16b, T4.16b, T5.16b; \
        eor T6.16b, T6.16b, T7.16b; \
        ext T1.16b, RZERO.16b, T0.16b, #8; \
        ext T3.16b, RZERO.16b, T2.16b, #8; \
        ext T5.16b, RZERO.16b, T4.16b, #8; \
        ext T7.16b, RZERO.16b, T6.16b, #8; \
        ext T0.16b, T0.16b, RZERO.16b, #8; \
        ext T2.16b, T2.16b, RZERO.16b, #8; \
        ext T4.16b, T4.16b, RZERO.16b, #8; \
        ext T6.16b, T6.16b, RZERO.16b, #8; \
        eor r0.16b, r0.16b, T1.16b; \
        eor r2.16b, r2.16b, T3.16b; \
        eor r4.16b, r4.16b, T5.16b; \
        eor r6.16b, r6.16b, T7.16b; \
        eor r1.16b, r1.16b, T0.16b; \
        eor r3.16b, r3.16b, T2.16b; \
        eor r5.16b, r5.16b, T4.16b; \
        eor r7.16b, r7.16b, T6.16b;
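
/*
 * PMUL_128x128_4x is four independent copies of PMUL_128x128 with their
 * instructions interleaved, so the four multiply/xor chains can overlap
 * and hide the pmull latency.
 */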

/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1) \
        pmull2 T0.1q, r1.2d, rconst.2d; \
        ext T1.16b, T0.16b, RZERO.16b, #8; \
        ext T0.16b, RZERO.16b, T0.16b, #8; \
        eor r1.16b, r1.16b, T1.16b; \
        eor r0.16b, r0.16b, T0.16b; \
        pmull T0.1q, r1.1d, rconst.1d; \
        eor a.16b, r0.16b, T0.16b;
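
/*
 * REDUCTION folds the 256-bit carryless product r1:r0 back to 128 bits:
 * each 64-bit half of the upper part is multiplied by the reduction
 * constant 0x87 (the x^7 + x^2 + x + 1 tail of the GHASH polynomial
 * x^128 + x^7 + x^2 + x + 1, see .Lghash_rconst) and xored back in at
 * the matching offset. All values here are in the bit-reflected (rbit)
 * representation this file uses for GHASH.
 */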

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1) \
        rev32 b0.16b, b0.16b; \
        ext T0.16b, m1.16b, m1.16b, #8; \
        sm4e b0.4s, v24.4s; \
        pmull r0.1q, m0.1d, m1.1d; \
        sm4e b0.4s, v25.4s; \
        pmull T1.1q, m0.1d, T0.1d; \
        sm4e b0.4s, v26.4s; \
        pmull2 T0.1q, m0.2d, T0.2d; \
        sm4e b0.4s, v27.4s; \
        pmull2 r1.1q, m0.2d, m1.2d; \
        sm4e b0.4s, v28.4s; \
        eor T0.16b, T0.16b, T1.16b; \
        sm4e b0.4s, v29.4s; \
        ext T1.16b, RZERO.16b, T0.16b, #8; \
        sm4e b0.4s, v30.4s; \
        ext T0.16b, T0.16b, RZERO.16b, #8; \
        sm4e b0.4s, v31.4s; \
        eor r0.16b, r0.16b, T1.16b; \
        rev64 b0.4s, b0.4s; \
        eor r1.16b, r1.16b, T0.16b; \
        ext b0.16b, b0.16b, b0.16b, #8; \
        rev32 b0.16b, b0.16b;
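
/*
 * SM4_CRYPT_PMUL_128x128_BLK encrypts one block (b0) with SM4 while
 * computing one 128x128 carryless multiply (m0 * m1 -> r0:r1): the
 * eight sm4e rounds (v24-v31 hold the 32 round keys, four per register)
 * are interleaved with the pmull sequence so both computations overlap.
 * The trailing rev64/ext/rev32 is SM4's final word swap plus the
 * byte-order restore matching the rev32 on entry.
 */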

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2, \
                        r0, r1, m0, m1, T0, T1, \
                        r2, r3, m2, m3, T2, T3, \
                        r4, r5, m4, m5, T4, T5) \
        rev32 b0.16b, b0.16b; \
        rev32 b1.16b, b1.16b; \
        rev32 b2.16b, b2.16b; \
        ext T0.16b, m1.16b, m1.16b, #8; \
        ext T2.16b, m3.16b, m3.16b, #8; \
        ext T4.16b, m5.16b, m5.16b, #8; \
        sm4e b0.4s, v24.4s; \
        sm4e b1.4s, v24.4s; \
        sm4e b2.4s, v24.4s; \
        pmull r0.1q, m0.1d, m1.1d; \
        pmull r2.1q, m2.1d, m3.1d; \
        pmull r4.1q, m4.1d, m5.1d; \
        sm4e b0.4s, v25.4s; \
        sm4e b1.4s, v25.4s; \
        sm4e b2.4s, v25.4s; \
        pmull T1.1q, m0.1d, T0.1d; \
        pmull T3.1q, m2.1d, T2.1d; \
        pmull T5.1q, m4.1d, T4.1d; \
        sm4e b0.4s, v26.4s; \
        sm4e b1.4s, v26.4s; \
        sm4e b2.4s, v26.4s; \
        pmull2 T0.1q, m0.2d, T0.2d; \
        pmull2 T2.1q, m2.2d, T2.2d; \
        pmull2 T4.1q, m4.2d, T4.2d; \
        sm4e b0.4s, v27.4s; \
        sm4e b1.4s, v27.4s; \
        sm4e b2.4s, v27.4s; \
        pmull2 r1.1q, m0.2d, m1.2d; \
        pmull2 r3.1q, m2.2d, m3.2d; \
        pmull2 r5.1q, m4.2d, m5.2d; \
        sm4e b0.4s, v28.4s; \
        sm4e b1.4s, v28.4s; \
        sm4e b2.4s, v28.4s; \
        eor T0.16b, T0.16b, T1.16b; \
        eor T2.16b, T2.16b, T3.16b; \
        eor T4.16b, T4.16b, T5.16b; \
        sm4e b0.4s, v29.4s; \
        sm4e b1.4s, v29.4s; \
        sm4e b2.4s, v29.4s; \
        ext T1.16b, RZERO.16b, T0.16b, #8; \
        ext T3.16b, RZERO.16b, T2.16b, #8; \
        ext T5.16b, RZERO.16b, T4.16b, #8; \
        sm4e b0.4s, v30.4s; \
        sm4e b1.4s, v30.4s; \
        sm4e b2.4s, v30.4s; \
        ext T0.16b, T0.16b, RZERO.16b, #8; \
        ext T2.16b, T2.16b, RZERO.16b, #8; \
        ext T4.16b, T4.16b, RZERO.16b, #8; \
        sm4e b0.4s, v31.4s; \
        sm4e b1.4s, v31.4s; \
        sm4e b2.4s, v31.4s; \
        eor r0.16b, r0.16b, T1.16b; \
        eor r2.16b, r2.16b, T3.16b; \
        eor r4.16b, r4.16b, T5.16b; \
        rev64 b0.4s, b0.4s; \
        rev64 b1.4s, b1.4s; \
        rev64 b2.4s, b2.4s; \
        eor r1.16b, r1.16b, T0.16b; \
        eor r3.16b, r3.16b, T2.16b; \
        eor r5.16b, r5.16b, T4.16b; \
        ext b0.16b, b0.16b, b0.16b, #8; \
        ext b1.16b, b1.16b, b1.16b, #8; \
        ext b2.16b, b2.16b, b2.16b, #8; \
        eor r0.16b, r0.16b, r2.16b; \
        eor r1.16b, r1.16b, r3.16b; \
        rev32 b0.16b, b0.16b; \
        rev32 b1.16b, b1.16b; \
        rev32 b2.16b, b2.16b; \
        eor r0.16b, r0.16b, r4.16b; \
        eor r1.16b, r1.16b, r5.16b;
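
/*
 * Three-block variant used by the decrypt path: it runs SM4 on three
 * counter blocks and three GHASH multiplies at the same time, and xors
 * the three products together so only one REDUCTION is needed per
 * three blocks.
 */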

#define inc32_le128(vctr) \
        mov vctr.d[1], x9; \
        add w6, w9, #1; \
        mov vctr.d[0], x8; \
        bfi x9, x6, #0, #32; \
        rev64 vctr.16b, vctr.16b;
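
/*
 * inc32_le128 emits the current counter block and steps the counter:
 * vctr gets the big-endian block built from x8:x9 (rev64 restores the
 * byte order), and only the low 32 bits of x9 are incremented for the
 * next block, i.e. GCM's inc32, roughly:
 *
 *     vctr      = byteswap64x2(x8 : x9)    // block used for this call
 *     x9[31:0] += 1                        // counter for the next call
 */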

#define GTAG_HASH_LENGTHS(vctr0, vlen) \
        ld1 {vlen.16b}, [x7]; \
        /* construct CTR0 */ \
        /* the low 32 bits of the initial counter block are always be32(1) */ \
        mov x6, #0x1; \
        bfi x9, x6, #0, #32; \
        mov vctr0.d[0], x8; \
        mov vctr0.d[1], x9; \
        rbit vlen.16b, vlen.16b; \
        rev64 vctr0.16b, vctr0.16b; \
        /* authtag = GCTR(CTR0, GHASH) */ \
        eor RHASH.16b, RHASH.16b, vlen.16b; \
        SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
                                   RTMP0, RTMP1); \
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
        rbit RHASH.16b, RHASH.16b; \
        eor RHASH.16b, RHASH.16b, vctr0.16b;
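
/*
 * GTAG_HASH_LENGTHS produces the authentication tag: the 128-bit
 * len(A) || len(C) block at x7 is folded into GHASH, CTR0 is rebuilt
 * with its low 32 bits set to 1, and the tag is computed in one fused
 * SM4 + pmull pass as
 *
 *     tag = SM4-Enc(K, CTR0) ^ GHASH
 */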


/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1     v0
#define RR3     v1
#define RR5     v2
#define RR7     v3

#define RR0     v4
#define RR2     v5
#define RR4     v6
#define RR6     v7

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11
#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RH1     v16
#define RH2     v17
#define RH3     v18
#define RH4     v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
        /* input:
         * x0: round key array, CTX
         * x1: ghash table
         */
        SM4_PREPARE(x0)

        adr_l x2, .Lghash_rconst
        ld1r {RRCONST.2d}, [x2]

        eor RZERO.16b, RZERO.16b, RZERO.16b

        /* H = E(K, 0^128) */
        rev32 v0.16b, RZERO.16b
        SM4_CRYPT_BLK_BE(v0)

        /* H ^ 1 */
        rbit RH1.16b, v0.16b

        /* H ^ 2 */
        PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
        REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 3 */
        PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
        REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 4 */
        PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
        REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1 {RH1.16b-RH4.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
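
/*
 * The setup above derives the hash key H = SM4-Enc(K, 0^128),
 * bit-reflects it and stores H, H^2, H^3 and H^4 in the ghash table.
 * The precomputed powers let the bulk loops below aggregate several
 * blocks per reduction, e.g. for four blocks C0..C3:
 *
 *     Y' = ((Y ^ C0) * H^4) ^ (C1 * H^3) ^ (C2 * H^2) ^ (C3 * H)
 */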

.align 3
SYM_FUNC_START(pmull_ghash_update)
        /* input:
         * x0: ghash table
         * x1: ghash result
         * x2: src
         * w3: nblocks
         */
        ld1 {RH1.16b-RH4.16b}, [x0]

        ld1 {RHASH.16b}, [x1]
        rbit RHASH.16b, RHASH.16b

        adr_l x4, .Lghash_rconst
        ld1r {RRCONST.2d}, [x4]

        eor RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
        cmp w3, #4
        blt .Lghash_loop_1x

        sub w3, w3, #4

        ld1 {v0.16b-v3.16b}, [x2], #64

        rbit v0.16b, v0.16b
        rbit v1.16b, v1.16b
        rbit v2.16b, v2.16b
        rbit v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1) * H^3 => rr2:rr3
         * (in2) * H^2 => rr4:rr5
         * (in3) * H^1 => rr6:rr7
         */
        eor RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor RR0.16b, RR0.16b, RR2.16b
        eor RR1.16b, RR1.16b, RR3.16b
        eor RR0.16b, RR0.16b, RR4.16b
        eor RR1.16b, RR1.16b, RR5.16b
        eor RR0.16b, RR0.16b, RR6.16b
        eor RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz w3, .Lghash_end
        b .Lghash_loop_4x

.Lghash_loop_1x:
        sub w3, w3, #1

        ld1 {v0.16b}, [x2], #16
        rbit v0.16b, v0.16b
        eor RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbnz w3, .Lghash_loop_1x

.Lghash_end:
        rbit RHASH.16b, RHASH.16b
        st1 {RHASH.2d}, [x1]

        ret
SYM_FUNC_END(pmull_ghash_update)
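
/*
 * Note on bit order: GHASH defines its field elements with the bit
 * order reversed relative to what pmull multiplies naturally, so every
 * block and the hash key are bit-reflected with rbit on the way in and
 * the running hash is reflected back before it is stored; in the
 * reflected domain the multiply-and-fold above yields the GHASH product
 * directly.
 */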

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
        /* input:
         * x0: round key array, CTX
         * x1: dst
         * x2: src
         * x3: ctr (big endian, 128 bit)
         * w4: nbytes
         * x5: ghash result
         * x6: ghash table
         * x7: lengths (only for last block)
         */
        SM4_PREPARE(x0)

        ldp x8, x9, [x3]
        rev x8, x8
        rev x9, x9

        ld1 {RH1.16b-RH4.16b}, [x6]

        ld1 {RHASH.16b}, [x5]
        rbit RHASH.16b, RHASH.16b

        adr_l x6, .Lghash_rconst
        ld1r {RRCONST.2d}, [x6]

        eor RZERO.16b, RZERO.16b, RZERO.16b

        cbz w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
        cmp w4, #(4 * 16)
        blt .Lgcm_enc_loop_1x

        sub w4, w4, #(4 * 16)

        /* construct CTRs */
        inc32_le128(v0) /* +0 */
        inc32_le128(v1) /* +1 */
        inc32_le128(v2) /* +2 */
        inc32_le128(v3) /* +3 */

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor v0.16b, v0.16b, RTMP0.16b
        eor v1.16b, v1.16b, RTMP1.16b
        eor v2.16b, v2.16b, RTMP2.16b
        eor v3.16b, v3.16b, RTMP3.16b
        st1 {v0.16b-v3.16b}, [x1], #64

        /* ghash update */

        rbit v0.16b, v0.16b
        rbit v1.16b, v1.16b
        rbit v2.16b, v2.16b
        rbit v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1) * H^3 => rr2:rr3
         * (in2) * H^2 => rr4:rr5
         * (in3) * H^1 => rr6:rr7
         */
        eor RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor RR0.16b, RR0.16b, RR2.16b
        eor RR1.16b, RR1.16b, RR3.16b
        eor RR0.16b, RR0.16b, RR4.16b
        eor RR1.16b, RR1.16b, RR5.16b
        eor RR0.16b, RR0.16b, RR6.16b
        eor RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz w4, .Lgcm_enc_hash_len
        b .Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
        cmp w4, #16
        blt .Lgcm_enc_tail

        sub w4, w4, #16

        /* construct CTRs */
        inc32_le128(v0)

        ld1 {RTMP0.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor v0.16b, v0.16b, RTMP0.16b
        st1 {v0.16b}, [x1], #16

        /* ghash update */
        rbit v0.16b, v0.16b
        eor RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbz w4, .Lgcm_enc_hash_len
        b .Lgcm_enc_loop_1x

.Lgcm_enc_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l x0, .Lcts_permute_table
        add x0, x0, #32
        sub x0, x0, w4, uxtw
        ld1 {v3.16b}, [x0]

.Lgcm_enc_tail_loop:
        /* do encrypt */
        ldrb w0, [x2], #1       /* get 1 byte from input */
        umov w6, v0.b[0]        /* get next keystream byte */
        eor w6, w6, w0          /* w6 = keystream ^ input */
        strb w6, [x1], #1       /* store out byte */

        /* shift out the used keystream byte */
        ext v0.16b, v0.16b, v0.16b, #1
        /* collect the ciphertext bytes in the high end of v0 */
        ins v0.b[15], w6

        subs w4, w4, #1
        bne .Lgcm_enc_tail_loop

        /* pad the last block with zeros */
        tbl v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit v0.16b, v0.16b
        eor RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
        cbz x7, .Lgcm_enc_end

        GTAG_HASH_LENGTHS(v1, v3)

        b .Lgcm_enc_ret

.Lgcm_enc_end:
        /* store new CTR */
        rev x8, x8
        rev x9, x9
        stp x8, x9, [x3]

        rbit RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
        /* store new MAC */
        st1 {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1     v6
#define RR3     v7
#define RR5     v8

#define RR0     v9
#define RR2     v10
#define RR4     v11

#define RTMP0   v12
#define RTMP1   v13
#define RTMP2   v14
#define RTMP3   v15
#define RTMP4   v16
#define RTMP5   v17

#define RH1     v18
#define RH2     v19
#define RH3     v20

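/*
 * In GCM decryption the GHASH input is the ciphertext, which is already
 * available before the CTR pass, so the bulk loop below fuses the SM4
 * counter encryption with the GHASH multiplies
 * (SM4_CRYPT_PMUL_128x128_BLK3, three blocks at a time), while the
 * encrypt path above must produce the ciphertext first and hash it
 * afterwards. Only H^1..H^3 are needed here.
 */
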
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
        /* input:
         * x0: round key array, CTX
         * x1: dst
         * x2: src
         * x3: ctr (big endian, 128 bit)
         * w4: nbytes
         * x5: ghash result
         * x6: ghash table
         * x7: lengths (only for last block)
         */
        SM4_PREPARE(x0)

        ldp x8, x9, [x3]
        rev x8, x8
        rev x9, x9

        ld1 {RH1.16b-RH3.16b}, [x6]

        ld1 {RHASH.16b}, [x5]
        rbit RHASH.16b, RHASH.16b

        adr_l x6, .Lghash_rconst
        ld1r {RRCONST.2d}, [x6]

        eor RZERO.16b, RZERO.16b, RZERO.16b

        cbz w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
        cmp w4, #(3 * 16)
        blt .Lgcm_dec_loop_1x

        sub w4, w4, #(3 * 16)

        ld1 {v3.16b-v5.16b}, [x2], #(3 * 16)

        /* construct CTRs */
        inc32_le128(v0) /* +0 */
        rbit v6.16b, v3.16b
        inc32_le128(v1) /* +1 */
        rbit v7.16b, v4.16b
        inc32_le128(v2) /* +2 */
        rbit v8.16b, v5.16b

        eor RHASH.16b, RHASH.16b, v6.16b

        /* decrypt & ghash update */
        SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
                                    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
                                    RR2, RR3, v7, RH2, RTMP2, RTMP3,
                                    RR4, RR5, v8, RH1, RTMP4, RTMP5)

        eor v0.16b, v0.16b, v3.16b
        eor v1.16b, v1.16b, v4.16b
        eor v2.16b, v2.16b, v5.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        st1 {v0.16b-v2.16b}, [x1], #(3 * 16)

        cbz w4, .Lgcm_dec_hash_len
        b .Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
        cmp w4, #16
        blt .Lgcm_dec_tail

        sub w4, w4, #16

        ld1 {v3.16b}, [x2], #16

        /* construct CTRs */
        inc32_le128(v0)
        rbit v6.16b, v3.16b

        eor RHASH.16b, RHASH.16b, v6.16b

        SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

        eor v0.16b, v0.16b, v3.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1 {v0.16b}, [x1], #16

        cbz w4, .Lgcm_dec_hash_len
        b .Lgcm_dec_loop_1x

.Lgcm_dec_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l x0, .Lcts_permute_table
        add x0, x0, #32
        sub x0, x0, w4, uxtw
        ld1 {v3.16b}, [x0]

.Lgcm_dec_tail_loop:
        /* do decrypt */
        ldrb w0, [x2], #1       /* get 1 byte from input */
        umov w6, v0.b[0]        /* get next keystream byte */
        eor w6, w6, w0          /* w6 = keystream ^ input */
        strb w6, [x1], #1       /* store out byte */

        /* shift out the used keystream byte */
        ext v0.16b, v0.16b, v0.16b, #1
        /* collect the ciphertext bytes in the high end of v0 */
        ins v0.b[15], w0

        subs w4, w4, #1
        bne .Lgcm_dec_tail_loop

        /* pad the last block with zeros */
        tbl v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit v0.16b, v0.16b
        eor RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
        cbz x7, .Lgcm_dec_end

        GTAG_HASH_LENGTHS(v1, v3)

        b .Lgcm_dec_ret

.Lgcm_dec_end:
        /* store new CTR */
        rev x8, x8
        rev x9, x9
        stp x8, x9, [x3]

        rbit RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
        /* store new MAC */
        st1 {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

.section ".rodata", "a"
.align 4
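/*
 * Permute table for the partial final block: loaded at offset
 * (32 - remaining bytes), it lets tbl move the tail bytes collected in
 * the high end of v0 down to the low bytes and zero the rest (0xff
 * indices select zero), producing the zero-padded block fed to GHASH.
 */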
.Lcts_permute_table:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

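/*
 * GHASH reduction constant: 0x87 encodes x^7 + x^2 + x + 1, the
 * low-order terms of the field polynomial x^128 + x^7 + x^2 + x + 1.
 * ld1r replicates it into both 64-bit lanes of RRCONST.
 */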
.Lghash_rconst:
        .quad 0x87