GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <[email protected]>
 * Copyright (C) 2020 Jussi Kivilinna <[email protected]>
 * Copyright (c) 2021 Tianjia Zhang <[email protected]>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 * https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP	(%rip)

/* vector registers */
#define RX0	%ymm0
#define RX1	%ymm1
#define MASK_4BIT	%ymm2
#define RTMP0	%ymm3
#define RTMP1	%ymm4
#define RTMP2	%ymm5
#define RTMP3	%ymm6
#define RTMP4	%ymm7

#define RA0	%ymm8
#define RA1	%ymm9
#define RA2	%ymm10
#define RA3	%ymm11

#define RB0	%ymm12
#define RB1	%ymm13
#define RB2	%ymm14
#define RB3	%ymm15

#define RNOT	%ymm0
#define RBSWAP	%ymm1

#define RX0x	%xmm0
#define RX1x	%xmm1
#define MASK_4BITx	%xmm2

#define RNOTx	%xmm0
#define RBSWAPx	%xmm1

#define RTMP0x	%xmm3
#define RTMP1x	%xmm4
#define RTMP2x	%xmm5
#define RTMP3x	%xmm6
#define RTMP4x	%xmm7


/* helper macros */

/* Transpose four 32-bit words between 128-bit vector lanes. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * 'vaesenclast' instruction. */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0; \
	vpsrld $4, x, x; \
	vpand x, mask4bit, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
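
/*
 * Both transforms above evaluate a byte-wise affine map as two 4-bit
 * vpshufb table lookups: the low nibble of each byte indexes lo_t, the
 * high nibble (shifted down) indexes hi_t, and the two results are XORed
 * together.
 */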
95
96
97
.section .rodata.cst16, "aM", @progbits, 16
98
.align 16
99
100
/*
101
* Following four affine transform look-up tables are from work by
102
* Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
103
*
104
* These allow exposing SM4 S-Box from AES SubByte.
105
*/
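
/*
 * The idea (from sm4ni): both the SM4 and AES S-boxes are built around
 * inversion in GF(2^8), so the SM4 S-box can be computed as an affine
 * transform, followed by the AES S-box, followed by another affine
 * transform. The pre_tf tables map into the AES field before AESENCLAST,
 * the post_tf tables map the result back, and the inverse-ShiftRows
 * shuffles below undo the ShiftRows step that AESENCLAST also performs.
 */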

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef

.text
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *		plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *		ciphertext blocks
	 */
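	/*
	 * Layout: each 256-bit register initially holds two 128-bit blocks.
	 * After the byte-swap and 4x4 word transpose below, RAi holds state
	 * word i of eight blocks and RBi holds state word i of the other
	 * eight, so each ROUND advances all sixteen blocks in parallel.
	 */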
	FRAME_BEGIN

	vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

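/*
 * One SM4 round, applied to both groups of eight blocks at once: compute
 * x = sbox(s1 ^ s2 ^ s3 ^ rk) using AESENCLAST plus the affine transforms
 * above, then s0 ^= x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24)
 * (rotations on 32-bit words). The r registers get the same treatment
 * with their own state words.
 */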
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
	vpbroadcastd (4*(round))(%rdi), RX0; \
	vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \
	vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \
	vmovdqa RX0, RX1; \
	vpxor s1, RX0, RX0; \
	vpxor s2, RX0, RX0; \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
	vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \
	vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \
	vpxor r1, RX1, RX1; \
	vpxor r2, RX1, RX1; \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
	\
	/* sbox, non-linear part */ \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	vextracti128 $1, RX0, RTMP4x; \
	vextracti128 $1, RX1, RTMP0x; \
	vaesenclast MASK_4BITx, RX0x, RX0x; \
	vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \
	vaesenclast MASK_4BITx, RX1x, RX1x; \
	vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \
	vinserti128 $1, RTMP4x, RX0, RX0; \
	vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \
	vinserti128 $1, RTMP0x, RX1, RX1; \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	\
	/* linear part */ \
	vpshufb RTMP4, RX0, RTMP0; \
	vpxor RTMP0, s0, s0; /* s0 ^ x */ \
	vpshufb RTMP4, RX1, RTMP2; \
	vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
	vpxor RTMP2, r0, r0; /* r0 ^ x */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
	vpshufb RTMP4, RX1, RTMP3; \
	vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX1, RTMP3; \
	vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX0, RTMP1; \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1; \
	vpsrld $30, RTMP0, RTMP0; \
	vpxor RTMP0, s0, s0; \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0; \
	vpshufb RTMP4, RX1, RTMP3; \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3; \
	vpsrld $30, RTMP2, RTMP2; \
	vpxor RTMP2, r0, r0; \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP3, r0, r0;

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk16)

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
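
/*
 * inc_le128 adds 1 to a 128-bit little-endian counter in each lane:
 * vpsubq of minus_one ({-1, 0} per lane) adds 1 to the low qword;
 * vpcmpeqq sets tmp's low qword to all-ones only when that qword was
 * about to wrap, and after vpslldq the final vpsubq propagates the carry
 * into the high qword.
 */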

/*
 * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	movq 8(%rcx), %rax;
	bswapq %rax;

	vzeroupper;

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), RTMP4x;
	vpshufb RTMP3x, RTMP4x, RTMP4x;
	vmovdqa RTMP4x, RTMP0x;
	inc_le128(RTMP4x, RNOTx, RTMP1x);
	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
	vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 16), %rax;
	ja .Lhandle_ctr_carry;

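	/*
	 * Fast path: no 64-bit carry can occur for the next 16 counters, so
	 * each vpsubq of RTMP2 ({-2, 0} per lane) below advances the low
	 * qword of both lane counters by 2.
	 */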
	/* construct IVs */
	vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
	vpshufb RTMP3, RTMP0, RA1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
	vpshufb RTMP3, RTMP0, RA2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
	vpshufb RTMP3, RTMP0, RA3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
	vpshufb RTMP3, RTMP0, RB0;
	vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
	vpshufb RTMP3, RTMP0, RB1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
	vpshufb RTMP3, RTMP0, RB2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
	vpshufb RTMP3, RTMP0, RB3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
	vpshufb RTMP3x, RTMP0x, RTMP0x;

	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
	inc_le128(RTMP0, RNOT, RTMP1);
	vextracti128 $1, RTMP0, RTMP0x;
	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */

.align 4
.Lctr_carry_done:
	/* store new IV */
	vmovdqu RTMP0x, (%rcx);

	call __sm4_crypt_blk16;

	vpxor (0 * 32)(%rdx), RA0, RA0;
	vpxor (1 * 32)(%rdx), RA1, RA1;
	vpxor (2 * 32)(%rdx), RA2, RA2;
	vpxor (3 * 32)(%rdx), RA3, RA3;
	vpxor (4 * 32)(%rdx), RB0, RB0;
	vpxor (5 * 32)(%rdx), RB1, RB1;
	vpxor (6 * 32)(%rdx), RB2, RB2;
	vpxor (7 * 32)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)

/*
 * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vzeroupper;

	vmovdqu (0 * 32)(%rdx), RA0;
	vmovdqu (1 * 32)(%rdx), RA1;
	vmovdqu (2 * 32)(%rdx), RA2;
	vmovdqu (3 * 32)(%rdx), RA3;
	vmovdqu (4 * 32)(%rdx), RB0;
	vmovdqu (5 * 32)(%rdx), RB1;
	vmovdqu (6 * 32)(%rdx), RB2;
	vmovdqu (7 * 32)(%rdx), RB3;

	call __sm4_crypt_blk16;

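	/*
	 * CBC: P[i] = decrypt(C[i]) ^ C[i-1] with C[-1] = iv. Build the
	 * shifted ciphertext stream by pairing the IV with C[0] in RNOT and
	 * reading the remaining blocks at a 16-byte offset from src; the
	 * last ciphertext block becomes the new IV.
	 */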
	vmovdqu (%rcx), RNOTx;
	vinserti128 $1, (%rdx), RNOT, RNOT;
	vpxor RNOT, RA0, RA0;
	vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
	vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
	vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
	vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
	vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
	vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
	vmovdqu RNOTx, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)