GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//
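//
// Editorial note (not part of the generated file): the SM4 and AES
// S-boxes are both built around inversion in GF(2^8), so SM4's S-box
// can be computed as affine transforms wrapped around the AES SubBytes
// step. The recurring pattern below appears to be: a tbl byte shuffle
// (first .Lsbox_magic quad) that pre-permutes bytes to cancel the
// ShiftRows performed by aese, nibble-wise tbl lookups implementing an
// input affine map, aese with an all-zero round key (leaving only
// ShiftRows+SubBytes), then a second pair of nibble lookups for the
// output affine map.
//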

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch armv8-a+crypto
.text

.type _vpsm4_ex_consts,%object
.align 7
_vpsm4_ex_consts:
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
.quad 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
.quad 0x0b0e0104070a0d00,0x0306090c0f020508
.quad 0x62185a2042387a00,0x22581a6002783a40
.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size _vpsm4_ex_consts,.-_vpsm4_ex_consts
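//
// Editorial note: .Lck and .Lfk above are the standard SM4 key-schedule
// constants from the SM4 specification, stored as little-endian quads.
// Each byte of CK is defined arithmetically, so the table can be
// cross-checked with a few lines of C (illustrative sketch only):
//
//	#include <stdint.h>
//	/* ck(i) packs bytes (4*i+j)*7 mod 256, j = 0..3, high byte first */
//	uint32_t ck(int i) {
//		uint32_t w = 0;
//		for (int j = 0; j < 4; j++)
//			w = (w << 8) | (uint8_t)((4 * i + j) * 7);
//		return w; /* ck(0) == 0x00070E15, matching .Lck */
//	}
//
// .Lxts_magic is the usual GF(2^128) doubling-constant layout for XTS
// tweak updates, and .Lsbox_magic holds the shuffle, the nibble lookup
// tables and the 0x0f mask used by the AESE-based S-box.
//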
.type _vpsm4_ex_set_key,%function
.align 4
_vpsm4_ex_set_key:
AARCH64_VALID_CALL_TARGET
ld1 {v5.4s},[x0]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
adrp x5,.Lshuffles
add x5,x5,#:lo12:.Lshuffles
ld1 {v7.2d},[x5]
adrp x5,.Lfk
add x5,x5,#:lo12:.Lfk
ld1 {v6.2d},[x5]
eor v5.16b,v5.16b,v6.16b
mov x6,#32
adrp x5,.Lck
add x5,x5,#:lo12:.Lck
movi v0.16b,#64
cbnz w2,1f
add x1,x1,124
1:
mov w7,v5.s[1]
ldr w8,[x5],#4
eor w8,w8,w7
mov w7,v5.s[2]
eor w8,w8,w7
mov w7,v5.s[3]
eor w8,w8,w7
// optimize sbox using AESE instruction
mov v4.s[0],w8
tbl v0.16b, {v4.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
mov w7,v0.s[0]
eor w8,w7,w7,ror #19
eor w8,w8,w7,ror #9
mov w7,v5.s[0]
eor w8,w8,w7
mov v5.s[0],w8
cbz w2,2f
str w8,[x1],#4
b 3f
2:
str w8,[x1],#-4
3:
tbl v5.16b,{v5.16b},v7.16b
subs x6,x6,#1
b.ne 1b
ret
.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key
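//
// Editorial note: the loop above is the SM4 key schedule. In C-like
// pseudocode (illustrative; S() is the SM4 S-box computed via AESE,
// rol is a 32-bit left rotate):
//
//	/* K[0..3] = user_key ^ FK; then for i = 0..31: */
//	t = K[1] ^ K[2] ^ K[3] ^ CK[i];
//	t = S(t);
//	rk[i] = K[0] ^ t ^ rol(t, 13) ^ rol(t, 23);
//	/* slide the window: K[0..3] = K[1], K[2], K[3], rk[i] */
//
// The two ror-based eors implement the rotate pair (ror #19 == rol 13,
// ror #9 == rol 23), and the .Lshuffles tbl rotates the K window inside
// a vector register. When w2 is zero (decrypt-key setup) the keys are
// written back-to-front starting at byte offset 124, so decryption can
// reuse the encryption round core unchanged.
//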
.type _vpsm4_ex_enc_4blks,%function
.align 4
_vpsm4_ex_enc_4blks:
AARCH64_VALID_CALL_TARGET
mov x10,x3
mov w11,#8
10:
ldp w7,w8,[x10],8
dup v12.4s,w7
dup v13.4s,w8

// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor v14.16b,v6.16b,v7.16b
eor v12.16b,v5.16b,v12.16b
eor v12.16b,v14.16b,v12.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v12.16b,v0.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
eor v4.16b,v4.16b,v12.16b

// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor v14.16b,v14.16b,v4.16b
eor v13.16b,v14.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v13.16b,v0.16b

// linear transformation
ushr v0.4s,v13.4s,32-2
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v0.4s,v13.4s,2
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v0.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
ldp w7,w8,[x10],8
eor v5.16b,v5.16b,v13.16b

dup v12.4s,w7
dup v13.4s,w8

// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor v14.16b,v4.16b,v5.16b
eor v12.16b,v7.16b,v12.16b
eor v12.16b,v14.16b,v12.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v12.16b,v0.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
eor v6.16b,v6.16b,v12.16b

// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor v14.16b,v14.16b,v6.16b
eor v13.16b,v14.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v13.16b,v0.16b

// linear transformation
ushr v0.4s,v13.4s,32-2
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v0.4s,v13.4s,2
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v0.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v7.16b,v7.16b,v13.16b
subs w11,w11,#1
b.ne 10b
#ifndef __AARCH64EB__
rev32 v3.16b,v4.16b
#else
mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v2.16b,v5.16b
#else
mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v1.16b,v6.16b
#else
mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v0.16b,v7.16b
#else
mov v0.16b,v7.16b
#endif
ret
.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks
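//
// Editorial note: each iteration of the loop above performs four SM4
// rounds on four blocks in parallel (one 32-bit word per block per
// lane; 8 iterations give the full 32 rounds). The ushr/sli pairs
// build 32-bit lane rotates, so the tail of each round is the SM4
// linear transform, roughly (illustrative C, rol = 32-bit rotate
// left):
//
//	uint32_t L(uint32_t b) {
//		return b ^ rol(b, 2) ^ rol(b, 10) ^ rol(b, 18) ^ rol(b, 24);
//	}
//
// e.g. ushr v0.4s,v12.4s,32-2 followed by sli v0.4s,v12.4s,2 leaves
// rol(b,2) in each lane of v0.
//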
.type _vpsm4_ex_enc_8blks,%function
.align 4
_vpsm4_ex_enc_8blks:
AARCH64_VALID_CALL_TARGET
mov x10,x3
mov w11,#8
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
dup v12.4s,w7
eor v14.16b,v6.16b,v7.16b
eor v15.16b,v10.16b,v11.16b
eor v0.16b,v5.16b,v12.16b
eor v1.16b,v9.16b,v12.16b
eor v12.16b,v14.16b,v0.16b
eor v13.16b,v15.16b,v1.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v4.16b,v4.16b,v12.16b
eor v8.16b,v8.16b,v13.16b

// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
dup v13.4s,w8
eor v14.16b,v14.16b,v4.16b
eor v15.16b,v15.16b,v8.16b
eor v12.16b,v14.16b,v13.16b
eor v13.16b,v15.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
ldp w7,w8,[x10],8
eor v5.16b,v5.16b,v12.16b
eor v9.16b,v9.16b,v13.16b

// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
dup v12.4s,w7
eor v14.16b,v4.16b,v5.16b
eor v15.16b,v8.16b,v9.16b
eor v0.16b,v7.16b,v12.16b
eor v1.16b,v11.16b,v12.16b
eor v12.16b,v14.16b,v0.16b
eor v13.16b,v15.16b,v1.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v6.16b,v6.16b,v12.16b
eor v10.16b,v10.16b,v13.16b

// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
dup v13.4s,w8
eor v14.16b,v14.16b,v6.16b
eor v15.16b,v15.16b,v10.16b
eor v12.16b,v14.16b,v13.16b
eor v13.16b,v15.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v7.16b,v7.16b,v12.16b
eor v11.16b,v11.16b,v13.16b
subs w11,w11,#1
b.ne 10b
#ifndef __AARCH64EB__
rev32 v3.16b,v4.16b
#else
mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v2.16b,v5.16b
#else
mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v1.16b,v6.16b
#else
mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v0.16b,v7.16b
#else
mov v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v8.16b
#else
mov v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v9.16b
#else
mov v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v10.16b
#else
mov v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v4.16b,v11.16b
#else
mov v4.16b,v11.16b
#endif
ret
.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks
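//
// Editorial note: _vpsm4_ex_enc_8blks follows the same round structure
// as _vpsm4_ex_enc_4blks but keeps two 4-block states (v4-v7 and
// v8-v11) in flight, plausibly to hide tbl/aese latency behind
// independent work; the S-box and linear-transform sequences are
// duplicated for the second state using v1/v13/v25 as scratch.
//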
.globl vpsm4_ex_set_encrypt_key
.type vpsm4_ex_set_encrypt_key,%function
.align 5
vpsm4_ex_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,1
bl _vpsm4_ex_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key
.globl vpsm4_ex_set_decrypt_key
.type vpsm4_ex_set_decrypt_key,%function
.align 5
vpsm4_ex_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,0
bl _vpsm4_ex_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key
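//
// Editorial note: the exported entry points follow the usual OpenSSL
// SM4 assembly interface; roughly (illustrative prototypes; the
// authoritative declarations live in the matching OpenSSL tree):
//
//	void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
//	void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
//	void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out,
//	                      const SM4_KEY *key);
//
// Both key-setup wrappers funnel into _vpsm4_ex_set_key; w2 selects the
// round-key ordering (1 = forward for encryption, 0 = reversed for
// decryption).
//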
.globl vpsm4_ex_encrypt
.type vpsm4_ex_encrypt,%function
.align 5
vpsm4_ex_encrypt:
AARCH64_VALID_CALL_TARGET
ld1 {v4.4s},[x0]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x3,x2
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
ret
.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt
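//
// Editorial note: vpsm4_ex_decrypt below is the same round loop as
// vpsm4_ex_encrypt; SM4 decryption differs only in round-key order,
// which was already reversed by vpsm4_ex_set_decrypt_key.
//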
.globl vpsm4_ex_decrypt
.type vpsm4_ex_decrypt,%function
.align 5
vpsm4_ex_decrypt:
AARCH64_VALID_CALL_TARGET
ld1 {v4.4s},[x0]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x3,x2
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
ret
.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt
.globl vpsm4_ex_ecb_encrypt
.type vpsm4_ex_ecb_encrypt,%function
.align 5
vpsm4_ex_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
// convert length into blocks
lsr x2,x2,4
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
.Lecb_8_blocks_process:
cmp w2,#8
b.lt .Lecb_4_blocks_process
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
bl _vpsm4_ex_enc_8blks
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#8
b.gt .Lecb_8_blocks_process
b 100f
.Lecb_4_blocks_process:
cmp w2,#4
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
sub w2,w2,#4
1:
// process last block
cmp w2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
b 100f
1: // process last 2 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
cmp w2,#2
b.gt 1f
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
b 100f
1: // process last 3 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt
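//
// Editorial note: the ECB driver above is a width dispatcher. In
// C-like pseudocode (illustrative only):
//
//	nblocks = len >> 4;               /* 16-byte blocks */
//	while (nblocks >= 8) { enc_8blks(); nblocks -= 8; }
//	if (nblocks >= 4)    { enc_4blks(); nblocks -= 4; }
//	/* 1 left: scalar round loop; 2-3 left: partial-lane enc_4blks */
//
// The ld4/st4 forms de/re-interleave four blocks so that lane i of
// v4..v7 holds the four words of block i, which is the layout the
// 4/8-block round cores expect.
//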
.globl vpsm4_ex_cbc_encrypt
.type vpsm4_ex_cbc_encrypt,%function
.align 5
vpsm4_ex_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
lsr x2,x2,4
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
cbz w5,.Ldec
ld1 {v3.4s},[x4]
.Lcbc_4_blocks_enc:
cmp w2,#4
b.lt 1f
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
eor v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
eor v5.16b,v5.16b,v4.16b
mov x10,x3
mov w11,#8
mov w12,v5.s[0]
mov w13,v5.s[1]
mov w14,v5.s[2]
mov w15,v5.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v5.s[0],w15
mov v5.s[1],w14
mov v5.s[2],w13
mov v5.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v6.16b,v6.16b,v5.16b
mov x10,x3
mov w11,#8
mov w12,v6.s[0]
mov w13,v6.s[1]
mov w14,v6.s[2]
mov w15,v6.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v6.s[0],w15
mov v6.s[1],w14
mov v6.s[2],w13
mov v6.s[3],w12
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
eor v7.16b,v7.16b,v6.16b
mov x10,x3
mov w11,#8
mov w12,v7.s[0]
mov w13,v7.s[1]
mov w14,v7.s[2]
mov w15,v7.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v7.s[0],w15
mov v7.s[1],w14
mov v7.s[2],w13
mov v7.s[3],w12
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
orr v3.16b,v7.16b,v7.16b
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#4
b.ne .Lcbc_4_blocks_enc
b 2f
1:
subs w2,w2,#1
b.lt 2f
ld1 {v4.4s},[x0],#16
eor v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v3.s[0]
mov w13,v3.s[1]
mov w14,v3.s[2]
mov w15,v3.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v3.s[0],w15
mov v3.s[1],w14
mov v3.s[2],w13
mov v3.s[3],w12
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
st1 {v3.4s},[x1],#16
b 1b
2:
// save back IV
st1 {v3.4s},[x4]
ret

.Ldec:
// decryption mode starts
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
cmp w2,#8
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
add x10,x0,#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
bl _vpsm4_ex_enc_8blks
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
zip1 v8.4s,v4.4s,v5.4s
zip2 v9.4s,v4.4s,v5.4s
zip1 v10.4s,v6.4s,v7.4s
zip2 v11.4s,v6.4s,v7.4s
zip1 v4.2d,v8.2d,v10.2d
zip2 v5.2d,v8.2d,v10.2d
zip1 v6.2d,v9.2d,v11.2d
zip2 v7.2d,v9.2d,v11.2d
ld1 {v15.4s},[x4]
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
// note ivec1 and vtmpx[3] are reusing the same register
// care needs to be taken to avoid conflict
eor v0.16b,v0.16b,v15.16b
ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
eor v1.16b,v1.16b,v8.16b
eor v2.16b,v2.16b,v9.16b
eor v3.16b,v3.16b,v10.16b
// save back IV
st1 {v15.4s}, [x4]
eor v4.16b,v4.16b,v11.16b
eor v5.16b,v5.16b,v12.16b
eor v6.16b,v6.16b,v13.16b
eor v7.16b,v7.16b,v14.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#8
b.gt .Lcbc_8_blocks_dec
b.eq 100f
1:
ld1 {v15.4s},[x4]
.Lcbc_4_blocks_dec:
cmp w2,#4
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
eor v0.16b,v0.16b,v15.16b
eor v1.16b,v1.16b,v4.16b
orr v15.16b,v7.16b,v7.16b
eor v2.16b,v2.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
subs w2,w2,#4
b.gt .Lcbc_4_blocks_dec
// save back IV
st1 {v7.4s}, [x4]
b 100f
1: // last block
subs w2,w2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0],#16
// save back IV
st1 {v4.4s}, [x4]
#ifndef __AARCH64EB__
rev32 v8.16b,v4.16b
#else
mov v8.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v8.s[0]
mov w13,v8.s[1]
mov w14,v8.s[2]
mov w15,v8.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v8.s[0],w15
mov v8.s[1],w14
mov v8.s[2],w13
mov v8.s[3],w12
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
eor v8.16b,v8.16b,v15.16b
st1 {v8.4s},[x1],#16
b 100f
1: // last two blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
add x10,x0,#16
ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
subs w2,w2,1
b.gt 1f
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
ld1 {v4.4s,v5.4s},[x0],#32
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
eor v0.16b,v0.16b,v15.16b
eor v1.16b,v1.16b,v4.16b
st1 {v0.4s,v1.4s},[x1],#32
// save back IV
st1 {v5.4s}, [x4]
b 100f
1: // last 3 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
eor v0.16b,v0.16b,v15.16b
eor v1.16b,v1.16b,v4.16b
eor v2.16b,v2.16b,v5.16b
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
// save back IV
st1 {v6.4s}, [x4]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt
.globl vpsm4_ex_ctr32_encrypt_blocks
.type vpsm4_ex_ctr32_encrypt_blocks,%function
.align 5
vpsm4_ex_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
ld1 {v3.4s},[x4]
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
cmp w2,#1
b.ne 1f
// fast processing for one single block without
// context saving overhead
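// Single-block CTR: the counter block in v3 is run through the 32-round
// loop below using the round keys at x3, and the resulting keystream is
// XORed into the one plaintext block; no callee-saved registers are
// spilled on this path.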
mov x10,x3
mov w11,#8
mov w12,v3.s[0]
mov w13,v3.s[1]
mov w14,v3.s[2]
mov w15,v3.s[3]
10:
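// 8 iterations x 4 rounds below = the 32 SM4 rounds; each ldp fetches
// two 32-bit round keys at a time.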
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v3.s[0],w15
mov v3.s[1],w14
mov v3.s[2],w13
mov v3.s[3],w12
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
ld1 {v4.4s},[x0]
eor v4.16b,v4.16b,v3.16b
st1 {v4.4s},[x1]
ret
1:
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
mov w12,v3.s[0]
mov w13,v3.s[1]
mov w14,v3.s[2]
mov w5,v3.s[3]
.Lctr32_4_blocks_process:
cmp w2,#4
b.lt 1f
dup v4.4s,w12
dup v5.4s,w13
dup v6.4s,w14
mov v7.s[0],w5
add w5,w5,#1
mov v7.s[1],w5
add w5,w5,#1
mov v7.s[2],w5
add w5,w5,#1
mov v7.s[3],w5
add w5,w5,#1
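// The four counter blocks are built directly in word-sliced form:
// v4/v5/v6 broadcast the three fixed IV words, while v7 collects four
// consecutive counter values, matching the layout _vpsm4_ex_enc_4blks
// operates on.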
cmp w2,#8
b.ge .Lctr32_8_blocks_process
bl _vpsm4_ex_enc_4blks
ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
eor v0.16b,v0.16b,v12.16b
eor v1.16b,v1.16b,v13.16b
eor v2.16b,v2.16b,v14.16b
eor v3.16b,v3.16b,v15.16b
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
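// ld4/st4 de-interleave and re-interleave by 32-bit words: after ld4,
// each of v12-v15 holds the same word position from four consecutive
// blocks, so the XOR against the word-sliced keystream in v0-v3 lines
// up lane by lane.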
subs w2,w2,#4
b.ne .Lctr32_4_blocks_process
b 100f
.Lctr32_8_blocks_process:
dup v8.4s,w12
dup v9.4s,w13
dup v10.4s,w14
mov v11.s[0],w5
add w5,w5,#1
mov v11.s[1],w5
add w5,w5,#1
mov v11.s[2],w5
add w5,w5,#1
mov v11.s[3],w5
add w5,w5,#1
bl _vpsm4_ex_enc_8blks
ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
eor v0.16b,v0.16b,v12.16b
eor v1.16b,v1.16b,v13.16b
eor v2.16b,v2.16b,v14.16b
eor v3.16b,v3.16b,v15.16b
eor v4.16b,v4.16b,v8.16b
eor v5.16b,v5.16b,v9.16b
eor v6.16b,v6.16b,v10.16b
eor v7.16b,v7.16b,v11.16b
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#8
b.ne .Lctr32_4_blocks_process
b 100f
1: // last block processing
subs w2,w2,#1
b.lt 100f
b.gt 1f
mov v3.s[0],w12
mov v3.s[1],w13
mov v3.s[2],w14
mov v3.s[3],w5
mov x10,x3
mov w11,#8
mov w12,v3.s[0]
mov w13,v3.s[1]
mov w14,v3.s[2]
mov w15,v3.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v3.s[0],w15
mov v3.s[1],w14
mov v3.s[2],w13
mov v3.s[3],w12
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
ld1 {v4.4s},[x0]
eor v4.16b,v4.16b,v3.16b
st1 {v4.4s},[x1]
b 100f
1: // last 2 blocks processing
dup v4.4s,w12
dup v5.4s,w13
dup v6.4s,w14
mov v7.s[0],w5
add w5,w5,#1
mov v7.s[1],w5
subs w2,w2,#1
b.ne 1f
bl _vpsm4_ex_enc_4blks
ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
eor v0.16b,v0.16b,v12.16b
eor v1.16b,v1.16b,v13.16b
eor v2.16b,v2.16b,v14.16b
eor v3.16b,v3.16b,v15.16b
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
b 100f
1: // last 3 blocks processing
add w5,w5,#1
mov v7.s[2],w5
bl _vpsm4_ex_enc_4blks
ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
eor v0.16b,v0.16b,v12.16b
eor v1.16b,v1.16b,v13.16b
eor v2.16b,v2.16b,v14.16b
eor v3.16b,v3.16b,v15.16b
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
.globl vpsm4_ex_xts_encrypt_gb
.type vpsm4_ex_xts_encrypt_gb,%function
.align 5
vpsm4_ex_xts_encrypt_gb:
AARCH64_SIGN_LINK_REGISTER
stp x15, x16, [sp, #-0x10]!
stp x17, x18, [sp, #-0x10]!
stp x19, x20, [sp, #-0x10]!
stp x21, x22, [sp, #-0x10]!
stp x23, x24, [sp, #-0x10]!
stp x25, x26, [sp, #-0x10]!
stp x27, x28, [sp, #-0x10]!
stp x29, x30, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d14, d15, [sp, #-0x10]!
mov x26,x3
mov x27,x4
mov w28,w6
ld1 {v16.4s}, [x5]
mov x3,x27
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v16.s[0]
mov w13,v16.s[1]
mov w14,v16.s[2]
mov w15,v16.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v16.s[0],w15
mov v16.s[1],w14
mov v16.s[2],w13
mov v16.s[3],w12
#ifndef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov x3,x26
and x29,x2,#0x0F
// convert length into blocks
lsr x2,x2,4
cmp x2,#1
b.lt .return_gb

cmp x29,0
// If the encryption/decryption length is a multiple of 16,
// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
b.eq .xts_encrypt_blocks_gb

// If the encryption/decryption length is not a multiple of 16,
// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb;
// the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
subs x2,x2,#1
b.eq .only_2blks_tweak_gb
.xts_encrypt_blocks_gb:
rbit v16.16b,v16.16b
#ifdef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov x12,v16.d[0]
mov x13,v16.d[1]
mov w7,0x87
extr x9,x13,x13,#32
extr x15,x13,x12,#63
and w8,w7,w9,asr#31
eor x14,x8,x12,lsl#1
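// Each mov/extr/and/eor group above and below doubles the 128-bit tweak
// in GF(2^128): x15:x14 = (x13:x12) << 1, with the carry out of bit 127
// reduced back in via the XTS polynomial constant 0x87
// (x^128 + x^7 + x^2 + x + 1). Because of the rbit above, the chain runs
// on the bit-reversed tweak used by this GB variant of XTS.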
mov w7,0x87
extr x9,x15,x15,#32
extr x17,x15,x14,#63
and w8,w7,w9,asr#31
eor x16,x8,x14,lsl#1
mov w7,0x87
extr x9,x17,x17,#32
extr x19,x17,x16,#63
and w8,w7,w9,asr#31
eor x18,x8,x16,lsl#1
mov w7,0x87
extr x9,x19,x19,#32
extr x21,x19,x18,#63
and w8,w7,w9,asr#31
eor x20,x8,x18,lsl#1
mov w7,0x87
extr x9,x21,x21,#32
extr x23,x21,x20,#63
and w8,w7,w9,asr#31
eor x22,x8,x20,lsl#1
mov w7,0x87
extr x9,x23,x23,#32
extr x25,x23,x22,#63
and w8,w7,w9,asr#31
eor x24,x8,x22,lsl#1
mov w7,0x87
extr x9,x25,x25,#32
extr x27,x25,x24,#63
and w8,w7,w9,asr#31
eor x26,x8,x24,lsl#1
.Lxts_8_blocks_process_gb:
cmp x2,#8
mov v16.d[0],x12
mov v16.d[1],x13
#ifdef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov w7,0x87
extr x9,x27,x27,#32
extr x13,x27,x26,#63
and w8,w7,w9,asr#31
eor x12,x8,x26,lsl#1
mov v17.d[0],x14
mov v17.d[1],x15
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
mov w7,0x87
extr x9,x13,x13,#32
extr x15,x13,x12,#63
and w8,w7,w9,asr#31
eor x14,x8,x12,lsl#1
mov v18.d[0],x16
mov v18.d[1],x17
#ifdef __AARCH64EB__
rev32 v18.16b,v18.16b
#endif
mov w7,0x87
extr x9,x15,x15,#32
extr x17,x15,x14,#63
and w8,w7,w9,asr#31
eor x16,x8,x14,lsl#1
mov v19.d[0],x18
mov v19.d[1],x19
#ifdef __AARCH64EB__
rev32 v19.16b,v19.16b
#endif
mov w7,0x87
extr x9,x17,x17,#32
extr x19,x17,x16,#63
and w8,w7,w9,asr#31
eor x18,x8,x16,lsl#1
mov v20.d[0],x20
mov v20.d[1],x21
#ifdef __AARCH64EB__
rev32 v20.16b,v20.16b
#endif
mov w7,0x87
extr x9,x19,x19,#32
extr x21,x19,x18,#63
and w8,w7,w9,asr#31
eor x20,x8,x18,lsl#1
mov v21.d[0],x22
mov v21.d[1],x23
#ifdef __AARCH64EB__
rev32 v21.16b,v21.16b
#endif
mov w7,0x87
extr x9,x21,x21,#32
extr x23,x21,x20,#63
and w8,w7,w9,asr#31
eor x22,x8,x20,lsl#1
mov v22.d[0],x24
mov v22.d[1],x25
#ifdef __AARCH64EB__
rev32 v22.16b,v22.16b
#endif
mov w7,0x87
extr x9,x23,x23,#32
extr x25,x23,x22,#63
and w8,w7,w9,asr#31
eor x24,x8,x22,lsl#1
mov v23.d[0],x26
mov v23.d[1],x27
#ifdef __AARCH64EB__
rev32 v23.16b,v23.16b
#endif
mov w7,0x87
extr x9,x25,x25,#32
extr x27,x25,x24,#63
and w8,w7,w9,asr#31
eor x26,x8,x24,lsl#1
b.lt .Lxts_4_blocks_process_gb
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
rbit v16.16b,v16.16b
rbit v17.16b,v17.16b
rbit v18.16b,v18.16b
rbit v19.16b,v19.16b
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v6.16b, v6.16b, v18.16b
eor v7.16b, v7.16b, v19.16b
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
rbit v20.16b,v20.16b
rbit v21.16b,v21.16b
rbit v22.16b,v22.16b
rbit v23.16b,v23.16b
eor v8.16b, v8.16b, v20.16b
eor v9.16b, v9.16b, v21.16b
eor v10.16b, v10.16b, v22.16b
eor v11.16b, v11.16b, v23.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
zip1 v0.4s,v8.4s,v9.4s
zip2 v1.4s,v8.4s,v9.4s
zip1 v2.4s,v10.4s,v11.4s
zip2 v3.4s,v10.4s,v11.4s
zip1 v8.2d,v0.2d,v2.2d
zip2 v9.2d,v0.2d,v2.2d
zip1 v10.2d,v1.2d,v3.2d
zip2 v11.2d,v1.2d,v3.2d
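// The zip1/zip2 pairs transpose the eight whitened blocks from
// block-per-vector layout into the word-sliced layout (one SM4 state
// word from each block per vector) expected by _vpsm4_ex_enc_8blks;
// the matching zips after the call undo the transpose.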
bl _vpsm4_ex_enc_8blks
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
zip1 v8.4s,v4.4s,v5.4s
zip2 v9.4s,v4.4s,v5.4s
zip1 v10.4s,v6.4s,v7.4s
zip2 v11.4s,v6.4s,v7.4s
zip1 v4.2d,v8.2d,v10.2d
zip2 v5.2d,v8.2d,v10.2d
zip1 v6.2d,v9.2d,v11.2d
zip2 v7.2d,v9.2d,v11.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
eor v2.16b, v2.16b, v18.16b
eor v3.16b, v3.16b, v19.16b
eor v4.16b, v4.16b, v20.16b
eor v5.16b, v5.16b, v21.16b
eor v6.16b, v6.16b, v22.16b
eor v7.16b, v7.16b, v23.16b

// save the last tweak
mov v25.16b,v23.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs x2,x2,#8
b.gt .Lxts_8_blocks_process_gb
b 100f
.Lxts_4_blocks_process_gb:
cmp x2,#4
b.lt 1f
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
rbit v16.16b,v16.16b
rbit v17.16b,v17.16b
rbit v18.16b,v18.16b
rbit v19.16b,v19.16b
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v6.16b, v6.16b, v18.16b
eor v7.16b, v7.16b, v19.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
eor v2.16b, v2.16b, v18.16b
eor v3.16b, v3.16b, v19.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
sub x2,x2,#4
mov v16.16b,v20.16b
mov v17.16b,v21.16b
mov v18.16b,v22.16b
// save the last tweak
mov v25.16b,v19.16b
1:
// process last block
cmp x2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0],#16
rbit v16.16b,v16.16b
eor v4.16b, v4.16b, v16.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v16.16b
st1 {v4.4s},[x1],#16
// save the last tweak
mov v25.16b,v16.16b
b 100f
1: // process last 2 blocks
cmp x2,#2
b.gt 1f
ld1 {v4.4s,v5.4s},[x0],#32
rbit v16.16b,v16.16b
rbit v17.16b,v17.16b
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
st1 {v0.4s,v1.4s},[x1],#32
// save the last tweak
mov v25.16b,v17.16b
b 100f
1: // process last 3 blocks
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
rbit v16.16b,v16.16b
rbit v17.16b,v17.16b
rbit v18.16b,v18.16b
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v6.16b, v6.16b, v18.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
eor v2.16b, v2.16b, v18.16b
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
// save the last tweak
mov v25.16b,v18.16b
100:
cmp x29,0
b.eq .return_gb

// This branch calculates the last two tweaks
// when the encryption/decryption length is larger than 32
.last_2blks_tweak_gb:
#ifdef __AARCH64EB__
rev32 v25.16b,v25.16b
#endif
rbit v2.16b,v25.16b
adrp x9, .Lxts_magic
ldr q0, [x9, #:lo12:.Lxts_magic]
shl v17.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
rbit v17.16b,v17.16b
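// Vectorized tweak doubling: shl #1 shifts every byte left, ext/ushr #7
// extracts each byte's carry from the neighbouring byte, and the mul
// with .Lxts_magic (0x87 in byte 0, 0x01 elsewhere) propagates those
// carries, folding the wrap-around carry back in through the XTS
// polynomial. The surrounding rbit pair keeps the arithmetic in the
// bit-reversed domain used by this GB variant.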
rbit v2.16b,v17.16b
adrp x9, .Lxts_magic
ldr q0, [x9, #:lo12:.Lxts_magic]
shl v18.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v18.16b, v18.16b, v1.16b
rbit v18.16b,v18.16b
b .check_dec_gb


// This branch calculates the last two tweaks
// when the encryption/decryption length is equal to 32, which only needs two tweaks
.only_2blks_tweak_gb:
mov v17.16b,v16.16b
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
rbit v2.16b,v17.16b
adrp x9, .Lxts_magic
ldr q0, [x9, #:lo12:.Lxts_magic]
shl v18.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v18.16b, v18.16b, v1.16b
rbit v18.16b,v18.16b
b .check_dec_gb


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec_gb:
// encryption:1 decryption:0
cmp w28,1
b.eq .process_last_2blks_gb
mov v0.16B,v17.16b
mov v17.16B,v18.16b
mov v18.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
rev32 v18.16b,v18.16b
#endif
ld1 {v4.4s},[x0],#16
eor v4.16b, v4.16b, v17.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v17.16b
st1 {v4.4s},[x1],#16

sub x26,x1,16
.loop_gb:
subs x29,x29,1
ldrb w7,[x26,x29]
ldrb w8,[x0,x29]
strb w8,[x26,x29]
strb w7,[x1,x29]
b.gt .loop_gb
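// Ciphertext stealing: the x29 trailing bytes of the block just written
// at x26 are swapped with the remaining plaintext bytes, so the partial
// final block receives the tail of the penultimate ciphertext block,
// and the merged block at x26 is then re-encrypted below with the last
// tweak (v18).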
ld1 {v4.4s}, [x26]
eor v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v18.16b
st1 {v4.4s}, [x26]
.return_gb:
ldp d14, d15, [sp], #0x10
ldp d12, d13, [sp], #0x10
ldp d10, d11, [sp], #0x10
ldp d8, d9, [sp], #0x10
ldp x29, x30, [sp], #0x10
ldp x27, x28, [sp], #0x10
ldp x25, x26, [sp], #0x10
ldp x23, x24, [sp], #0x10
ldp x21, x22, [sp], #0x10
ldp x19, x20, [sp], #0x10
ldp x17, x18, [sp], #0x10
ldp x15, x16, [sp], #0x10
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
.globl vpsm4_ex_xts_encrypt
.type vpsm4_ex_xts_encrypt,%function
.align 5
vpsm4_ex_xts_encrypt:
AARCH64_SIGN_LINK_REGISTER
stp x15, x16, [sp, #-0x10]!
stp x17, x18, [sp, #-0x10]!
stp x19, x20, [sp, #-0x10]!
stp x21, x22, [sp, #-0x10]!
stp x23, x24, [sp, #-0x10]!
stp x25, x26, [sp, #-0x10]!
stp x27, x28, [sp, #-0x10]!
stp x29, x30, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d14, d15, [sp, #-0x10]!
mov x26,x3
mov x27,x4
mov w28,w6
ld1 {v16.4s}, [x5]
mov x3,x27
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v16.s[0]
mov w13,v16.s[1]
mov w14,v16.s[2]
mov w15,v16.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v16.s[0],w15
mov v16.s[1],w14
mov v16.s[2],w13
mov v16.s[3],w12
#ifndef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov x3,x26
and x29,x2,#0x0F
// convert length into blocks
lsr x2,x2,4
cmp x2,#1
b.lt .return

cmp x29,0
// If the encryption/decryption length is a multiple of 16,
// all blocks are encrypted/decrypted in .xts_encrypt_blocks
b.eq .xts_encrypt_blocks

// If the encryption/decryption length is not a multiple of 16,
// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak;
// the other blocks are encrypted/decrypted in .xts_encrypt_blocks
subs x2,x2,#1
b.eq .only_2blks_tweak
.xts_encrypt_blocks:
#ifdef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov x12,v16.d[0]
mov x13,v16.d[1]
mov w7,0x87
extr x9,x13,x13,#32
extr x15,x13,x12,#63
and w8,w7,w9,asr#31
eor x14,x8,x12,lsl#1
mov w7,0x87
extr x9,x15,x15,#32
extr x17,x15,x14,#63
and w8,w7,w9,asr#31
eor x16,x8,x14,lsl#1
mov w7,0x87
extr x9,x17,x17,#32
extr x19,x17,x16,#63
and w8,w7,w9,asr#31
eor x18,x8,x16,lsl#1
mov w7,0x87
extr x9,x19,x19,#32
extr x21,x19,x18,#63
and w8,w7,w9,asr#31
eor x20,x8,x18,lsl#1
mov w7,0x87
extr x9,x21,x21,#32
extr x23,x21,x20,#63
and w8,w7,w9,asr#31
eor x22,x8,x20,lsl#1
mov w7,0x87
extr x9,x23,x23,#32
extr x25,x23,x22,#63
and w8,w7,w9,asr#31
eor x24,x8,x22,lsl#1
mov w7,0x87
extr x9,x25,x25,#32
extr x27,x25,x24,#63
and w8,w7,w9,asr#31
eor x26,x8,x24,lsl#1
.Lxts_8_blocks_process:
cmp x2,#8
mov v16.d[0],x12
mov v16.d[1],x13
#ifdef __AARCH64EB__
rev32 v16.16b,v16.16b
#endif
mov w7,0x87
extr x9,x27,x27,#32
extr x13,x27,x26,#63
and w8,w7,w9,asr#31
eor x12,x8,x26,lsl#1
mov v17.d[0],x14
mov v17.d[1],x15
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
mov w7,0x87
extr x9,x13,x13,#32
extr x15,x13,x12,#63
and w8,w7,w9,asr#31
eor x14,x8,x12,lsl#1
mov v18.d[0],x16
mov v18.d[1],x17
#ifdef __AARCH64EB__
rev32 v18.16b,v18.16b
#endif
mov w7,0x87
extr x9,x15,x15,#32
extr x17,x15,x14,#63
and w8,w7,w9,asr#31
eor x16,x8,x14,lsl#1
mov v19.d[0],x18
mov v19.d[1],x19
#ifdef __AARCH64EB__
rev32 v19.16b,v19.16b
#endif
mov w7,0x87
extr x9,x17,x17,#32
extr x19,x17,x16,#63
and w8,w7,w9,asr#31
eor x18,x8,x16,lsl#1
mov v20.d[0],x20
mov v20.d[1],x21
#ifdef __AARCH64EB__
rev32 v20.16b,v20.16b
#endif
mov w7,0x87
extr x9,x19,x19,#32
extr x21,x19,x18,#63
and w8,w7,w9,asr#31
eor x20,x8,x18,lsl#1
mov v21.d[0],x22
mov v21.d[1],x23
#ifdef __AARCH64EB__
rev32 v21.16b,v21.16b
#endif
mov w7,0x87
extr x9,x21,x21,#32
extr x23,x21,x20,#63
and w8,w7,w9,asr#31
eor x22,x8,x20,lsl#1
mov v22.d[0],x24
mov v22.d[1],x25
#ifdef __AARCH64EB__
rev32 v22.16b,v22.16b
#endif
mov w7,0x87
extr x9,x23,x23,#32
extr x25,x23,x22,#63
and w8,w7,w9,asr#31
eor x24,x8,x22,lsl#1
mov v23.d[0],x26
mov v23.d[1],x27
#ifdef __AARCH64EB__
rev32 v23.16b,v23.16b
#endif
mov w7,0x87
extr x9,x25,x25,#32
extr x27,x25,x24,#63
and w8,w7,w9,asr#31
eor x26,x8,x24,lsl#1
b.lt .Lxts_4_blocks_process
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v6.16b, v6.16b, v18.16b
eor v7.16b, v7.16b, v19.16b
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
eor v8.16b, v8.16b, v20.16b
eor v9.16b, v9.16b, v21.16b
eor v10.16b, v10.16b, v22.16b
eor v11.16b, v11.16b, v23.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
zip1 v0.4s,v8.4s,v9.4s
zip2 v1.4s,v8.4s,v9.4s
zip1 v2.4s,v10.4s,v11.4s
zip2 v3.4s,v10.4s,v11.4s
zip1 v8.2d,v0.2d,v2.2d
zip2 v9.2d,v0.2d,v2.2d
zip1 v10.2d,v1.2d,v3.2d
zip2 v11.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_8blks
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
zip1 v8.4s,v4.4s,v5.4s
zip2 v9.4s,v4.4s,v5.4s
zip1 v10.4s,v6.4s,v7.4s
zip2 v11.4s,v6.4s,v7.4s
zip1 v4.2d,v8.2d,v10.2d
zip2 v5.2d,v8.2d,v10.2d
zip1 v6.2d,v9.2d,v11.2d
zip2 v7.2d,v9.2d,v11.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
eor v2.16b, v2.16b, v18.16b
eor v3.16b, v3.16b, v19.16b
eor v4.16b, v4.16b, v20.16b
eor v5.16b, v5.16b, v21.16b
eor v6.16b, v6.16b, v22.16b
eor v7.16b, v7.16b, v23.16b

// save the last tweak
mov v25.16b,v23.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs x2,x2,#8
b.gt .Lxts_8_blocks_process
b 100f
.Lxts_4_blocks_process:
cmp x2,#4
b.lt 1f
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v6.16b, v6.16b, v18.16b
eor v7.16b, v7.16b, v19.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
eor v2.16b, v2.16b, v18.16b
eor v3.16b, v3.16b, v19.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
sub x2,x2,#4
mov v16.16b,v20.16b
mov v17.16b,v21.16b
mov v18.16b,v22.16b
// save the last tweak
mov v25.16b,v19.16b
1:
// process last block
cmp x2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0],#16
eor v4.16b, v4.16b, v16.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v16.16b
st1 {v4.4s},[x1],#16
// save the last tweak
mov v25.16b,v16.16b
b 100f
1: // process last 2 blocks
cmp x2,#2
b.gt 1f
ld1 {v4.4s,v5.4s},[x0],#32
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
st1 {v0.4s,v1.4s},[x1],#32
// save the last tweak
mov v25.16b,v17.16b
b 100f
1: // process last 3 blocks
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v6.16b, v6.16b, v18.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_ex_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v17.16b
eor v2.16b, v2.16b, v18.16b
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
// save the last tweak
mov v25.16b,v18.16b
100:
cmp x29,0
b.eq .return

// This branch calculates the last two tweaks
// when the encryption/decryption length is larger than 32
.last_2blks_tweak:
#ifdef __AARCH64EB__
rev32 v25.16b,v25.16b
#endif
mov v2.16b,v25.16b
adrp x9, .Lxts_magic
ldr q0, [x9, #:lo12:.Lxts_magic]
shl v17.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
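// Same vectorized GF(2^128) doubling as in the GB path, but in the
// standard IEEE XTS bit order (no rbit): each byte is shifted left and
// the carry chain is closed through the 0x87 byte of .Lxts_magic.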
mov v2.16b,v17.16b
adrp x9, .Lxts_magic
ldr q0, [x9, #:lo12:.Lxts_magic]
shl v18.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v18.16b, v18.16b, v1.16b
b .check_dec


// This branch calculates the last two tweaks
// when the encryption/decryption length is equal to 32, which only needs two tweaks
4208
.only_2blks_tweak:
4209
mov v17.16b,v16.16b
4210
#ifdef __AARCH64EB__
4211
rev32 v17.16b,v17.16b
4212
#endif
4213
mov v2.16b,v17.16b
4214
adrp x9, .Lxts_magic
4215
ldr q0, [x9, #:lo12:.Lxts_magic]
4216
shl v18.16b, v2.16b, #1
4217
ext v1.16b, v2.16b, v2.16b,#15
4218
ushr v1.16b, v1.16b, #7
4219
mul v1.16b, v1.16b, v0.16b
4220
eor v18.16b, v18.16b, v1.16b
4221
b .check_dec

// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec:
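// With ciphertext stealing, the full block stored at the second-to-last
// position was produced under the sequence's later tweak, so decryption
// must consume the two tweaks in the opposite order; hence the swap of
// v17/v18 below when w28 == 0.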
// encryption:1 decryption:0
cmp w28,1
b.eq .process_last_2blks
mov v0.16b,v17.16b
mov v17.16b,v18.16b
mov v18.16b,v0.16b

.process_last_2blks:
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
rev32 v18.16b,v18.16b
#endif
ld1 {v4.4s},[x0],#16
eor v4.16b, v4.16b, v17.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
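// Scalar single-block SM4 encryption of the whitened block: x10 walks
// the round-key schedule (x3), w11 counts 8 iterations of four rounds
// each (32 rounds total), and w12..w15 hold the four state words
// B0..B3. Each round computes
//   B0 ^= L(SBOX(B1 ^ B2 ^ B3 ^ rk))
// with the roles of B0..B3 rotating from round to round.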
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
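// How the AESE trick works: the SM4 sbox, like the AES sbox, wraps an
// inversion in GF(2^8) in affine transforms, so it can be written as
// SM4_SBOX(x) = A2(AES_SBOX(A1(x))) for suitable affine maps A1/A2.
// The tbl pairs (v27/v28 before AESE, v29/v30 after, split over the
// two nibbles selected with the v31 = 0x0f mask) apply those affine
// maps, v26 pre-permutes the bytes to cancel the ShiftRows step inside
// AESE, and AESE with an all-zero round key (v1) supplies the AES
// SubBytes itself.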
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
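// The eor/ror chain below is SM4's linear transform
//   L(x) = x ^ (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) ^ (x <<< 24)
// expressed with right-rotates (ror #32-n == rotate left by n).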
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
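// Final SM4 reverse transform R: the state words are emitted in
// reverse order (B3,B2,B1,B0) before the byte-order fixup and the
// XTS output whitening with v17.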
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v17.16b
st1 {v4.4s},[x1],#16

sub x26,x1,16
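// Ciphertext stealing: x26 now points at the ciphertext block just
// written. The byte loop swaps the x29 tail bytes: the trailing
// plaintext bytes move into that block (completing a full block to
// encrypt under tweak v18), while the displaced ciphertext bytes are
// emitted as the final, partial output block at x1.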
.loop:
subs x29,x29,1
ldrb w7,[x26,x29]
ldrb w8,[x0,x29]
strb w8,[x26,x29]
strb w7,[x1,x29]
b.gt .loop
ld1 {v4.4s}, [x26]
eor v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v18.16b
st1 {v4.4s}, [x26]
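// The completed block is encrypted under tweak v18 and stored back in
// place at x26, one block before the partial tail already written at x1.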
.return:
ldp d14, d15, [sp], #0x10
ldp d12, d13, [sp], #0x10
ldp d10, d11, [sp], #0x10
ldp d8, d9, [sp], #0x10
ldp x29, x30, [sp], #0x10
ldp x27, x28, [sp], #0x10
ldp x25, x26, [sp], #0x10
ldp x23, x24, [sp], #0x10
ldp x21, x22, [sp], #0x10
ldp x19, x20, [sp], #0x10
ldp x17, x18, [sp], #0x10
ldp x15, x16, [sp], #0x10
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt