GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
// Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
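//
// Note on the S-box trick used throughout: the SM4 S-box, like the AES
// S-box, is an inversion in GF(2^8) wrapped in affine transforms, so it
// can be evaluated with the AESE instruction: pre-shuffle the input to
// undo AESE's ShiftRows step, apply an input affine transform with
// nibble table lookups, run AESE with an all-zero round key (which
// reduces AESE to ShiftRows followed by SubBytes), then apply an output
// affine transform. The .Lsbox_magic constants below hold the shuffle
// mask and the nibble lookup tables used for those transforms.
//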
#include "arm_arch.h"
.arch armv8-a+crypto
.text

.type _vpsm4_ex_consts,%object
.align 7
_vpsm4_ex_consts:
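// .Lck: the 32 SM4 key-schedule constants CK[i]; per the SM4
// specification, byte j of CK[i] is (4*i + j) * 7 mod 256.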
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
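// .Lfk below packs the SM4 system parameters FK0..FK3 (0xa3b1bac6,
// 0x56aa3350, 0x677d9197, 0xb27022dc) as two little-endian quads;
// .Lxts_magic is the GF(2^128) reduction constant 0x87 used when
// doubling an XTS tweak.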
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
#ifndef __AARCH64EB__
.quad 0x0101010101010187,0x0101010101010101
#else
.quad 0x0101010101010101,0x0101010101010187
#endif
.Lsbox_magic:
#ifndef __AARCH64EB__
.quad 0x0b0e0104070a0d00,0x0306090c0f020508
.quad 0x62185a2042387a00,0x22581a6002783a40
.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
#else
.quad 0x0306090c0f020508,0x0b0e0104070a0d00
.quad 0x22581a6002783a40,0x62185a2042387a00
.quad 0xc10bb67c4a803df7,0x15df62a89e54e923
.quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300
.quad 0xe383c1a1fe9edcbc,0x6404462679195b3b
#endif
.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size _vpsm4_ex_consts,.-_vpsm4_ex_consts
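// _vpsm4_ex_set_key expands the 128-bit user key into 32 round keys:
//   rk[i] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i])
// where T' combines the S-box with the key-schedule linear transform
// L'(B) = B ^ (B <<< 13) ^ (B <<< 23) (the ror #19/ror #9 pair below).
// Round keys are stored forward when w2 != 0 (encryption) and in
// reverse order when w2 == 0 (decryption).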
.type _vpsm4_ex_set_key,%function
.align 4
_vpsm4_ex_set_key:
AARCH64_VALID_CALL_TARGET
ld1 {v5.4s},[x0]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
adrp x5,.Lshuffles
add x5,x5,#:lo12:.Lshuffles
ld1 {v7.2d},[x5]
adrp x5,.Lfk
add x5,x5,#:lo12:.Lfk
ld1 {v6.2d},[x5]
eor v5.16b,v5.16b,v6.16b
mov x6,#32
adrp x5,.Lck
add x5,x5,#:lo12:.Lck
movi v0.16b,#64
cbnz w2,1f
add x1,x1,124
1:
mov w7,v5.s[1]
ldr w8,[x5],#4
eor w8,w8,w7
mov w7,v5.s[2]
eor w8,w8,w7
mov w7,v5.s[3]
eor w8,w8,w7
// optimize sbox using AESE instruction
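// The tbl with v26 pre-permutes the bytes so that AESE's ShiftRows
// step is cancelled; the two nibble lookups through v27/v28 (indexed
// via the 0x0f mask kept in v31) apply an input affine transform,
// AESE with a zero round key performs the shared GF(2^8) inversion,
// and the final lookups through v29/v30 map the result back into the
// SM4 S-box output domain.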
mov v4.s[0],w8
tbl v0.16b, {v4.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
mov w7,v0.s[0]
eor w8,w7,w7,ror #19
eor w8,w8,w7,ror #9
mov w7,v5.s[0]
eor w8,w8,w7
mov v5.s[0],w8
cbz w2,2f
str w8,[x1],#4
b 3f
2:
str w8,[x1],#-4
3:
tbl v5.16b,{v5.16b},v7.16b
subs x6,x6,#1
b.ne 1b
ret
.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key
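// _vpsm4_ex_enc_4blks runs the 32 SM4 rounds over four blocks at once.
// Callers deinterleave the input with ld4 so that v4..v7 each hold one
// 32-bit word of all four blocks; every eor/tbl/aese below therefore
// advances four blocks per instruction. x3 carries the round-key
// pointer, copied to x10 so the loop can post-increment it.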
.type _vpsm4_ex_enc_4blks,%function
.align 4
_vpsm4_ex_enc_4blks:
AARCH64_VALID_CALL_TARGET
mov x10,x3
mov w11,#8
10:
ldp w7,w8,[x10],8
dup v12.4s,w7
dup v13.4s,w8

// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor v14.16b,v6.16b,v7.16b
eor v12.16b,v5.16b,v12.16b
eor v12.16b,v14.16b,v12.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v12.16b,v0.16b

// linear transformation
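// SM4's linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^
// (B <<< 18) ^ (B <<< 24); ASIMD has no vector rotate, so each
// rotation is assembled from a ushr/sli pair.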
ushr v0.4s,v12.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
eor v4.16b,v4.16b,v12.16b

// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor v14.16b,v14.16b,v4.16b
eor v13.16b,v14.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v13.16b,v0.16b

// linear transformation
ushr v0.4s,v13.4s,32-2
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v0.4s,v13.4s,2
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v0.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
ldp w7,w8,[x10],8
eor v5.16b,v5.16b,v13.16b

dup v12.4s,w7
dup v13.4s,w8

// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor v14.16b,v4.16b,v5.16b
eor v12.16b,v7.16b,v12.16b
eor v12.16b,v14.16b,v12.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v12.16b,v0.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
eor v6.16b,v6.16b,v12.16b

// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor v14.16b,v14.16b,v6.16b
eor v13.16b,v14.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
mov v13.16b,v0.16b

// linear transformation
ushr v0.4s,v13.4s,32-2
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v0.4s,v13.4s,2
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v0.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v7.16b,v7.16b,v13.16b
subs w11,w11,#1
b.ne 10b
#ifndef __AARCH64EB__
rev32 v3.16b,v4.16b
#else
mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v2.16b,v5.16b
#else
mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v1.16b,v6.16b
#else
mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v0.16b,v7.16b
#else
mov v0.16b,v7.16b
#endif
ret
.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks
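// _vpsm4_ex_enc_8blks is the eight-block variant: v4..v7 carry the
// words of blocks 0-3 and v8..v11 those of blocks 4-7, so each pair of
// round keys loaded with ldp is amortized over eight blocks.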
.type _vpsm4_ex_enc_8blks,%function
.align 4
_vpsm4_ex_enc_8blks:
AARCH64_VALID_CALL_TARGET
mov x10,x3
mov w11,#8
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
dup v12.4s,w7
eor v14.16b,v6.16b,v7.16b
eor v15.16b,v10.16b,v11.16b
eor v0.16b,v5.16b,v12.16b
eor v1.16b,v9.16b,v12.16b
eor v12.16b,v14.16b,v0.16b
eor v13.16b,v15.16b,v1.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v4.16b,v4.16b,v12.16b
eor v8.16b,v8.16b,v13.16b

// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
dup v13.4s,w8
eor v14.16b,v14.16b,v4.16b
eor v15.16b,v15.16b,v8.16b
eor v12.16b,v14.16b,v13.16b
eor v13.16b,v15.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
ldp w7,w8,[x10],8
eor v5.16b,v5.16b,v12.16b
eor v9.16b,v9.16b,v13.16b

// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
dup v12.4s,w7
eor v14.16b,v4.16b,v5.16b
eor v15.16b,v8.16b,v9.16b
eor v0.16b,v7.16b,v12.16b
eor v1.16b,v11.16b,v12.16b
eor v12.16b,v14.16b,v0.16b
eor v13.16b,v15.16b,v1.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v6.16b,v6.16b,v12.16b
eor v10.16b,v10.16b,v13.16b

// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
dup v13.4s,w8
eor v14.16b,v14.16b,v6.16b
eor v15.16b,v15.16b,v10.16b
eor v12.16b,v14.16b,v13.16b
eor v13.16b,v15.16b,v13.16b
// optimize sbox using AESE instruction
tbl v0.16b, {v12.16b}, v26.16b
tbl v1.16b, {v13.16b}, v26.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v28.16b}, v1.16b
tbl v24.16b, {v27.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
eor v25.16b, v25.16b, v25.16b
aese v0.16b,v25.16b
aese v1.16b,v25.16b
ushr v24.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v0.16b, v0.16b, v24.16b
ushr v24.16b, v1.16b, 4
and v1.16b, v1.16b, v31.16b
tbl v1.16b, {v30.16b}, v1.16b
tbl v24.16b, {v29.16b}, v24.16b
eor v1.16b, v1.16b, v24.16b
mov v12.16b,v0.16b
mov v13.16b,v1.16b

// linear transformation
ushr v0.4s,v12.4s,32-2
ushr v25.4s,v13.4s,32-2
ushr v1.4s,v12.4s,32-10
ushr v2.4s,v12.4s,32-18
ushr v3.4s,v12.4s,32-24
sli v0.4s,v12.4s,2
sli v25.4s,v13.4s,2
sli v1.4s,v12.4s,10
sli v2.4s,v12.4s,18
sli v3.4s,v12.4s,24
eor v24.16b,v0.16b,v12.16b
eor v24.16b,v24.16b,v1.16b
eor v12.16b,v2.16b,v3.16b
eor v12.16b,v12.16b,v24.16b
ushr v1.4s,v13.4s,32-10
ushr v2.4s,v13.4s,32-18
ushr v3.4s,v13.4s,32-24
sli v1.4s,v13.4s,10
sli v2.4s,v13.4s,18
sli v3.4s,v13.4s,24
eor v24.16b,v25.16b,v13.16b
eor v24.16b,v24.16b,v1.16b
eor v13.16b,v2.16b,v3.16b
eor v13.16b,v13.16b,v24.16b
eor v7.16b,v7.16b,v12.16b
eor v11.16b,v11.16b,v13.16b
subs w11,w11,#1
b.ne 10b
#ifndef __AARCH64EB__
rev32 v3.16b,v4.16b
#else
mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v2.16b,v5.16b
#else
mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v1.16b,v6.16b
#else
mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v0.16b,v7.16b
#else
mov v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v8.16b
#else
mov v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v9.16b
#else
mov v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v10.16b
#else
mov v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v4.16b,v11.16b
#else
mov v4.16b,v11.16b
#endif
ret
.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks
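// The public key-setup entry points differ only in the flag passed in
// w2 (1: store round keys forward for encryption, 0: reversed for
// decryption) before calling the shared _vpsm4_ex_set_key.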
.globl vpsm4_ex_set_encrypt_key
.type vpsm4_ex_set_encrypt_key,%function
.align 5
vpsm4_ex_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,1
bl _vpsm4_ex_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key
.globl vpsm4_ex_set_decrypt_key
.type vpsm4_ex_set_decrypt_key,%function
.align 5
vpsm4_ex_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,0
bl _vpsm4_ex_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key
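// vpsm4_ex_encrypt processes a single 16-byte block, keeping the four
// state words in w12..w15 and touching the vector unit only for the
// AESE-based S-box. The final stores in reverse word order implement
// the SM4 output reversal R(A0,A1,A2,A3) = (A3,A2,A1,A0).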
.globl vpsm4_ex_encrypt
.type vpsm4_ex_encrypt,%function
.align 5
vpsm4_ex_encrypt:
AARCH64_VALID_CALL_TARGET
ld1 {v4.4s},[x0]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x3,x2
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
ret
.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt
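// vpsm4_ex_decrypt is identical to vpsm4_ex_encrypt: SM4 decryption
// uses the same 32 rounds, and vpsm4_ex_set_decrypt_key has already
// stored the round keys in reverse order.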
.globl vpsm4_ex_decrypt
.type vpsm4_ex_decrypt,%function
.align 5
vpsm4_ex_decrypt:
AARCH64_VALID_CALL_TARGET
ld1 {v4.4s},[x0]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x3,x2
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
ret
.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt
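// vpsm4_ex_ecb_encrypt (x0: in, x1: out, x2: length in bytes, x3:
// round keys, as used below): batches of 8 and 4 blocks go through the
// parallel routines; a 1-3 block tail is handled with the scalar round
// loop or with partial-lane ld4/st4 accesses.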
.globl vpsm4_ex_ecb_encrypt
.type vpsm4_ex_ecb_encrypt,%function
.align 5
vpsm4_ex_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
// convert length into blocks
lsr x2,x2,4
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
.Lecb_8_blocks_process:
cmp w2,#8
b.lt .Lecb_4_blocks_process
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
bl _vpsm4_ex_enc_8blks
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#8
b.gt .Lecb_8_blocks_process
b 100f
.Lecb_4_blocks_process:
cmp w2,#4
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
sub w2,w2,#4
1:
// process last block
cmp w2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
b 100f
1: // process last 2 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
cmp w2,#2
b.gt 1f
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
b 100f
1: // process last 3 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt
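// vpsm4_ex_cbc_encrypt (w5 selects direction: non-zero encrypts).
// CBC encryption is inherently serial, since each plaintext block is
// XORed with the previous ciphertext, so it runs block by block in
// scalar registers. CBC decryption (.Ldec below) has no such chain
// dependency: blocks are decrypted 8 or 4 at a time and the previous
// ciphertexts are XORed in afterwards, with the IV saved back via x4.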
.globl vpsm4_ex_cbc_encrypt
.type vpsm4_ex_cbc_encrypt,%function
.align 5
vpsm4_ex_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
lsr x2,x2,4
adrp x9, .Lsbox_magic
ldr q26, [x9, #:lo12:.Lsbox_magic]
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
cbz w5,.Ldec
ld1 {v3.4s},[x4]
.Lcbc_4_blocks_enc:
cmp w2,#4
b.lt 1f
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
eor v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
eor v5.16b,v5.16b,v4.16b
mov x10,x3
mov w11,#8
mov w12,v5.s[0]
mov w13,v5.s[1]
mov w14,v5.s[2]
mov w15,v5.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v5.s[0],w15
mov v5.s[1],w14
mov v5.s[2],w13
mov v5.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v6.16b,v6.16b,v5.16b
mov x10,x3
mov w11,#8
mov w12,v6.s[0]
mov w13,v6.s[1]
mov w14,v6.s[2]
mov w15,v6.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v6.s[0],w15
mov v6.s[1],w14
mov v6.s[2],w13
mov v6.s[3],w12
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
eor v7.16b,v7.16b,v6.16b
mov x10,x3
mov w11,#8
mov w12,v7.s[0]
mov w13,v7.s[1]
mov w14,v7.s[2]
mov w15,v7.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v7.s[0],w15
mov v7.s[1],w14
mov v7.s[2],w13
mov v7.s[3],w12
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
orr v3.16b,v7.16b,v7.16b
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#4
b.ne .Lcbc_4_blocks_enc
b 2f
1:
subs w2,w2,#1
b.lt 2f
ld1 {v4.4s},[x0],#16
eor v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v3.s[0]
mov w13,v3.s[1]
mov w14,v3.s[2]
mov w15,v3.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v3.s[0],w15
mov v3.s[1],w14
mov v3.s[2],w13
mov v3.s[3],w12
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
st1 {v3.4s},[x1],#16
b 1b
2:
// save back IV
st1 {v3.4s},[x4]
ret

.Ldec:
// decryption mode starts
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
cmp w2,#8
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
add x10,x0,#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
bl _vpsm4_ex_enc_8blks
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
zip1 v8.4s,v4.4s,v5.4s
zip2 v9.4s,v4.4s,v5.4s
zip1 v10.4s,v6.4s,v7.4s
zip2 v11.4s,v6.4s,v7.4s
zip1 v4.2d,v8.2d,v10.2d
zip2 v5.2d,v8.2d,v10.2d
zip1 v6.2d,v9.2d,v11.2d
zip2 v7.2d,v9.2d,v11.2d
ld1 {v15.4s},[x4]
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
// note ivec1 and vtmpx[3] are reusing the same register
// care needs to be taken to avoid conflict
eor v0.16b,v0.16b,v15.16b
ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
eor v1.16b,v1.16b,v8.16b
eor v2.16b,v2.16b,v9.16b
eor v3.16b,v3.16b,v10.16b
// save back IV
st1 {v15.4s}, [x4]
eor v4.16b,v4.16b,v11.16b
eor v5.16b,v5.16b,v12.16b
eor v6.16b,v6.16b,v13.16b
eor v7.16b,v7.16b,v14.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#8
b.gt .Lcbc_8_blocks_dec
b.eq 100f
1:
ld1 {v15.4s},[x4]
.Lcbc_4_blocks_dec:
cmp w2,#4
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
eor v0.16b,v0.16b,v15.16b
eor v1.16b,v1.16b,v4.16b
orr v15.16b,v7.16b,v7.16b
eor v2.16b,v2.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
subs w2,w2,#4
b.gt .Lcbc_4_blocks_dec
// save back IV
st1 {v7.4s}, [x4]
b 100f
1: // last block
subs w2,w2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0],#16
// save back IV
st1 {v4.4s}, [x4]
#ifndef __AARCH64EB__
rev32 v8.16b,v4.16b
#else
mov v8.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v8.s[0]
mov w13,v8.s[1]
mov w14,v8.s[2]
mov w15,v8.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v8.s[0],w15
mov v8.s[1],w14
mov v8.s[2],w13
mov v8.s[3],w12
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
eor v8.16b,v8.16b,v15.16b
st1 {v8.4s},[x1],#16
b 100f
1: // last two blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
add x10,x0,#16
ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
subs w2,w2,1
b.gt 1f
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
ld1 {v4.4s,v5.4s},[x0],#32
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
eor v0.16b,v0.16b,v15.16b
eor v1.16b,v1.16b,v4.16b
st1 {v0.4s,v1.4s},[x1],#32
// save back IV
st1 {v5.4s}, [x4]
b 100f
1: // last 3 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_ex_enc_4blks
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
zip1 v8.4s,v0.4s,v1.4s
zip2 v9.4s,v0.4s,v1.4s
zip1 v10.4s,v2.4s,v3.4s
zip2 v11.4s,v2.4s,v3.4s
zip1 v0.2d,v8.2d,v10.2d
zip2 v1.2d,v8.2d,v10.2d
zip1 v2.2d,v9.2d,v11.2d
zip2 v3.2d,v9.2d,v11.2d
eor v0.16b,v0.16b,v15.16b
eor v1.16b,v1.16b,v4.16b
eor v2.16b,v2.16b,v5.16b
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
// save back IV
st1 {v6.4s}, [x4]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt
2163
.globl vpsm4_ex_ctr32_encrypt_blocks
2164
.type vpsm4_ex_ctr32_encrypt_blocks,%function
2165
.align 5
2166
vpsm4_ex_ctr32_encrypt_blocks:
2167
AARCH64_VALID_CALL_TARGET
2168
ld1 {v3.4s},[x4]
2169
#ifndef __AARCH64EB__
2170
rev32 v3.16b,v3.16b
2171
#endif
2172
adrp x9, .Lsbox_magic
2173
ldr q26, [x9, #:lo12:.Lsbox_magic]
2174
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
2175
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
2176
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
2177
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
2178
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
2179
cmp w2,#1
2180
b.ne 1f
2181
// fast processing for one single block without
2182
// context saving overhead
2183
mov x10,x3
2184
mov w11,#8
2185
mov w12,v3.s[0]
2186
mov w13,v3.s[1]
2187
mov w14,v3.s[2]
2188
mov w15,v3.s[3]
2189
10:
2190
ldp w7,w8,[x10],8
2191
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2192
eor w6,w14,w15
2193
eor w9,w7,w13
2194
eor w6,w6,w9
2195
mov v3.s[0],w6
2196
// optimize sbox using AESE instruction
2197
tbl v0.16b, {v3.16b}, v26.16b
2198
ushr v2.16b, v0.16b, 4
2199
and v0.16b, v0.16b, v31.16b
2200
tbl v0.16b, {v28.16b}, v0.16b
2201
tbl v2.16b, {v27.16b}, v2.16b
2202
eor v0.16b, v0.16b, v2.16b
2203
eor v1.16b, v1.16b, v1.16b
2204
aese v0.16b,v1.16b
2205
ushr v2.16b, v0.16b, 4
2206
and v0.16b, v0.16b, v31.16b
2207
tbl v0.16b, {v30.16b}, v0.16b
2208
tbl v2.16b, {v29.16b}, v2.16b
2209
eor v0.16b, v0.16b, v2.16b
2210
2211
mov w7,v0.s[0]
2212
eor w6,w7,w7,ror #32-2
2213
eor w6,w6,w7,ror #32-10
2214
eor w6,w6,w7,ror #32-18
2215
eor w6,w6,w7,ror #32-24
2216
eor w12,w12,w6
2217
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2218
eor w6,w14,w15
2219
eor w9,w12,w8
2220
eor w6,w6,w9
2221
mov v3.s[0],w6
2222
// optimize sbox using AESE instruction
2223
tbl v0.16b, {v3.16b}, v26.16b
2224
ushr v2.16b, v0.16b, 4
2225
and v0.16b, v0.16b, v31.16b
2226
tbl v0.16b, {v28.16b}, v0.16b
2227
tbl v2.16b, {v27.16b}, v2.16b
2228
eor v0.16b, v0.16b, v2.16b
2229
eor v1.16b, v1.16b, v1.16b
2230
aese v0.16b,v1.16b
2231
ushr v2.16b, v0.16b, 4
2232
and v0.16b, v0.16b, v31.16b
2233
tbl v0.16b, {v30.16b}, v0.16b
2234
tbl v2.16b, {v29.16b}, v2.16b
2235
eor v0.16b, v0.16b, v2.16b
2236
2237
mov w7,v0.s[0]
2238
eor w6,w7,w7,ror #32-2
2239
eor w6,w6,w7,ror #32-10
2240
eor w6,w6,w7,ror #32-18
2241
eor w6,w6,w7,ror #32-24
2242
ldp w7,w8,[x10],8
2243
eor w13,w13,w6
2244
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2245
eor w6,w12,w13
2246
eor w9,w7,w15
2247
eor w6,w6,w9
2248
mov v3.s[0],w6
2249
// optimize sbox using AESE instruction
2250
tbl v0.16b, {v3.16b}, v26.16b
2251
ushr v2.16b, v0.16b, 4
2252
and v0.16b, v0.16b, v31.16b
2253
tbl v0.16b, {v28.16b}, v0.16b
2254
tbl v2.16b, {v27.16b}, v2.16b
2255
eor v0.16b, v0.16b, v2.16b
2256
eor v1.16b, v1.16b, v1.16b
2257
aese v0.16b,v1.16b
2258
ushr v2.16b, v0.16b, 4
2259
and v0.16b, v0.16b, v31.16b
2260
tbl v0.16b, {v30.16b}, v0.16b
2261
tbl v2.16b, {v29.16b}, v2.16b
2262
eor v0.16b, v0.16b, v2.16b
2263
2264
mov w7,v0.s[0]
2265
eor w6,w7,w7,ror #32-2
2266
eor w6,w6,w7,ror #32-10
2267
eor w6,w6,w7,ror #32-18
2268
eor w6,w6,w7,ror #32-24
2269
eor w14,w14,w6
2270
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2271
eor w6,w12,w13
2272
eor w9,w14,w8
2273
eor w6,w6,w9
2274
mov v3.s[0],w6
2275
// optimize sbox using AESE instruction
2276
tbl v0.16b, {v3.16b}, v26.16b
2277
ushr v2.16b, v0.16b, 4
2278
and v0.16b, v0.16b, v31.16b
2279
tbl v0.16b, {v28.16b}, v0.16b
2280
tbl v2.16b, {v27.16b}, v2.16b
2281
eor v0.16b, v0.16b, v2.16b
2282
eor v1.16b, v1.16b, v1.16b
2283
aese v0.16b,v1.16b
2284
ushr v2.16b, v0.16b, 4
2285
and v0.16b, v0.16b, v31.16b
2286
tbl v0.16b, {v30.16b}, v0.16b
2287
tbl v2.16b, {v29.16b}, v2.16b
2288
eor v0.16b, v0.16b, v2.16b
2289
2290
mov w7,v0.s[0]
2291
eor w6,w7,w7,ror #32-2
2292
eor w6,w6,w7,ror #32-10
2293
eor w6,w6,w7,ror #32-18
2294
eor w6,w6,w7,ror #32-24
2295
eor w15,w15,w6
2296
subs w11,w11,#1
2297
b.ne 10b
2298
mov v3.s[0],w15
2299
mov v3.s[1],w14
2300
mov v3.s[2],w13
2301
mov v3.s[3],w12
2302
#ifndef __AARCH64EB__
2303
rev32 v3.16b,v3.16b
2304
#endif
2305
ld1 {v4.4s},[x0]
2306
eor v4.16b,v4.16b,v3.16b
2307
st1 {v4.4s},[x1]
2308
ret
2309
1:
2310
AARCH64_SIGN_LINK_REGISTER
2311
stp d8,d9,[sp,#-80]!
2312
stp d10,d11,[sp,#16]
2313
stp d12,d13,[sp,#32]
2314
stp d14,d15,[sp,#48]
2315
stp x29,x30,[sp,#64]
2316
mov w12,v3.s[0]
2317
mov w13,v3.s[1]
2318
mov w14,v3.s[2]
2319
mov w5,v3.s[3]
2320
.Lctr32_4_blocks_process:
2321
cmp w2,#4
2322
b.lt 1f
2323
dup v4.4s,w12
2324
dup v5.4s,w13
2325
dup v6.4s,w14
2326
mov v7.s[0],w5
2327
add w5,w5,#1
2328
mov v7.s[1],w5
2329
add w5,w5,#1
2330
mov v7.s[2],w5
2331
add w5,w5,#1
2332
mov v7.s[3],w5
2333
add w5,w5,#1
2334
cmp w2,#8
2335
b.ge .Lctr32_8_blocks_process
2336
bl _vpsm4_ex_enc_4blks
2337
ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2338
eor v0.16b,v0.16b,v12.16b
2339
eor v1.16b,v1.16b,v13.16b
2340
eor v2.16b,v2.16b,v14.16b
2341
eor v3.16b,v3.16b,v15.16b
2342
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2343
subs w2,w2,#4
2344
b.ne .Lctr32_4_blocks_process
2345
b 100f
2346
.Lctr32_8_blocks_process:
2347
dup v8.4s,w12
2348
dup v9.4s,w13
2349
dup v10.4s,w14
2350
mov v11.s[0],w5
2351
add w5,w5,#1
2352
mov v11.s[1],w5
2353
add w5,w5,#1
2354
mov v11.s[2],w5
2355
add w5,w5,#1
2356
mov v11.s[3],w5
2357
add w5,w5,#1
2358
bl _vpsm4_ex_enc_8blks
2359
ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2360
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2361
eor v0.16b,v0.16b,v12.16b
2362
eor v1.16b,v1.16b,v13.16b
2363
eor v2.16b,v2.16b,v14.16b
2364
eor v3.16b,v3.16b,v15.16b
2365
eor v4.16b,v4.16b,v8.16b
2366
eor v5.16b,v5.16b,v9.16b
2367
eor v6.16b,v6.16b,v10.16b
2368
eor v7.16b,v7.16b,v11.16b
2369
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2370
st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2371
subs w2,w2,#8
2372
b.ne .Lctr32_4_blocks_process
2373
b 100f
2374
1: // last block processing
2375
subs w2,w2,#1
2376
b.lt 100f
2377
b.gt 1f
2378
mov v3.s[0],w12
2379
mov v3.s[1],w13
2380
mov v3.s[2],w14
2381
mov v3.s[3],w5
2382
mov x10,x3
2383
mov w11,#8
2384
mov w12,v3.s[0]
2385
mov w13,v3.s[1]
2386
mov w14,v3.s[2]
2387
mov w15,v3.s[3]
2388
10:
2389
ldp w7,w8,[x10],8
2390
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2391
eor w6,w14,w15
2392
eor w9,w7,w13
2393
eor w6,w6,w9
2394
mov v3.s[0],w6
2395
// optimize sbox using AESE instruction
2396
tbl v0.16b, {v3.16b}, v26.16b
2397
ushr v2.16b, v0.16b, 4
2398
and v0.16b, v0.16b, v31.16b
2399
tbl v0.16b, {v28.16b}, v0.16b
2400
tbl v2.16b, {v27.16b}, v2.16b
2401
eor v0.16b, v0.16b, v2.16b
2402
eor v1.16b, v1.16b, v1.16b
2403
aese v0.16b,v1.16b
2404
ushr v2.16b, v0.16b, 4
2405
and v0.16b, v0.16b, v31.16b
2406
tbl v0.16b, {v30.16b}, v0.16b
2407
tbl v2.16b, {v29.16b}, v2.16b
2408
eor v0.16b, v0.16b, v2.16b
2409
2410
mov w7,v0.s[0]
2411
eor w6,w7,w7,ror #32-2
2412
eor w6,w6,w7,ror #32-10
2413
eor w6,w6,w7,ror #32-18
2414
eor w6,w6,w7,ror #32-24
2415
eor w12,w12,w6
2416
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2417
eor w6,w14,w15
2418
eor w9,w12,w8
2419
eor w6,w6,w9
2420
mov v3.s[0],w6
2421
// optimize sbox using AESE instruction
2422
tbl v0.16b, {v3.16b}, v26.16b
2423
ushr v2.16b, v0.16b, 4
2424
and v0.16b, v0.16b, v31.16b
2425
tbl v0.16b, {v28.16b}, v0.16b
2426
tbl v2.16b, {v27.16b}, v2.16b
2427
eor v0.16b, v0.16b, v2.16b
2428
eor v1.16b, v1.16b, v1.16b
2429
aese v0.16b,v1.16b
2430
ushr v2.16b, v0.16b, 4
2431
and v0.16b, v0.16b, v31.16b
2432
tbl v0.16b, {v30.16b}, v0.16b
2433
tbl v2.16b, {v29.16b}, v2.16b
2434
eor v0.16b, v0.16b, v2.16b
2435
2436
mov w7,v0.s[0]
2437
eor w6,w7,w7,ror #32-2
2438
eor w6,w6,w7,ror #32-10
2439
eor w6,w6,w7,ror #32-18
2440
eor w6,w6,w7,ror #32-24
2441
ldp w7,w8,[x10],8
2442
eor w13,w13,w6
2443
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2444
eor w6,w12,w13
2445
eor w9,w7,w15
2446
eor w6,w6,w9
2447
mov v3.s[0],w6
2448
// optimize sbox using AESE instruction
2449
tbl v0.16b, {v3.16b}, v26.16b
2450
ushr v2.16b, v0.16b, 4
2451
and v0.16b, v0.16b, v31.16b
2452
tbl v0.16b, {v28.16b}, v0.16b
2453
tbl v2.16b, {v27.16b}, v2.16b
2454
eor v0.16b, v0.16b, v2.16b
2455
eor v1.16b, v1.16b, v1.16b
2456
aese v0.16b,v1.16b
2457
ushr v2.16b, v0.16b, 4
2458
and v0.16b, v0.16b, v31.16b
2459
tbl v0.16b, {v30.16b}, v0.16b
2460
tbl v2.16b, {v29.16b}, v2.16b
2461
eor v0.16b, v0.16b, v2.16b
2462
2463
mov w7,v0.s[0]
2464
eor w6,w7,w7,ror #32-2
2465
eor w6,w6,w7,ror #32-10
2466
eor w6,w6,w7,ror #32-18
2467
eor w6,w6,w7,ror #32-24
2468
eor w14,w14,w6
2469
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2470
eor w6,w12,w13
2471
eor w9,w14,w8
2472
eor w6,w6,w9
2473
mov v3.s[0],w6
2474
// optimize sbox using AESE instruction
2475
tbl v0.16b, {v3.16b}, v26.16b
2476
ushr v2.16b, v0.16b, 4
2477
and v0.16b, v0.16b, v31.16b
2478
tbl v0.16b, {v28.16b}, v0.16b
2479
tbl v2.16b, {v27.16b}, v2.16b
2480
eor v0.16b, v0.16b, v2.16b
2481
eor v1.16b, v1.16b, v1.16b
2482
aese v0.16b,v1.16b
2483
ushr v2.16b, v0.16b, 4
2484
and v0.16b, v0.16b, v31.16b
2485
tbl v0.16b, {v30.16b}, v0.16b
2486
tbl v2.16b, {v29.16b}, v2.16b
2487
eor v0.16b, v0.16b, v2.16b
2488
2489
mov w7,v0.s[0]
2490
eor w6,w7,w7,ror #32-2
2491
eor w6,w6,w7,ror #32-10
2492
eor w6,w6,w7,ror #32-18
2493
eor w6,w6,w7,ror #32-24
2494
eor w15,w15,w6
2495
subs w11,w11,#1
2496
b.ne 10b
2497
mov v3.s[0],w15
2498
mov v3.s[1],w14
2499
mov v3.s[2],w13
2500
mov v3.s[3],w12
2501
#ifndef __AARCH64EB__
2502
rev32 v3.16b,v3.16b
2503
#endif
2504
ld1 {v4.4s},[x0]
2505
eor v4.16b,v4.16b,v3.16b
2506
st1 {v4.4s},[x1]
2507
b 100f
2508
1: // last 2 blocks processing
2509
dup v4.4s,w12
2510
dup v5.4s,w13
2511
dup v6.4s,w14
2512
mov v7.s[0],w5
2513
add w5,w5,#1
2514
mov v7.s[1],w5
2515
subs w2,w2,#1
2516
b.ne 1f
2517
bl _vpsm4_ex_enc_4blks
2518
ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2519
ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2520
eor v0.16b,v0.16b,v12.16b
2521
eor v1.16b,v1.16b,v13.16b
2522
eor v2.16b,v2.16b,v14.16b
2523
eor v3.16b,v3.16b,v15.16b
2524
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2525
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2526
b 100f
2527
1: // last 3 blocks processing
2528
add w5,w5,#1
2529
mov v7.s[2],w5
2530
bl _vpsm4_ex_enc_4blks
2531
ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2532
ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2533
ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
2534
eor v0.16b,v0.16b,v12.16b
2535
eor v1.16b,v1.16b,v13.16b
2536
eor v2.16b,v2.16b,v14.16b
2537
eor v3.16b,v3.16b,v15.16b
2538
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2539
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2540
st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
2541
100:
2542
ldp d10,d11,[sp,#16]
2543
ldp d12,d13,[sp,#32]
2544
ldp d14,d15,[sp,#48]
2545
ldp x29,x30,[sp,#64]
2546
ldp d8,d9,[sp],#80
2547
AARCH64_VALIDATE_LINK_REGISTER
2548
ret
2549
.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
2550
.globl vpsm4_ex_xts_encrypt_gb
2551
.type vpsm4_ex_xts_encrypt_gb,%function
2552
.align 5
2553
vpsm4_ex_xts_encrypt_gb:
2554
AARCH64_SIGN_LINK_REGISTER
2555
stp x15, x16, [sp, #-0x10]!
2556
stp x17, x18, [sp, #-0x10]!
2557
stp x19, x20, [sp, #-0x10]!
2558
stp x21, x22, [sp, #-0x10]!
2559
stp x23, x24, [sp, #-0x10]!
2560
stp x25, x26, [sp, #-0x10]!
2561
stp x27, x28, [sp, #-0x10]!
2562
stp x29, x30, [sp, #-0x10]!
2563
stp d8, d9, [sp, #-0x10]!
2564
stp d10, d11, [sp, #-0x10]!
2565
stp d12, d13, [sp, #-0x10]!
2566
stp d14, d15, [sp, #-0x10]!
2567
mov x26,x3
2568
mov x27,x4
2569
mov w28,w6
2570
ld1 {v16.4s}, [x5]
2571
mov x3,x27
2572
adrp x9, .Lsbox_magic
2573
ldr q26, [x9, #:lo12:.Lsbox_magic]
2574
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
2575
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
2576
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
2577
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
2578
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
2579
#ifndef __AARCH64EB__
2580
rev32 v16.16b,v16.16b
2581
#endif
2582
mov x10,x3
2583
mov w11,#8
2584
mov w12,v16.s[0]
2585
mov w13,v16.s[1]
2586
mov w14,v16.s[2]
2587
mov w15,v16.s[3]
2588
10:
2589
ldp w7,w8,[x10],8
2590
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2591
eor w6,w14,w15
2592
eor w9,w7,w13
2593
eor w6,w6,w9
2594
mov v3.s[0],w6
2595
// optimize sbox using AESE instruction
2596
tbl v0.16b, {v3.16b}, v26.16b
2597
ushr v2.16b, v0.16b, 4
2598
and v0.16b, v0.16b, v31.16b
2599
tbl v0.16b, {v28.16b}, v0.16b
2600
tbl v2.16b, {v27.16b}, v2.16b
2601
eor v0.16b, v0.16b, v2.16b
2602
eor v1.16b, v1.16b, v1.16b
2603
aese v0.16b,v1.16b
2604
ushr v2.16b, v0.16b, 4
2605
and v0.16b, v0.16b, v31.16b
2606
tbl v0.16b, {v30.16b}, v0.16b
2607
tbl v2.16b, {v29.16b}, v2.16b
2608
eor v0.16b, v0.16b, v2.16b
2609
2610
mov w7,v0.s[0]
2611
eor w6,w7,w7,ror #32-2
2612
eor w6,w6,w7,ror #32-10
2613
eor w6,w6,w7,ror #32-18
2614
eor w6,w6,w7,ror #32-24
2615
eor w12,w12,w6
2616
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2617
eor w6,w14,w15
2618
eor w9,w12,w8
2619
eor w6,w6,w9
2620
mov v3.s[0],w6
2621
// optimize sbox using AESE instruction
2622
tbl v0.16b, {v3.16b}, v26.16b
2623
ushr v2.16b, v0.16b, 4
2624
and v0.16b, v0.16b, v31.16b
2625
tbl v0.16b, {v28.16b}, v0.16b
2626
tbl v2.16b, {v27.16b}, v2.16b
2627
eor v0.16b, v0.16b, v2.16b
2628
eor v1.16b, v1.16b, v1.16b
2629
aese v0.16b,v1.16b
2630
ushr v2.16b, v0.16b, 4
2631
and v0.16b, v0.16b, v31.16b
2632
tbl v0.16b, {v30.16b}, v0.16b
2633
tbl v2.16b, {v29.16b}, v2.16b
2634
eor v0.16b, v0.16b, v2.16b
2635
2636
mov w7,v0.s[0]
2637
eor w6,w7,w7,ror #32-2
2638
eor w6,w6,w7,ror #32-10
2639
eor w6,w6,w7,ror #32-18
2640
eor w6,w6,w7,ror #32-24
2641
ldp w7,w8,[x10],8
2642
eor w13,w13,w6
2643
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2644
eor w6,w12,w13
2645
eor w9,w7,w15
2646
eor w6,w6,w9
2647
mov v3.s[0],w6
2648
// optimize sbox using AESE instruction
2649
tbl v0.16b, {v3.16b}, v26.16b
2650
ushr v2.16b, v0.16b, 4
2651
and v0.16b, v0.16b, v31.16b
2652
tbl v0.16b, {v28.16b}, v0.16b
2653
tbl v2.16b, {v27.16b}, v2.16b
2654
eor v0.16b, v0.16b, v2.16b
2655
eor v1.16b, v1.16b, v1.16b
2656
aese v0.16b,v1.16b
2657
ushr v2.16b, v0.16b, 4
2658
and v0.16b, v0.16b, v31.16b
2659
tbl v0.16b, {v30.16b}, v0.16b
2660
tbl v2.16b, {v29.16b}, v2.16b
2661
eor v0.16b, v0.16b, v2.16b
2662
2663
mov w7,v0.s[0]
2664
eor w6,w7,w7,ror #32-2
2665
eor w6,w6,w7,ror #32-10
2666
eor w6,w6,w7,ror #32-18
2667
eor w6,w6,w7,ror #32-24
2668
eor w14,w14,w6
2669
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2670
eor w6,w12,w13
2671
eor w9,w14,w8
2672
eor w6,w6,w9
2673
mov v3.s[0],w6
2674
// optimize sbox using AESE instruction
2675
tbl v0.16b, {v3.16b}, v26.16b
2676
ushr v2.16b, v0.16b, 4
2677
and v0.16b, v0.16b, v31.16b
2678
tbl v0.16b, {v28.16b}, v0.16b
2679
tbl v2.16b, {v27.16b}, v2.16b
2680
eor v0.16b, v0.16b, v2.16b
2681
eor v1.16b, v1.16b, v1.16b
2682
aese v0.16b,v1.16b
2683
ushr v2.16b, v0.16b, 4
2684
and v0.16b, v0.16b, v31.16b
2685
tbl v0.16b, {v30.16b}, v0.16b
2686
tbl v2.16b, {v29.16b}, v2.16b
2687
eor v0.16b, v0.16b, v2.16b
2688
2689
mov w7,v0.s[0]
2690
eor w6,w7,w7,ror #32-2
2691
eor w6,w6,w7,ror #32-10
2692
eor w6,w6,w7,ror #32-18
2693
eor w6,w6,w7,ror #32-24
2694
eor w15,w15,w6
2695
subs w11,w11,#1
2696
b.ne 10b
2697
mov v16.s[0],w15
2698
mov v16.s[1],w14
2699
mov v16.s[2],w13
2700
mov v16.s[3],w12
2701
#ifndef __AARCH64EB__
2702
rev32 v16.16b,v16.16b
2703
#endif
2704
mov x3,x26
2705
and x29,x2,#0x0F
2706
// convert length into blocks
2707
lsr x2,x2,4
2708
cmp x2,#1
2709
b.lt .return_gb
2710
2711
cmp x29,0
2712
// If the encryption/decryption Length is N times of 16,
2713
// the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2714
b.eq .xts_encrypt_blocks_gb
2715
2716
// If the encryption/decryption length is not N times of 16,
2717
// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb
2718
// the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2719
subs x2,x2,#1
2720
b.eq .only_2blks_tweak_gb
2721
.xts_encrypt_blocks_gb:
2722
rbit v16.16b,v16.16b
2723
#ifdef __AARCH64EB__
2724
rev32 v16.16b,v16.16b
2725
#endif
2726
mov x12,v16.d[0]
2727
mov x13,v16.d[1]
2728
mov w7,0x87
2729
extr x9,x13,x13,#32
2730
extr x15,x13,x12,#63
2731
and w8,w7,w9,asr#31
2732
eor x14,x8,x12,lsl#1
2733
mov w7,0x87
2734
extr x9,x15,x15,#32
2735
extr x17,x15,x14,#63
2736
and w8,w7,w9,asr#31
2737
eor x16,x8,x14,lsl#1
2738
mov w7,0x87
2739
extr x9,x17,x17,#32
2740
extr x19,x17,x16,#63
2741
and w8,w7,w9,asr#31
2742
eor x18,x8,x16,lsl#1
2743
mov w7,0x87
2744
extr x9,x19,x19,#32
2745
extr x21,x19,x18,#63
2746
and w8,w7,w9,asr#31
2747
eor x20,x8,x18,lsl#1
2748
mov w7,0x87
2749
extr x9,x21,x21,#32
2750
extr x23,x21,x20,#63
2751
and w8,w7,w9,asr#31
2752
eor x22,x8,x20,lsl#1
2753
mov w7,0x87
2754
extr x9,x23,x23,#32
2755
extr x25,x23,x22,#63
2756
and w8,w7,w9,asr#31
2757
eor x24,x8,x22,lsl#1
2758
mov w7,0x87
2759
extr x9,x25,x25,#32
2760
extr x27,x25,x24,#63
2761
and w8,w7,w9,asr#31
2762
eor x26,x8,x24,lsl#1
2763
.Lxts_8_blocks_process_gb:
2764
cmp x2,#8
2765
mov v16.d[0],x12
2766
mov v16.d[1],x13
2767
#ifdef __AARCH64EB__
2768
rev32 v16.16b,v16.16b
2769
#endif
2770
mov w7,0x87
2771
extr x9,x27,x27,#32
2772
extr x13,x27,x26,#63
2773
and w8,w7,w9,asr#31
2774
eor x12,x8,x26,lsl#1
2775
mov v17.d[0],x14
2776
mov v17.d[1],x15
2777
#ifdef __AARCH64EB__
2778
rev32 v17.16b,v17.16b
2779
#endif
2780
mov w7,0x87
2781
extr x9,x13,x13,#32
2782
extr x15,x13,x12,#63
2783
and w8,w7,w9,asr#31
2784
eor x14,x8,x12,lsl#1
2785
mov v18.d[0],x16
2786
mov v18.d[1],x17
2787
#ifdef __AARCH64EB__
2788
rev32 v18.16b,v18.16b
2789
#endif
2790
mov w7,0x87
2791
extr x9,x15,x15,#32
2792
extr x17,x15,x14,#63
2793
and w8,w7,w9,asr#31
2794
eor x16,x8,x14,lsl#1
2795
mov v19.d[0],x18
2796
mov v19.d[1],x19
2797
#ifdef __AARCH64EB__
2798
rev32 v19.16b,v19.16b
2799
#endif
2800
mov w7,0x87
2801
extr x9,x17,x17,#32
2802
extr x19,x17,x16,#63
2803
and w8,w7,w9,asr#31
2804
eor x18,x8,x16,lsl#1
2805
mov v20.d[0],x20
2806
mov v20.d[1],x21
2807
#ifdef __AARCH64EB__
2808
rev32 v20.16b,v20.16b
2809
#endif
2810
mov w7,0x87
2811
extr x9,x19,x19,#32
2812
extr x21,x19,x18,#63
2813
and w8,w7,w9,asr#31
2814
eor x20,x8,x18,lsl#1
2815
mov v21.d[0],x22
2816
mov v21.d[1],x23
2817
#ifdef __AARCH64EB__
2818
rev32 v21.16b,v21.16b
2819
#endif
2820
mov w7,0x87
2821
extr x9,x21,x21,#32
2822
extr x23,x21,x20,#63
2823
and w8,w7,w9,asr#31
2824
eor x22,x8,x20,lsl#1
2825
mov v22.d[0],x24
2826
mov v22.d[1],x25
2827
#ifdef __AARCH64EB__
2828
rev32 v22.16b,v22.16b
2829
#endif
2830
mov w7,0x87
2831
extr x9,x23,x23,#32
2832
extr x25,x23,x22,#63
2833
and w8,w7,w9,asr#31
2834
eor x24,x8,x22,lsl#1
2835
mov v23.d[0],x26
2836
mov v23.d[1],x27
2837
#ifdef __AARCH64EB__
2838
rev32 v23.16b,v23.16b
2839
#endif
2840
mov w7,0x87
2841
extr x9,x25,x25,#32
2842
extr x27,x25,x24,#63
2843
and w8,w7,w9,asr#31
2844
eor x26,x8,x24,lsl#1
2845
b.lt .Lxts_4_blocks_process_gb
2846
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2847
rbit v16.16b,v16.16b
2848
rbit v17.16b,v17.16b
2849
rbit v18.16b,v18.16b
2850
rbit v19.16b,v19.16b
2851
eor v4.16b, v4.16b, v16.16b
2852
eor v5.16b, v5.16b, v17.16b
2853
eor v6.16b, v6.16b, v18.16b
2854
eor v7.16b, v7.16b, v19.16b
2855
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2856
rbit v20.16b,v20.16b
2857
rbit v21.16b,v21.16b
2858
rbit v22.16b,v22.16b
2859
rbit v23.16b,v23.16b
2860
eor v8.16b, v8.16b, v20.16b
2861
eor v9.16b, v9.16b, v21.16b
2862
eor v10.16b, v10.16b, v22.16b
2863
eor v11.16b, v11.16b, v23.16b
2864
#ifndef __AARCH64EB__
2865
rev32 v4.16b,v4.16b
2866
#endif
2867
#ifndef __AARCH64EB__
2868
rev32 v5.16b,v5.16b
2869
#endif
2870
#ifndef __AARCH64EB__
2871
rev32 v6.16b,v6.16b
2872
#endif
2873
#ifndef __AARCH64EB__
2874
rev32 v7.16b,v7.16b
2875
#endif
2876
#ifndef __AARCH64EB__
2877
rev32 v8.16b,v8.16b
2878
#endif
2879
#ifndef __AARCH64EB__
2880
rev32 v9.16b,v9.16b
2881
#endif
2882
#ifndef __AARCH64EB__
2883
rev32 v10.16b,v10.16b
2884
#endif
2885
#ifndef __AARCH64EB__
2886
rev32 v11.16b,v11.16b
2887
#endif
2888
zip1 v0.4s,v4.4s,v5.4s
2889
zip2 v1.4s,v4.4s,v5.4s
2890
zip1 v2.4s,v6.4s,v7.4s
2891
zip2 v3.4s,v6.4s,v7.4s
2892
zip1 v4.2d,v0.2d,v2.2d
2893
zip2 v5.2d,v0.2d,v2.2d
2894
zip1 v6.2d,v1.2d,v3.2d
2895
zip2 v7.2d,v1.2d,v3.2d
2896
zip1 v0.4s,v8.4s,v9.4s
2897
zip2 v1.4s,v8.4s,v9.4s
2898
zip1 v2.4s,v10.4s,v11.4s
2899
zip2 v3.4s,v10.4s,v11.4s
2900
zip1 v8.2d,v0.2d,v2.2d
2901
zip2 v9.2d,v0.2d,v2.2d
2902
zip1 v10.2d,v1.2d,v3.2d
2903
zip2 v11.2d,v1.2d,v3.2d
2904
bl _vpsm4_ex_enc_8blks
2905
zip1 v8.4s,v0.4s,v1.4s
2906
zip2 v9.4s,v0.4s,v1.4s
2907
zip1 v10.4s,v2.4s,v3.4s
2908
zip2 v11.4s,v2.4s,v3.4s
2909
zip1 v0.2d,v8.2d,v10.2d
2910
zip2 v1.2d,v8.2d,v10.2d
2911
zip1 v2.2d,v9.2d,v11.2d
2912
zip2 v3.2d,v9.2d,v11.2d
2913
zip1 v8.4s,v4.4s,v5.4s
2914
zip2 v9.4s,v4.4s,v5.4s
2915
zip1 v10.4s,v6.4s,v7.4s
2916
zip2 v11.4s,v6.4s,v7.4s
2917
zip1 v4.2d,v8.2d,v10.2d
2918
zip2 v5.2d,v8.2d,v10.2d
2919
zip1 v6.2d,v9.2d,v11.2d
2920
zip2 v7.2d,v9.2d,v11.2d
2921
eor v0.16b, v0.16b, v16.16b
2922
eor v1.16b, v1.16b, v17.16b
2923
eor v2.16b, v2.16b, v18.16b
2924
eor v3.16b, v3.16b, v19.16b
2925
eor v4.16b, v4.16b, v20.16b
2926
eor v5.16b, v5.16b, v21.16b
2927
eor v6.16b, v6.16b, v22.16b
2928
eor v7.16b, v7.16b, v23.16b
2929
2930
// save the last tweak
2931
mov v25.16b,v23.16b
2932
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2933
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2934
subs x2,x2,#8
2935
b.gt .Lxts_8_blocks_process_gb
2936
b 100f
2937
.Lxts_4_blocks_process_gb:
2938
cmp x2,#4
2939
b.lt 1f
2940
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2941
rbit v16.16b,v16.16b
2942
rbit v17.16b,v17.16b
2943
rbit v18.16b,v18.16b
2944
rbit v19.16b,v19.16b
2945
eor v4.16b, v4.16b, v16.16b
2946
eor v5.16b, v5.16b, v17.16b
2947
eor v6.16b, v6.16b, v18.16b
2948
eor v7.16b, v7.16b, v19.16b
2949
#ifndef __AARCH64EB__
2950
rev32 v4.16b,v4.16b
2951
#endif
2952
#ifndef __AARCH64EB__
2953
rev32 v5.16b,v5.16b
2954
#endif
2955
#ifndef __AARCH64EB__
2956
rev32 v6.16b,v6.16b
2957
#endif
2958
#ifndef __AARCH64EB__
2959
rev32 v7.16b,v7.16b
2960
#endif
2961
zip1 v0.4s,v4.4s,v5.4s
2962
zip2 v1.4s,v4.4s,v5.4s
2963
zip1 v2.4s,v6.4s,v7.4s
2964
zip2 v3.4s,v6.4s,v7.4s
2965
zip1 v4.2d,v0.2d,v2.2d
2966
zip2 v5.2d,v0.2d,v2.2d
2967
zip1 v6.2d,v1.2d,v3.2d
2968
zip2 v7.2d,v1.2d,v3.2d
2969
bl _vpsm4_ex_enc_4blks
2970
zip1 v4.4s,v0.4s,v1.4s
2971
zip2 v5.4s,v0.4s,v1.4s
2972
zip1 v6.4s,v2.4s,v3.4s
2973
zip2 v7.4s,v2.4s,v3.4s
2974
zip1 v0.2d,v4.2d,v6.2d
2975
zip2 v1.2d,v4.2d,v6.2d
2976
zip1 v2.2d,v5.2d,v7.2d
2977
zip2 v3.2d,v5.2d,v7.2d
2978
eor v0.16b, v0.16b, v16.16b
2979
eor v1.16b, v1.16b, v17.16b
2980
eor v2.16b, v2.16b, v18.16b
2981
eor v3.16b, v3.16b, v19.16b
2982
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2983
sub x2,x2,#4
2984
mov v16.16b,v20.16b
2985
mov v17.16b,v21.16b
2986
mov v18.16b,v22.16b
2987
// save the last tweak
2988
mov v25.16b,v19.16b
2989
1:
2990
// process last block
2991
cmp x2,#1
2992
b.lt 100f
2993
b.gt 1f
2994
ld1 {v4.4s},[x0],#16
2995
rbit v16.16b,v16.16b
2996
eor v4.16b, v4.16b, v16.16b
2997
#ifndef __AARCH64EB__
2998
rev32 v4.16b,v4.16b
2999
#endif
3000
mov x10,x3
3001
mov w11,#8
3002
mov w12,v4.s[0]
3003
mov w13,v4.s[1]
3004
mov w14,v4.s[2]
3005
mov w15,v4.s[3]
3006
10:
3007
ldp w7,w8,[x10],8
3008
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3009
eor w6,w14,w15
3010
eor w9,w7,w13
3011
eor w6,w6,w9
3012
mov v3.s[0],w6
3013
// optimize sbox using AESE instruction
3014
tbl v0.16b, {v3.16b}, v26.16b
3015
ushr v2.16b, v0.16b, 4
3016
and v0.16b, v0.16b, v31.16b
3017
tbl v0.16b, {v28.16b}, v0.16b
3018
tbl v2.16b, {v27.16b}, v2.16b
3019
eor v0.16b, v0.16b, v2.16b
3020
eor v1.16b, v1.16b, v1.16b
3021
aese v0.16b,v1.16b
3022
ushr v2.16b, v0.16b, 4
3023
and v0.16b, v0.16b, v31.16b
3024
tbl v0.16b, {v30.16b}, v0.16b
3025
tbl v2.16b, {v29.16b}, v2.16b
3026
eor v0.16b, v0.16b, v2.16b
3027
3028
mov w7,v0.s[0]
3029
eor w6,w7,w7,ror #32-2
3030
eor w6,w6,w7,ror #32-10
3031
eor w6,w6,w7,ror #32-18
3032
eor w6,w6,w7,ror #32-24
3033
eor w12,w12,w6
3034
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3035
eor w6,w14,w15
3036
eor w9,w12,w8
3037
eor w6,w6,w9
3038
mov v3.s[0],w6
3039
// optimize sbox using AESE instruction
3040
tbl v0.16b, {v3.16b}, v26.16b
3041
ushr v2.16b, v0.16b, 4
3042
and v0.16b, v0.16b, v31.16b
3043
tbl v0.16b, {v28.16b}, v0.16b
3044
tbl v2.16b, {v27.16b}, v2.16b
3045
eor v0.16b, v0.16b, v2.16b
3046
eor v1.16b, v1.16b, v1.16b
3047
aese v0.16b,v1.16b
3048
ushr v2.16b, v0.16b, 4
3049
and v0.16b, v0.16b, v31.16b
3050
tbl v0.16b, {v30.16b}, v0.16b
3051
tbl v2.16b, {v29.16b}, v2.16b
3052
eor v0.16b, v0.16b, v2.16b
3053
3054
mov w7,v0.s[0]
3055
eor w6,w7,w7,ror #32-2
3056
eor w6,w6,w7,ror #32-10
3057
eor w6,w6,w7,ror #32-18
3058
eor w6,w6,w7,ror #32-24
3059
ldp w7,w8,[x10],8
3060
eor w13,w13,w6
3061
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3062
eor w6,w12,w13
3063
eor w9,w7,w15
3064
eor w6,w6,w9
3065
mov v3.s[0],w6
3066
// optimize sbox using AESE instruction
3067
tbl v0.16b, {v3.16b}, v26.16b
3068
ushr v2.16b, v0.16b, 4
3069
and v0.16b, v0.16b, v31.16b
3070
tbl v0.16b, {v28.16b}, v0.16b
3071
tbl v2.16b, {v27.16b}, v2.16b
3072
eor v0.16b, v0.16b, v2.16b
3073
eor v1.16b, v1.16b, v1.16b
3074
aese v0.16b,v1.16b
3075
ushr v2.16b, v0.16b, 4
3076
and v0.16b, v0.16b, v31.16b
3077
tbl v0.16b, {v30.16b}, v0.16b
3078
tbl v2.16b, {v29.16b}, v2.16b
3079
eor v0.16b, v0.16b, v2.16b
3080
3081
mov w7,v0.s[0]
3082
eor w6,w7,w7,ror #32-2
3083
eor w6,w6,w7,ror #32-10
3084
eor w6,w6,w7,ror #32-18
3085
eor w6,w6,w7,ror #32-24
3086
eor w14,w14,w6
3087
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3088
eor w6,w12,w13
3089
eor w9,w14,w8
3090
eor w6,w6,w9
3091
mov v3.s[0],w6
3092
// optimize sbox using AESE instruction
3093
tbl v0.16b, {v3.16b}, v26.16b
3094
ushr v2.16b, v0.16b, 4
3095
and v0.16b, v0.16b, v31.16b
3096
tbl v0.16b, {v28.16b}, v0.16b
3097
tbl v2.16b, {v27.16b}, v2.16b
3098
eor v0.16b, v0.16b, v2.16b
3099
eor v1.16b, v1.16b, v1.16b
3100
aese v0.16b,v1.16b
3101
ushr v2.16b, v0.16b, 4
3102
and v0.16b, v0.16b, v31.16b
3103
tbl v0.16b, {v30.16b}, v0.16b
3104
tbl v2.16b, {v29.16b}, v2.16b
3105
eor v0.16b, v0.16b, v2.16b
3106
3107
mov w7,v0.s[0]
3108
eor w6,w7,w7,ror #32-2
3109
eor w6,w6,w7,ror #32-10
3110
eor w6,w6,w7,ror #32-18
3111
eor w6,w6,w7,ror #32-24
3112
eor w15,w15,w6
3113
subs w11,w11,#1
3114
b.ne 10b
3115
mov v4.s[0],w15
3116
mov v4.s[1],w14
3117
mov v4.s[2],w13
3118
mov v4.s[3],w12
3119
#ifndef __AARCH64EB__
3120
rev32 v4.16b,v4.16b
3121
#endif
3122
eor v4.16b, v4.16b, v16.16b
3123
st1 {v4.4s},[x1],#16
3124
// save the last tweak
3125
mov v25.16b,v16.16b
3126
b 100f
3127
1: // process last 2 blocks
3128
cmp x2,#2
3129
b.gt 1f
3130
ld1 {v4.4s,v5.4s},[x0],#32
3131
rbit v16.16b,v16.16b
3132
rbit v17.16b,v17.16b
3133
eor v4.16b, v4.16b, v16.16b
3134
eor v5.16b, v5.16b, v17.16b
3135
#ifndef __AARCH64EB__
3136
rev32 v4.16b,v4.16b
3137
#endif
3138
#ifndef __AARCH64EB__
3139
rev32 v5.16b,v5.16b
3140
#endif
3141
zip1 v0.4s,v4.4s,v5.4s
3142
zip2 v1.4s,v4.4s,v5.4s
3143
zip1 v2.4s,v6.4s,v7.4s
3144
zip2 v3.4s,v6.4s,v7.4s
3145
zip1 v4.2d,v0.2d,v2.2d
3146
zip2 v5.2d,v0.2d,v2.2d
3147
zip1 v6.2d,v1.2d,v3.2d
3148
zip2 v7.2d,v1.2d,v3.2d
3149
bl _vpsm4_ex_enc_4blks
3150
zip1 v4.4s,v0.4s,v1.4s
3151
zip2 v5.4s,v0.4s,v1.4s
3152
zip1 v6.4s,v2.4s,v3.4s
3153
zip2 v7.4s,v2.4s,v3.4s
3154
zip1 v0.2d,v4.2d,v6.2d
3155
zip2 v1.2d,v4.2d,v6.2d
3156
zip1 v2.2d,v5.2d,v7.2d
3157
zip2 v3.2d,v5.2d,v7.2d
3158
eor v0.16b, v0.16b, v16.16b
3159
eor v1.16b, v1.16b, v17.16b
3160
st1 {v0.4s,v1.4s},[x1],#32
3161
// save the last tweak
3162
mov v25.16b,v17.16b
3163
b 100f
3164
1: // process last 3 blocks
3165
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
3166
rbit v16.16b,v16.16b
3167
rbit v17.16b,v17.16b
3168
rbit v18.16b,v18.16b
3169
eor v4.16b, v4.16b, v16.16b
3170
eor v5.16b, v5.16b, v17.16b
3171
eor v6.16b, v6.16b, v18.16b
3172
#ifndef __AARCH64EB__
3173
rev32 v4.16b,v4.16b
3174
#endif
3175
#ifndef __AARCH64EB__
3176
rev32 v5.16b,v5.16b
3177
#endif
3178
#ifndef __AARCH64EB__
3179
rev32 v6.16b,v6.16b
3180
#endif
3181
zip1 v0.4s,v4.4s,v5.4s
3182
zip2 v1.4s,v4.4s,v5.4s
3183
zip1 v2.4s,v6.4s,v7.4s
3184
zip2 v3.4s,v6.4s,v7.4s
3185
zip1 v4.2d,v0.2d,v2.2d
3186
zip2 v5.2d,v0.2d,v2.2d
3187
zip1 v6.2d,v1.2d,v3.2d
3188
zip2 v7.2d,v1.2d,v3.2d
3189
bl _vpsm4_ex_enc_4blks
3190
zip1 v4.4s,v0.4s,v1.4s
3191
zip2 v5.4s,v0.4s,v1.4s
3192
zip1 v6.4s,v2.4s,v3.4s
3193
zip2 v7.4s,v2.4s,v3.4s
3194
zip1 v0.2d,v4.2d,v6.2d
3195
zip2 v1.2d,v4.2d,v6.2d
3196
zip1 v2.2d,v5.2d,v7.2d
3197
zip2 v3.2d,v5.2d,v7.2d
3198
eor v0.16b, v0.16b, v16.16b
3199
eor v1.16b, v1.16b, v17.16b
3200
eor v2.16b, v2.16b, v18.16b
3201
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
3202
// save the last tweak
3203
mov v25.16b,v18.16b
3204
100:
3205
cmp x29,0
3206
b.eq .return_gb
3207
3208
// This branch calculates the last two tweaks,
3209
// while the encryption/decryption length is larger than 32
3210
.last_2blks_tweak_gb:
3211
#ifdef __AARCH64EB__
3212
rev32 v25.16b,v25.16b
3213
#endif
3214
rbit v2.16b,v25.16b
3215
adrp x9, .Lxts_magic
3216
ldr q0, [x9, #:lo12:.Lxts_magic]
3217
shl v17.16b, v2.16b, #1
3218
ext v1.16b, v2.16b, v2.16b,#15
3219
ushr v1.16b, v1.16b, #7
3220
mul v1.16b, v1.16b, v0.16b
3221
eor v17.16b, v17.16b, v1.16b
3222
rbit v17.16b,v17.16b
3223
rbit v2.16b,v17.16b
3224
adrp x9, .Lxts_magic
3225
ldr q0, [x9, #:lo12:.Lxts_magic]
3226
shl v18.16b, v2.16b, #1
3227
ext v1.16b, v2.16b, v2.16b,#15
3228
ushr v1.16b, v1.16b, #7
3229
mul v1.16b, v1.16b, v0.16b
3230
eor v18.16b, v18.16b, v1.16b
3231
rbit v18.16b,v18.16b
3232
b .check_dec_gb
3233
3234
3235
// This branch calculates the last two tweaks,
3236
// while the encryption/decryption length is equal to 32, who only need two tweaks
3237
.only_2blks_tweak_gb:
3238
mov v17.16b,v16.16b
3239
#ifdef __AARCH64EB__
3240
rev32 v17.16b,v17.16b
3241
#endif
3242
rbit v2.16b,v17.16b
3243
adrp x9, .Lxts_magic
3244
ldr q0, [x9, #:lo12:.Lxts_magic]
3245
shl v18.16b, v2.16b, #1
3246
ext v1.16b, v2.16b, v2.16b,#15
3247
ushr v1.16b, v1.16b, #7
3248
mul v1.16b, v1.16b, v0.16b
3249
eor v18.16b, v18.16b, v1.16b
3250
rbit v18.16b,v18.16b
3251
b .check_dec_gb
3252
3253
3254
// Determine whether encryption or decryption is required.
3255
// The last two tweaks need to be swapped for decryption.
3256
.check_dec_gb:
3257
// encryption:1 decryption:0
3258
cmp w28,1
3259
b.eq .process_last_2blks_gb
3260
mov v0.16B,v17.16b
3261
mov v17.16B,v18.16b
3262
mov v18.16B,v0.16b
3263
3264
.process_last_2blks_gb:
3265
#ifdef __AARCH64EB__
3266
rev32 v17.16b,v17.16b
3267
#endif
3268
#ifdef __AARCH64EB__
3269
rev32 v18.16b,v18.16b
3270
#endif
3271
ld1 {v4.4s},[x0],#16
3272
eor v4.16b, v4.16b, v17.16b
3273
#ifndef __AARCH64EB__
3274
rev32 v4.16b,v4.16b
3275
#endif
3276
mov x10,x3
3277
mov w11,#8
3278
mov w12,v4.s[0]
3279
mov w13,v4.s[1]
3280
mov w14,v4.s[2]
3281
mov w15,v4.s[3]
3282
10:
3283
ldp w7,w8,[x10],8
3284
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3285
eor w6,w14,w15
3286
eor w9,w7,w13
3287
eor w6,w6,w9
3288
mov v3.s[0],w6
3289
// optimize sbox using AESE instruction
3290
tbl v0.16b, {v3.16b}, v26.16b
3291
ushr v2.16b, v0.16b, 4
3292
and v0.16b, v0.16b, v31.16b
3293
tbl v0.16b, {v28.16b}, v0.16b
3294
tbl v2.16b, {v27.16b}, v2.16b
3295
eor v0.16b, v0.16b, v2.16b
3296
eor v1.16b, v1.16b, v1.16b
3297
aese v0.16b,v1.16b
3298
ushr v2.16b, v0.16b, 4
3299
and v0.16b, v0.16b, v31.16b
3300
tbl v0.16b, {v30.16b}, v0.16b
3301
tbl v2.16b, {v29.16b}, v2.16b
3302
eor v0.16b, v0.16b, v2.16b
3303
3304
mov w7,v0.s[0]
3305
eor w6,w7,w7,ror #32-2
3306
eor w6,w6,w7,ror #32-10
3307
eor w6,w6,w7,ror #32-18
3308
eor w6,w6,w7,ror #32-24
3309
eor w12,w12,w6
3310
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3311
eor w6,w14,w15
3312
eor w9,w12,w8
3313
eor w6,w6,w9
3314
mov v3.s[0],w6
3315
// optimize sbox using AESE instruction
3316
tbl v0.16b, {v3.16b}, v26.16b
3317
ushr v2.16b, v0.16b, 4
3318
and v0.16b, v0.16b, v31.16b
3319
tbl v0.16b, {v28.16b}, v0.16b
3320
tbl v2.16b, {v27.16b}, v2.16b
3321
eor v0.16b, v0.16b, v2.16b
3322
eor v1.16b, v1.16b, v1.16b
3323
aese v0.16b,v1.16b
3324
ushr v2.16b, v0.16b, 4
3325
and v0.16b, v0.16b, v31.16b
3326
tbl v0.16b, {v30.16b}, v0.16b
3327
tbl v2.16b, {v29.16b}, v2.16b
3328
eor v0.16b, v0.16b, v2.16b
3329
3330
mov w7,v0.s[0]
3331
eor w6,w7,w7,ror #32-2
3332
eor w6,w6,w7,ror #32-10
3333
eor w6,w6,w7,ror #32-18
3334
eor w6,w6,w7,ror #32-24
3335
ldp w7,w8,[x10],8
3336
eor w13,w13,w6
3337
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3338
eor w6,w12,w13
3339
eor w9,w7,w15
3340
eor w6,w6,w9
3341
mov v3.s[0],w6
3342
// optimize sbox using AESE instruction
3343
tbl v0.16b, {v3.16b}, v26.16b
3344
ushr v2.16b, v0.16b, 4
3345
and v0.16b, v0.16b, v31.16b
3346
tbl v0.16b, {v28.16b}, v0.16b
3347
tbl v2.16b, {v27.16b}, v2.16b
3348
eor v0.16b, v0.16b, v2.16b
3349
eor v1.16b, v1.16b, v1.16b
3350
aese v0.16b,v1.16b
3351
ushr v2.16b, v0.16b, 4
3352
and v0.16b, v0.16b, v31.16b
3353
tbl v0.16b, {v30.16b}, v0.16b
3354
tbl v2.16b, {v29.16b}, v2.16b
3355
eor v0.16b, v0.16b, v2.16b
3356
3357
mov w7,v0.s[0]
3358
eor w6,w7,w7,ror #32-2
3359
eor w6,w6,w7,ror #32-10
3360
eor w6,w6,w7,ror #32-18
3361
eor w6,w6,w7,ror #32-24
3362
eor w14,w14,w6
3363
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3364
eor w6,w12,w13
3365
eor w9,w14,w8
3366
eor w6,w6,w9
3367
mov v3.s[0],w6
3368
// optimize sbox using AESE instruction
3369
tbl v0.16b, {v3.16b}, v26.16b
3370
ushr v2.16b, v0.16b, 4
3371
and v0.16b, v0.16b, v31.16b
3372
tbl v0.16b, {v28.16b}, v0.16b
3373
tbl v2.16b, {v27.16b}, v2.16b
3374
eor v0.16b, v0.16b, v2.16b
3375
eor v1.16b, v1.16b, v1.16b
3376
aese v0.16b,v1.16b
3377
ushr v2.16b, v0.16b, 4
3378
and v0.16b, v0.16b, v31.16b
3379
tbl v0.16b, {v30.16b}, v0.16b
3380
tbl v2.16b, {v29.16b}, v2.16b
3381
eor v0.16b, v0.16b, v2.16b
3382
3383
mov w7,v0.s[0]
3384
eor w6,w7,w7,ror #32-2
3385
eor w6,w6,w7,ror #32-10
3386
eor w6,w6,w7,ror #32-18
3387
eor w6,w6,w7,ror #32-24
3388
eor w15,w15,w6
3389
subs w11,w11,#1
3390
b.ne 10b
3391
mov v4.s[0],w15
3392
mov v4.s[1],w14
3393
mov v4.s[2],w13
3394
mov v4.s[3],w12
3395
#ifndef __AARCH64EB__
3396
rev32 v4.16b,v4.16b
3397
#endif
3398
eor v4.16b, v4.16b, v17.16b
3399
st1 {v4.4s},[x1],#16
3400
3401
sub x26,x1,16
3402
.loop_gb:
3403
subs x29,x29,1
3404
ldrb w7,[x26,x29]
3405
ldrb w8,[x0,x29]
3406
strb w8,[x26,x29]
3407
strb w7,[x1,x29]
3408
b.gt .loop_gb
3409
ld1 {v4.4s}, [x26]
3410
eor v4.16b, v4.16b, v18.16b
3411
#ifndef __AARCH64EB__
3412
rev32 v4.16b,v4.16b
3413
#endif
3414
mov x10,x3
3415
mov w11,#8
3416
mov w12,v4.s[0]
3417
mov w13,v4.s[1]
3418
mov w14,v4.s[2]
3419
mov w15,v4.s[3]
3420
10:
3421
ldp w7,w8,[x10],8
3422
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3423
eor w6,w14,w15
3424
eor w9,w7,w13
3425
eor w6,w6,w9
3426
mov v3.s[0],w6
3427
// optimize sbox using AESE instruction
3428
tbl v0.16b, {v3.16b}, v26.16b
3429
ushr v2.16b, v0.16b, 4
3430
and v0.16b, v0.16b, v31.16b
3431
tbl v0.16b, {v28.16b}, v0.16b
3432
tbl v2.16b, {v27.16b}, v2.16b
3433
eor v0.16b, v0.16b, v2.16b
3434
eor v1.16b, v1.16b, v1.16b
3435
aese v0.16b,v1.16b
3436
ushr v2.16b, v0.16b, 4
3437
and v0.16b, v0.16b, v31.16b
3438
tbl v0.16b, {v30.16b}, v0.16b
3439
tbl v2.16b, {v29.16b}, v2.16b
3440
eor v0.16b, v0.16b, v2.16b
3441
3442
mov w7,v0.s[0]
3443
eor w6,w7,w7,ror #32-2
3444
eor w6,w6,w7,ror #32-10
3445
eor w6,w6,w7,ror #32-18
3446
eor w6,w6,w7,ror #32-24
3447
eor w12,w12,w6
3448
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3449
eor w6,w14,w15
3450
eor w9,w12,w8
3451
eor w6,w6,w9
3452
mov v3.s[0],w6
3453
// optimize sbox using AESE instruction
3454
tbl v0.16b, {v3.16b}, v26.16b
3455
ushr v2.16b, v0.16b, 4
3456
and v0.16b, v0.16b, v31.16b
3457
tbl v0.16b, {v28.16b}, v0.16b
3458
tbl v2.16b, {v27.16b}, v2.16b
3459
eor v0.16b, v0.16b, v2.16b
3460
eor v1.16b, v1.16b, v1.16b
3461
aese v0.16b,v1.16b
3462
ushr v2.16b, v0.16b, 4
3463
and v0.16b, v0.16b, v31.16b
3464
tbl v0.16b, {v30.16b}, v0.16b
3465
tbl v2.16b, {v29.16b}, v2.16b
3466
eor v0.16b, v0.16b, v2.16b
3467
3468
mov w7,v0.s[0]
3469
eor w6,w7,w7,ror #32-2
3470
eor w6,w6,w7,ror #32-10
3471
eor w6,w6,w7,ror #32-18
3472
eor w6,w6,w7,ror #32-24
3473
ldp w7,w8,[x10],8
3474
eor w13,w13,w6
3475
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3476
eor w6,w12,w13
3477
eor w9,w7,w15
3478
eor w6,w6,w9
3479
mov v3.s[0],w6
3480
// optimize sbox using AESE instruction
3481
tbl v0.16b, {v3.16b}, v26.16b
3482
ushr v2.16b, v0.16b, 4
3483
and v0.16b, v0.16b, v31.16b
3484
tbl v0.16b, {v28.16b}, v0.16b
3485
tbl v2.16b, {v27.16b}, v2.16b
3486
eor v0.16b, v0.16b, v2.16b
3487
eor v1.16b, v1.16b, v1.16b
3488
aese v0.16b,v1.16b
3489
ushr v2.16b, v0.16b, 4
3490
and v0.16b, v0.16b, v31.16b
3491
tbl v0.16b, {v30.16b}, v0.16b
3492
tbl v2.16b, {v29.16b}, v2.16b
3493
eor v0.16b, v0.16b, v2.16b
3494
3495
mov w7,v0.s[0]
3496
eor w6,w7,w7,ror #32-2
3497
eor w6,w6,w7,ror #32-10
3498
eor w6,w6,w7,ror #32-18
3499
eor w6,w6,w7,ror #32-24
3500
eor w14,w14,w6
3501
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3502
eor w6,w12,w13
3503
eor w9,w14,w8
3504
eor w6,w6,w9
3505
mov v3.s[0],w6
3506
// optimize sbox using AESE instruction
3507
tbl v0.16b, {v3.16b}, v26.16b
3508
ushr v2.16b, v0.16b, 4
3509
and v0.16b, v0.16b, v31.16b
3510
tbl v0.16b, {v28.16b}, v0.16b
3511
tbl v2.16b, {v27.16b}, v2.16b
3512
eor v0.16b, v0.16b, v2.16b
3513
eor v1.16b, v1.16b, v1.16b
3514
aese v0.16b,v1.16b
3515
ushr v2.16b, v0.16b, 4
3516
and v0.16b, v0.16b, v31.16b
3517
tbl v0.16b, {v30.16b}, v0.16b
3518
tbl v2.16b, {v29.16b}, v2.16b
3519
eor v0.16b, v0.16b, v2.16b
3520
3521
mov w7,v0.s[0]
3522
eor w6,w7,w7,ror #32-2
3523
eor w6,w6,w7,ror #32-10
3524
eor w6,w6,w7,ror #32-18
3525
eor w6,w6,w7,ror #32-24
3526
eor w15,w15,w6
3527
subs w11,w11,#1
3528
b.ne 10b
3529
mov v4.s[0],w15
3530
mov v4.s[1],w14
3531
mov v4.s[2],w13
3532
mov v4.s[3],w12
3533
#ifndef __AARCH64EB__
3534
rev32 v4.16b,v4.16b
3535
#endif
3536
eor v4.16b, v4.16b, v18.16b
3537
st1 {v4.4s}, [x26]
3538
.return_gb:
3539
ldp d14, d15, [sp], #0x10
3540
ldp d12, d13, [sp], #0x10
3541
ldp d10, d11, [sp], #0x10
3542
ldp d8, d9, [sp], #0x10
3543
ldp x29, x30, [sp], #0x10
3544
ldp x27, x28, [sp], #0x10
3545
ldp x25, x26, [sp], #0x10
3546
ldp x23, x24, [sp], #0x10
3547
ldp x21, x22, [sp], #0x10
3548
ldp x19, x20, [sp], #0x10
3549
ldp x17, x18, [sp], #0x10
3550
ldp x15, x16, [sp], #0x10
3551
AARCH64_VALIDATE_LINK_REGISTER
3552
ret
3553
.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
3554
.globl vpsm4_ex_xts_encrypt
3555
.type vpsm4_ex_xts_encrypt,%function
3556
.align 5
3557
vpsm4_ex_xts_encrypt:
3558
AARCH64_SIGN_LINK_REGISTER
3559
stp x15, x16, [sp, #-0x10]!
3560
stp x17, x18, [sp, #-0x10]!
3561
stp x19, x20, [sp, #-0x10]!
3562
stp x21, x22, [sp, #-0x10]!
3563
stp x23, x24, [sp, #-0x10]!
3564
stp x25, x26, [sp, #-0x10]!
3565
stp x27, x28, [sp, #-0x10]!
3566
stp x29, x30, [sp, #-0x10]!
3567
stp d8, d9, [sp, #-0x10]!
3568
stp d10, d11, [sp, #-0x10]!
3569
stp d12, d13, [sp, #-0x10]!
3570
stp d14, d15, [sp, #-0x10]!
3571
mov x26,x3
3572
mov x27,x4
3573
mov w28,w6
3574
ld1 {v16.4s}, [x5]
3575
mov x3,x27
3576
adrp x9, .Lsbox_magic
3577
ldr q26, [x9, #:lo12:.Lsbox_magic]
3578
ldr q27, [x9, #:lo12:.Lsbox_magic+16]
3579
ldr q28, [x9, #:lo12:.Lsbox_magic+32]
3580
ldr q29, [x9, #:lo12:.Lsbox_magic+48]
3581
ldr q30, [x9, #:lo12:.Lsbox_magic+64]
3582
ldr q31, [x9, #:lo12:.Lsbox_magic+80]
3583
#ifndef __AARCH64EB__
3584
rev32 v16.16b,v16.16b
3585
#endif
3586
mov x10,x3
3587
mov w11,#8
3588
mov w12,v16.s[0]
3589
mov w13,v16.s[1]
3590
mov w14,v16.s[2]
3591
mov w15,v16.s[3]
3592
10:
3593
ldp w7,w8,[x10],8
3594
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3595
eor w6,w14,w15
3596
eor w9,w7,w13
3597
eor w6,w6,w9
3598
mov v3.s[0],w6
3599
// optimize sbox using AESE instruction
3600
tbl v0.16b, {v3.16b}, v26.16b
3601
ushr v2.16b, v0.16b, 4
3602
and v0.16b, v0.16b, v31.16b
3603
tbl v0.16b, {v28.16b}, v0.16b
3604
tbl v2.16b, {v27.16b}, v2.16b
3605
eor v0.16b, v0.16b, v2.16b
3606
eor v1.16b, v1.16b, v1.16b
3607
aese v0.16b,v1.16b
3608
ushr v2.16b, v0.16b, 4
3609
and v0.16b, v0.16b, v31.16b
3610
tbl v0.16b, {v30.16b}, v0.16b
3611
tbl v2.16b, {v29.16b}, v2.16b
3612
eor v0.16b, v0.16b, v2.16b
3613
3614
mov w7,v0.s[0]
3615
eor w6,w7,w7,ror #32-2
3616
eor w6,w6,w7,ror #32-10
3617
eor w6,w6,w7,ror #32-18
3618
eor w6,w6,w7,ror #32-24
3619
eor w12,w12,w6
3620
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3621
eor w6,w14,w15
3622
eor w9,w12,w8
3623
eor w6,w6,w9
3624
mov v3.s[0],w6
3625
// optimize sbox using AESE instruction
3626
tbl v0.16b, {v3.16b}, v26.16b
3627
ushr v2.16b, v0.16b, 4
3628
and v0.16b, v0.16b, v31.16b
3629
tbl v0.16b, {v28.16b}, v0.16b
3630
tbl v2.16b, {v27.16b}, v2.16b
3631
eor v0.16b, v0.16b, v2.16b
3632
eor v1.16b, v1.16b, v1.16b
3633
aese v0.16b,v1.16b
3634
ushr v2.16b, v0.16b, 4
3635
and v0.16b, v0.16b, v31.16b
3636
tbl v0.16b, {v30.16b}, v0.16b
3637
tbl v2.16b, {v29.16b}, v2.16b
3638
eor v0.16b, v0.16b, v2.16b
3639
3640
mov w7,v0.s[0]
3641
eor w6,w7,w7,ror #32-2
3642
eor w6,w6,w7,ror #32-10
3643
eor w6,w6,w7,ror #32-18
3644
eor w6,w6,w7,ror #32-24
3645
ldp w7,w8,[x10],8
3646
eor w13,w13,w6
3647
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3648
eor w6,w12,w13
3649
eor w9,w7,w15
3650
eor w6,w6,w9
3651
mov v3.s[0],w6
3652
// optimize sbox using AESE instruction
3653
tbl v0.16b, {v3.16b}, v26.16b
3654
ushr v2.16b, v0.16b, 4
3655
and v0.16b, v0.16b, v31.16b
3656
tbl v0.16b, {v28.16b}, v0.16b
3657
tbl v2.16b, {v27.16b}, v2.16b
3658
eor v0.16b, v0.16b, v2.16b
3659
eor v1.16b, v1.16b, v1.16b
3660
aese v0.16b,v1.16b
3661
ushr v2.16b, v0.16b, 4
3662
and v0.16b, v0.16b, v31.16b
3663
tbl v0.16b, {v30.16b}, v0.16b
3664
tbl v2.16b, {v29.16b}, v2.16b
3665
eor v0.16b, v0.16b, v2.16b
3666
3667
mov w7,v0.s[0]
3668
eor w6,w7,w7,ror #32-2
3669
eor w6,w6,w7,ror #32-10
3670
eor w6,w6,w7,ror #32-18
3671
eor w6,w6,w7,ror #32-24
3672
eor w14,w14,w6
3673
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3674
eor w6,w12,w13
3675
eor w9,w14,w8
3676
eor w6,w6,w9
3677
mov v3.s[0],w6
3678
// optimize sbox using AESE instruction
3679
tbl v0.16b, {v3.16b}, v26.16b
3680
ushr v2.16b, v0.16b, 4
3681
and v0.16b, v0.16b, v31.16b
3682
tbl v0.16b, {v28.16b}, v0.16b
3683
tbl v2.16b, {v27.16b}, v2.16b
3684
eor v0.16b, v0.16b, v2.16b
3685
eor v1.16b, v1.16b, v1.16b
3686
aese v0.16b,v1.16b
3687
ushr v2.16b, v0.16b, 4
3688
and v0.16b, v0.16b, v31.16b
3689
tbl v0.16b, {v30.16b}, v0.16b
3690
tbl v2.16b, {v29.16b}, v2.16b
3691
eor v0.16b, v0.16b, v2.16b
3692
3693
mov w7,v0.s[0]
3694
eor w6,w7,w7,ror #32-2
3695
eor w6,w6,w7,ror #32-10
3696
eor w6,w6,w7,ror #32-18
3697
eor w6,w6,w7,ror #32-24
3698
eor w15,w15,w6
3699
subs w11,w11,#1
3700
b.ne 10b
3701
mov v16.s[0],w15
3702
mov v16.s[1],w14
3703
mov v16.s[2],w13
3704
mov v16.s[3],w12
3705
#ifndef __AARCH64EB__
3706
rev32 v16.16b,v16.16b
3707
#endif
3708
mov x3,x26
3709
and x29,x2,#0x0F
3710
// convert length into blocks
3711
lsr x2,x2,4
3712
cmp x2,#1
3713
b.lt .return
3714
3715
cmp x29,0
3716
// If the encryption/decryption Length is N times of 16,
3717
// the all blocks are encrypted/decrypted in .xts_encrypt_blocks
3718
b.eq .xts_encrypt_blocks
3719
3720
// If the encryption/decryption length is not N times of 16,
3721
// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak
3722
// the other blocks are encrypted/decrypted in .xts_encrypt_blocks
3723
subs x2,x2,#1
3724
b.eq .only_2blks_tweak
3725
.xts_encrypt_blocks:
3726
#ifdef __AARCH64EB__
3727
rev32 v16.16b,v16.16b
3728
#endif
3729
mov x12,v16.d[0]
3730
mov x13,v16.d[1]
3731
mov w7,0x87
3732
extr x9,x13,x13,#32
3733
extr x15,x13,x12,#63
3734
and w8,w7,w9,asr#31
3735
eor x14,x8,x12,lsl#1
3736
mov w7,0x87
3737
extr x9,x15,x15,#32
3738
extr x17,x15,x14,#63
3739
and w8,w7,w9,asr#31
3740
eor x16,x8,x14,lsl#1
3741
mov w7,0x87
3742
extr x9,x17,x17,#32
3743
extr x19,x17,x16,#63
3744
and w8,w7,w9,asr#31
3745
eor x18,x8,x16,lsl#1
3746
mov w7,0x87
3747
extr x9,x19,x19,#32
3748
extr x21,x19,x18,#63
3749
and w8,w7,w9,asr#31
3750
eor x20,x8,x18,lsl#1
3751
mov w7,0x87
3752
extr x9,x21,x21,#32
3753
extr x23,x21,x20,#63
3754
and w8,w7,w9,asr#31
3755
eor x22,x8,x20,lsl#1
3756
mov w7,0x87
3757
extr x9,x23,x23,#32
3758
extr x25,x23,x22,#63
3759
and w8,w7,w9,asr#31
3760
eor x24,x8,x22,lsl#1
3761
mov w7,0x87
3762
extr x9,x25,x25,#32
3763
extr x27,x25,x24,#63
3764
and w8,w7,w9,asr#31
3765
eor x26,x8,x24,lsl#1
3766
.Lxts_8_blocks_process:
3767
cmp x2,#8
3768
mov v16.d[0],x12
3769
mov v16.d[1],x13
3770
#ifdef __AARCH64EB__
3771
rev32 v16.16b,v16.16b
3772
#endif
3773
mov w7,0x87
3774
extr x9,x27,x27,#32
3775
extr x13,x27,x26,#63
3776
and w8,w7,w9,asr#31
3777
eor x12,x8,x26,lsl#1
3778
mov v17.d[0],x14
3779
mov v17.d[1],x15
3780
#ifdef __AARCH64EB__
3781
rev32 v17.16b,v17.16b
3782
#endif
3783
mov w7,0x87
3784
extr x9,x13,x13,#32
3785
extr x15,x13,x12,#63
3786
and w8,w7,w9,asr#31
3787
eor x14,x8,x12,lsl#1
3788
mov v18.d[0],x16
3789
mov v18.d[1],x17
3790
#ifdef __AARCH64EB__
3791
rev32 v18.16b,v18.16b
3792
#endif
3793
mov w7,0x87
3794
extr x9,x15,x15,#32
3795
extr x17,x15,x14,#63
3796
and w8,w7,w9,asr#31
3797
eor x16,x8,x14,lsl#1
3798
mov v19.d[0],x18
3799
mov v19.d[1],x19
3800
#ifdef __AARCH64EB__
3801
rev32 v19.16b,v19.16b
3802
#endif
3803
mov w7,0x87
3804
extr x9,x17,x17,#32
3805
extr x19,x17,x16,#63
3806
and w8,w7,w9,asr#31
3807
eor x18,x8,x16,lsl#1
3808
mov v20.d[0],x20
3809
mov v20.d[1],x21
3810
#ifdef __AARCH64EB__
3811
rev32 v20.16b,v20.16b
3812
#endif
3813
mov w7,0x87
3814
extr x9,x19,x19,#32
3815
extr x21,x19,x18,#63
3816
and w8,w7,w9,asr#31
3817
eor x20,x8,x18,lsl#1
3818
mov v21.d[0],x22
3819
mov v21.d[1],x23
3820
#ifdef __AARCH64EB__
3821
rev32 v21.16b,v21.16b
3822
#endif
3823
mov w7,0x87
3824
extr x9,x21,x21,#32
3825
extr x23,x21,x20,#63
3826
and w8,w7,w9,asr#31
3827
eor x22,x8,x20,lsl#1
3828
mov v22.d[0],x24
3829
mov v22.d[1],x25
3830
#ifdef __AARCH64EB__
3831
rev32 v22.16b,v22.16b
3832
#endif
3833
mov w7,0x87
3834
extr x9,x23,x23,#32
3835
extr x25,x23,x22,#63
3836
and w8,w7,w9,asr#31
3837
eor x24,x8,x22,lsl#1
3838
mov v23.d[0],x26
3839
mov v23.d[1],x27
3840
#ifdef __AARCH64EB__
3841
rev32 v23.16b,v23.16b
3842
#endif
3843
mov w7,0x87
3844
extr x9,x25,x25,#32
3845
extr x27,x25,x24,#63
3846
and w8,w7,w9,asr#31
3847
eor x26,x8,x24,lsl#1
3848
b.lt .Lxts_4_blocks_process
3849
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3850
eor v4.16b, v4.16b, v16.16b
3851
eor v5.16b, v5.16b, v17.16b
3852
eor v6.16b, v6.16b, v18.16b
3853
eor v7.16b, v7.16b, v19.16b
3854
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
3855
eor v8.16b, v8.16b, v20.16b
3856
eor v9.16b, v9.16b, v21.16b
3857
eor v10.16b, v10.16b, v22.16b
3858
eor v11.16b, v11.16b, v23.16b
3859
#ifndef __AARCH64EB__
3860
rev32 v4.16b,v4.16b
3861
#endif
3862
#ifndef __AARCH64EB__
3863
rev32 v5.16b,v5.16b
3864
#endif
3865
#ifndef __AARCH64EB__
3866
rev32 v6.16b,v6.16b
3867
#endif
3868
#ifndef __AARCH64EB__
3869
rev32 v7.16b,v7.16b
3870
#endif
3871
#ifndef __AARCH64EB__
3872
rev32 v8.16b,v8.16b
3873
#endif
3874
#ifndef __AARCH64EB__
3875
rev32 v9.16b,v9.16b
3876
#endif
3877
#ifndef __AARCH64EB__
3878
rev32 v10.16b,v10.16b
3879
#endif
3880
#ifndef __AARCH64EB__
3881
rev32 v11.16b,v11.16b
3882
#endif
3883
zip1 v0.4s,v4.4s,v5.4s
3884
zip2 v1.4s,v4.4s,v5.4s
3885
zip1 v2.4s,v6.4s,v7.4s
3886
zip2 v3.4s,v6.4s,v7.4s
3887
zip1 v4.2d,v0.2d,v2.2d
3888
zip2 v5.2d,v0.2d,v2.2d
3889
zip1 v6.2d,v1.2d,v3.2d
3890
zip2 v7.2d,v1.2d,v3.2d
3891
zip1 v0.4s,v8.4s,v9.4s
3892
zip2 v1.4s,v8.4s,v9.4s
3893
zip1 v2.4s,v10.4s,v11.4s
3894
zip2 v3.4s,v10.4s,v11.4s
3895
zip1 v8.2d,v0.2d,v2.2d
3896
zip2 v9.2d,v0.2d,v2.2d
3897
zip1 v10.2d,v1.2d,v3.2d
3898
zip2 v11.2d,v1.2d,v3.2d
3899
bl _vpsm4_ex_enc_8blks
3900
zip1 v8.4s,v0.4s,v1.4s
3901
zip2 v9.4s,v0.4s,v1.4s
3902
zip1 v10.4s,v2.4s,v3.4s
3903
zip2 v11.4s,v2.4s,v3.4s
3904
zip1 v0.2d,v8.2d,v10.2d
3905
zip2 v1.2d,v8.2d,v10.2d
3906
zip1 v2.2d,v9.2d,v11.2d
3907
zip2 v3.2d,v9.2d,v11.2d
3908
zip1 v8.4s,v4.4s,v5.4s
3909
zip2 v9.4s,v4.4s,v5.4s
3910
zip1 v10.4s,v6.4s,v7.4s
3911
zip2 v11.4s,v6.4s,v7.4s
3912
zip1 v4.2d,v8.2d,v10.2d
3913
zip2 v5.2d,v8.2d,v10.2d
3914
zip1 v6.2d,v9.2d,v11.2d
3915
zip2 v7.2d,v9.2d,v11.2d
3916
eor v0.16b, v0.16b, v16.16b
3917
eor v1.16b, v1.16b, v17.16b
3918
eor v2.16b, v2.16b, v18.16b
3919
eor v3.16b, v3.16b, v19.16b
3920
eor v4.16b, v4.16b, v20.16b
3921
eor v5.16b, v5.16b, v21.16b
3922
eor v6.16b, v6.16b, v22.16b
3923
eor v7.16b, v7.16b, v23.16b
3924
3925
// save the last tweak
3926
mov v25.16b,v23.16b
3927
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3928
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
3929
subs x2,x2,#8
3930
b.gt .Lxts_8_blocks_process
3931
b 100f
3932
.Lxts_4_blocks_process:
3933
cmp x2,#4
3934
b.lt 1f
3935
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3936
eor v4.16b, v4.16b, v16.16b
3937
eor v5.16b, v5.16b, v17.16b
3938
eor v6.16b, v6.16b, v18.16b
3939
eor v7.16b, v7.16b, v19.16b
3940
#ifndef __AARCH64EB__
3941
rev32 v4.16b,v4.16b
3942
#endif
3943
#ifndef __AARCH64EB__
3944
rev32 v5.16b,v5.16b
3945
#endif
3946
#ifndef __AARCH64EB__
3947
rev32 v6.16b,v6.16b
3948
#endif
3949
#ifndef __AARCH64EB__
3950
rev32 v7.16b,v7.16b
3951
#endif
3952
zip1 v0.4s,v4.4s,v5.4s
3953
zip2 v1.4s,v4.4s,v5.4s
3954
zip1 v2.4s,v6.4s,v7.4s
3955
zip2 v3.4s,v6.4s,v7.4s
3956
zip1 v4.2d,v0.2d,v2.2d
3957
zip2 v5.2d,v0.2d,v2.2d
3958
zip1 v6.2d,v1.2d,v3.2d
3959
zip2 v7.2d,v1.2d,v3.2d
3960
bl _vpsm4_ex_enc_4blks
3961
zip1 v4.4s,v0.4s,v1.4s
3962
zip2 v5.4s,v0.4s,v1.4s
3963
zip1 v6.4s,v2.4s,v3.4s
3964
zip2 v7.4s,v2.4s,v3.4s
3965
zip1 v0.2d,v4.2d,v6.2d
3966
zip2 v1.2d,v4.2d,v6.2d
3967
zip1 v2.2d,v5.2d,v7.2d
3968
zip2 v3.2d,v5.2d,v7.2d
3969
eor v0.16b, v0.16b, v16.16b
3970
eor v1.16b, v1.16b, v17.16b
3971
eor v2.16b, v2.16b, v18.16b
3972
eor v3.16b, v3.16b, v19.16b
3973
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3974
sub x2,x2,#4
3975
mov v16.16b,v20.16b
3976
mov v17.16b,v21.16b
3977
mov v18.16b,v22.16b
3978
// save the last tweak
3979
mov v25.16b,v19.16b
3980
1:
3981
// process last block
3982
cmp x2,#1
3983
b.lt 100f
3984
b.gt 1f
3985
ld1 {v4.4s},[x0],#16
3986
eor v4.16b, v4.16b, v16.16b
3987
#ifndef __AARCH64EB__
3988
rev32 v4.16b,v4.16b
3989
#endif
3990
mov x10,x3
3991
mov w11,#8
3992
mov w12,v4.s[0]
3993
mov w13,v4.s[1]
3994
mov w14,v4.s[2]
3995
mov w15,v4.s[3]
3996
10:
3997
ldp w7,w8,[x10],8
3998
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3999
eor w6,w14,w15
4000
eor w9,w7,w13
4001
eor w6,w6,w9
4002
mov v3.s[0],w6
4003
// optimize sbox using AESE instruction
4004
tbl v0.16b, {v3.16b}, v26.16b
4005
ushr v2.16b, v0.16b, 4
4006
and v0.16b, v0.16b, v31.16b
4007
tbl v0.16b, {v28.16b}, v0.16b
4008
tbl v2.16b, {v27.16b}, v2.16b
4009
eor v0.16b, v0.16b, v2.16b
4010
eor v1.16b, v1.16b, v1.16b
4011
aese v0.16b,v1.16b
4012
ushr v2.16b, v0.16b, 4
4013
and v0.16b, v0.16b, v31.16b
4014
tbl v0.16b, {v30.16b}, v0.16b
4015
tbl v2.16b, {v29.16b}, v2.16b
4016
eor v0.16b, v0.16b, v2.16b
4017
4018
mov w7,v0.s[0]
4019
eor w6,w7,w7,ror #32-2
4020
eor w6,w6,w7,ror #32-10
4021
eor w6,w6,w7,ror #32-18
4022
eor w6,w6,w7,ror #32-24
4023
eor w12,w12,w6
4024
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
4025
eor w6,w14,w15
4026
eor w9,w12,w8
4027
eor w6,w6,w9
4028
mov v3.s[0],w6
4029
// optimize sbox using AESE instruction
4030
tbl v0.16b, {v3.16b}, v26.16b
4031
ushr v2.16b, v0.16b, 4
4032
and v0.16b, v0.16b, v31.16b
4033
tbl v0.16b, {v28.16b}, v0.16b
4034
tbl v2.16b, {v27.16b}, v2.16b
4035
eor v0.16b, v0.16b, v2.16b
4036
eor v1.16b, v1.16b, v1.16b
4037
aese v0.16b,v1.16b
4038
ushr v2.16b, v0.16b, 4
4039
and v0.16b, v0.16b, v31.16b
4040
tbl v0.16b, {v30.16b}, v0.16b
4041
tbl v2.16b, {v29.16b}, v2.16b
4042
eor v0.16b, v0.16b, v2.16b
4043
4044
mov w7,v0.s[0]
4045
eor w6,w7,w7,ror #32-2
4046
eor w6,w6,w7,ror #32-10
4047
eor w6,w6,w7,ror #32-18
4048
eor w6,w6,w7,ror #32-24
4049
ldp w7,w8,[x10],8
4050
eor w13,w13,w6
4051
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
4052
eor w6,w12,w13
4053
eor w9,w7,w15
4054
eor w6,w6,w9
4055
mov v3.s[0],w6
4056
// optimize sbox using AESE instruction
4057
tbl v0.16b, {v3.16b}, v26.16b
4058
ushr v2.16b, v0.16b, 4
4059
and v0.16b, v0.16b, v31.16b
4060
tbl v0.16b, {v28.16b}, v0.16b
4061
tbl v2.16b, {v27.16b}, v2.16b
4062
eor v0.16b, v0.16b, v2.16b
4063
eor v1.16b, v1.16b, v1.16b
4064
aese v0.16b,v1.16b
4065
ushr v2.16b, v0.16b, 4
4066
and v0.16b, v0.16b, v31.16b
4067
tbl v0.16b, {v30.16b}, v0.16b
4068
tbl v2.16b, {v29.16b}, v2.16b
4069
eor v0.16b, v0.16b, v2.16b
4070
4071
mov w7,v0.s[0]
4072
eor w6,w7,w7,ror #32-2
4073
eor w6,w6,w7,ror #32-10
4074
eor w6,w6,w7,ror #32-18
4075
eor w6,w6,w7,ror #32-24
4076
eor w14,w14,w6
4077
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4078
eor w6,w12,w13
4079
eor w9,w14,w8
4080
eor w6,w6,w9
4081
mov v3.s[0],w6
4082
// optimize sbox using AESE instruction
4083
tbl v0.16b, {v3.16b}, v26.16b
4084
ushr v2.16b, v0.16b, 4
4085
and v0.16b, v0.16b, v31.16b
4086
tbl v0.16b, {v28.16b}, v0.16b
4087
tbl v2.16b, {v27.16b}, v2.16b
4088
eor v0.16b, v0.16b, v2.16b
4089
eor v1.16b, v1.16b, v1.16b
4090
aese v0.16b,v1.16b
4091
ushr v2.16b, v0.16b, 4
4092
and v0.16b, v0.16b, v31.16b
4093
tbl v0.16b, {v30.16b}, v0.16b
4094
tbl v2.16b, {v29.16b}, v2.16b
4095
eor v0.16b, v0.16b, v2.16b
4096
4097
mov w7,v0.s[0]
4098
eor w6,w7,w7,ror #32-2
4099
eor w6,w6,w7,ror #32-10
4100
eor w6,w6,w7,ror #32-18
4101
eor w6,w6,w7,ror #32-24
4102
eor w15,w15,w6
4103
subs w11,w11,#1
4104
b.ne 10b
4105
mov v4.s[0],w15
4106
mov v4.s[1],w14
4107
mov v4.s[2],w13
4108
mov v4.s[3],w12
4109
#ifndef __AARCH64EB__
4110
rev32 v4.16b,v4.16b
4111
#endif
4112
eor v4.16b, v4.16b, v16.16b
4113
st1 {v4.4s},[x1],#16
4114
// save the last tweak
4115
mov v25.16b,v16.16b
4116
b 100f
4117
1: // process last 2 blocks
4118
cmp x2,#2
4119
b.gt 1f
4120
ld1 {v4.4s,v5.4s},[x0],#32
4121
eor v4.16b, v4.16b, v16.16b
4122
eor v5.16b, v5.16b, v17.16b
4123
#ifndef __AARCH64EB__
4124
rev32 v4.16b,v4.16b
4125
#endif
4126
#ifndef __AARCH64EB__
4127
rev32 v5.16b,v5.16b
4128
#endif
4129
zip1 v0.4s,v4.4s,v5.4s
4130
zip2 v1.4s,v4.4s,v5.4s
4131
zip1 v2.4s,v6.4s,v7.4s
4132
zip2 v3.4s,v6.4s,v7.4s
4133
zip1 v4.2d,v0.2d,v2.2d
4134
zip2 v5.2d,v0.2d,v2.2d
4135
zip1 v6.2d,v1.2d,v3.2d
4136
zip2 v7.2d,v1.2d,v3.2d
4137
bl _vpsm4_ex_enc_4blks
4138
zip1 v4.4s,v0.4s,v1.4s
4139
zip2 v5.4s,v0.4s,v1.4s
4140
zip1 v6.4s,v2.4s,v3.4s
4141
zip2 v7.4s,v2.4s,v3.4s
4142
zip1 v0.2d,v4.2d,v6.2d
4143
zip2 v1.2d,v4.2d,v6.2d
4144
zip1 v2.2d,v5.2d,v7.2d
4145
zip2 v3.2d,v5.2d,v7.2d
4146
eor v0.16b, v0.16b, v16.16b
4147
eor v1.16b, v1.16b, v17.16b
4148
st1 {v0.4s,v1.4s},[x1],#32
4149
// save the last tweak
4150
mov v25.16b,v17.16b
4151
b 100f
4152
1: // process last 3 blocks
4153
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
4154
eor v4.16b, v4.16b, v16.16b
4155
eor v5.16b, v5.16b, v17.16b
4156
eor v6.16b, v6.16b, v18.16b
4157
#ifndef __AARCH64EB__
4158
rev32 v4.16b,v4.16b
4159
#endif
4160
#ifndef __AARCH64EB__
4161
rev32 v5.16b,v5.16b
4162
#endif
4163
#ifndef __AARCH64EB__
4164
rev32 v6.16b,v6.16b
4165
#endif
4166
zip1 v0.4s,v4.4s,v5.4s
4167
zip2 v1.4s,v4.4s,v5.4s
4168
zip1 v2.4s,v6.4s,v7.4s
4169
zip2 v3.4s,v6.4s,v7.4s
4170
zip1 v4.2d,v0.2d,v2.2d
4171
zip2 v5.2d,v0.2d,v2.2d
4172
zip1 v6.2d,v1.2d,v3.2d
4173
zip2 v7.2d,v1.2d,v3.2d
4174
bl _vpsm4_ex_enc_4blks
4175
zip1 v4.4s,v0.4s,v1.4s
4176
zip2 v5.4s,v0.4s,v1.4s
4177
zip1 v6.4s,v2.4s,v3.4s
4178
zip2 v7.4s,v2.4s,v3.4s
4179
zip1 v0.2d,v4.2d,v6.2d
4180
zip2 v1.2d,v4.2d,v6.2d
4181
zip1 v2.2d,v5.2d,v7.2d
4182
zip2 v3.2d,v5.2d,v7.2d
4183
eor v0.16b, v0.16b, v16.16b
4184
eor v1.16b, v1.16b, v17.16b
4185
eor v2.16b, v2.16b, v18.16b
4186
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
4187
// save the last tweak
4188
mov v25.16b,v18.16b
4189
100:
4190
cmp x29,0
4191
b.eq .return
4192
4193
// This branch calculates the last two tweaks,
4194
// while the encryption/decryption length is larger than 32
4195
.last_2blks_tweak:
4196
#ifdef __AARCH64EB__
4197
rev32 v25.16b,v25.16b
4198
#endif
4199
mov v2.16b,v25.16b
4200
adrp x9, .Lxts_magic
4201
ldr q0, [x9, #:lo12:.Lxts_magic]
4202
shl v17.16b, v2.16b, #1
4203
ext v1.16b, v2.16b, v2.16b,#15
4204
ushr v1.16b, v1.16b, #7
4205
mul v1.16b, v1.16b, v0.16b
4206
eor v17.16b, v17.16b, v1.16b
4207
mov v2.16b,v17.16b
4208
adrp x9, .Lxts_magic
4209
ldr q0, [x9, #:lo12:.Lxts_magic]
4210
shl v18.16b, v2.16b, #1
4211
ext v1.16b, v2.16b, v2.16b,#15
4212
ushr v1.16b, v1.16b, #7
4213
mul v1.16b, v1.16b, v0.16b
4214
eor v18.16b, v18.16b, v1.16b
4215
b .check_dec


// This branch calculates the last two tweaks
// when the encryption/decryption length is exactly 32 bytes, which needs only two tweaks
.only_2blks_tweak:
mov v17.16b,v16.16b
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
mov v2.16b,v17.16b
adrp x9, .Lxts_magic
ldr q0, [x9, #:lo12:.Lxts_magic]
shl v18.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v18.16b, v18.16b, v1.16b
b .check_dec


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec:
// encryption:1 decryption:0
cmp w28,1
b.eq .process_last_2blks
mov v0.16B,v17.16b
mov v17.16B,v18.16b
mov v18.16B,v0.16b

.process_last_2blks:
#ifdef __AARCH64EB__
rev32 v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
rev32 v18.16b,v18.16b
#endif
ld1 {v4.4s},[x0],#16
eor v4.16b, v4.16b, v17.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
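// scalar SM4 on the second-to-last block: 8 iterations of the loop below,
// 4 rounds each (32 rounds total), two round keys fetched per ldp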
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v17.16b
st1 {v4.4s},[x1],#16

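// ciphertext stealing: x26 points at the ciphertext block just written;
// swap its leading bytes with the remaining input tail, emitting the
// displaced ciphertext bytes as the final partial output block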
sub x26,x1,16
.loop:
subs x29,x29,1
ldrb w7,[x26,x29]
ldrb w8,[x0,x29]
strb w8,[x26,x29]
strb w7,[x1,x29]
b.gt .loop
ld1 {v4.4s}, [x26]
eor v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
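// re-encrypt the reassembled block with the last tweak (v18), using the
// same 32-round scalar SM4 body as above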
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
mov v3.s[0],w6
// optimize sbox using AESE instruction
tbl v0.16b, {v3.16b}, v26.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v28.16b}, v0.16b
tbl v2.16b, {v27.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v1.16b
aese v0.16b,v1.16b
ushr v2.16b, v0.16b, 4
and v0.16b, v0.16b, v31.16b
tbl v0.16b, {v30.16b}, v0.16b
tbl v2.16b, {v29.16b}, v2.16b
eor v0.16b, v0.16b, v2.16b

mov w7,v0.s[0]
eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v18.16b
st1 {v4.4s}, [x26]
.return:
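// epilogue: restore callee-saved FP/SIMD and general-purpose register
// pairs in the reverse order of the prologue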
ldp d14, d15, [sp], #0x10
ldp d12, d13, [sp], #0x10
ldp d10, d11, [sp], #0x10
ldp d8, d9, [sp], #0x10
ldp x29, x30, [sp], #0x10
ldp x27, x28, [sp], #0x10
ldp x25, x26, [sp], #0x10
ldp x23, x24, [sp], #0x10
ldp x21, x22, [sp], #0x10
ldp x19, x20, [sp], #0x10
ldp x17, x18, [sp], #0x10
ldp x15, x16, [sp], #0x10
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt