GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/vpsm4-armv8.S
/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */
// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD on aarch64
//
// Feb 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch armv8-a
.text

.section .rodata
.type _vpsm4_consts,%object
.align 7
_vpsm4_consts:
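// Constant pool layout (the standard SM4 constants from the GB/T 32907
// specification, plus helper masks used by this module):
//   .Lsbox      - the 256-byte SM4 S-box, arranged as 16 rows of 16 bytes
//                 so it can live in v16-v31 for tbl/tbx lookups
//   .Lck        - the 32 key-schedule round constants CK[i]
//                 (byte j of CK[i] is (4*i+j)*7 mod 256)
//   .Lfk        - the FK key-schedule constants A3B1BAC6, 56AA3350,
//                 677D9197, B27022DC, stored little-endian in two quadwords
//   .Lshuffles  - a byte-permutation mask used to rotate the key state
//   .Lxts_magic - the GF(2^128) reduction constant for XTS tweaks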
.Lsbox:
.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
.quad 0x0101010101010187,0x0101010101010101

.size _vpsm4_consts,.-_vpsm4_consts

.previous

.type _vpsm4_set_key,%function
.align 4
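// _vpsm4_set_key: expand the 128-bit user key into 32 round keys.
// Register usage (inferred from the callers below, not an ABI contract):
//   x0 - user key (16 bytes), x1 - round-key output buffer,
//   w2 - direction: non-zero stores rk[0..31] in order (encryption),
//        zero stores them in reverse so decryption can walk the
//        schedule backwards.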
_vpsm4_set_key:
AARCH64_VALID_CALL_TARGET
ld1 {v5.4s},[x0]
adrp x10,.Lsbox
add x10,x10,#:lo12:.Lsbox
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
adrp x5,.Lshuffles
add x5,x5,#:lo12:.Lshuffles
ld1 {v7.2d},[x5]
adrp x5,.Lfk
add x5,x5,#:lo12:.Lfk
ld1 {v6.2d},[x5]
eor v5.16b,v5.16b,v6.16b
mov x6,#32
adrp x5,.Lck
add x5,x5,#:lo12:.Lck
movi v0.16b,#64
cbnz w2,1f
add x1,x1,124
1:
mov w7,v5.s[1]
ldr w8,[x5],#4
eor w8,w8,w7
mov w7,v5.s[2]
eor w8,w8,w7
mov w7,v5.s[3]
eor w8,w8,w7
// sbox lookup
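// The 256-byte S-box spans v16-v31. tbl handles indices 0-63 from the
// first four registers; each later 64-byte quarter is reached by
// subtracting 64 from the index and using tbx, which leaves lanes that
// already matched (now underflowed) untouched.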
mov v4.s[0],w8
tbl v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
sub v4.16b,v4.16b,v0.16b
tbx v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
sub v4.16b,v4.16b,v0.16b
tbx v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
sub v4.16b,v4.16b,v0.16b
tbx v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
mov w7,v1.s[0]
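// key-schedule linear transform: L'(B) = B ^ (B <<< 13) ^ (B <<< 23),
// expressed below as right rotations (ror #19 == rol #13, ror #9 == rol #23)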
eor w8,w7,w7,ror #19
eor w8,w8,w7,ror #9
mov w7,v5.s[0]
eor w8,w8,w7
mov v5.s[0],w8
cbz w2,2f
str w8,[x1],#4
b 3f
2:
str w8,[x1],#-4
3:
tbl v5.16b,{v5.16b},v7.16b
subs x6,x6,#1
b.ne 1b
ret
.size _vpsm4_set_key,.-_vpsm4_set_key
.type _vpsm4_enc_4blks,%function
.align 4
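// _vpsm4_enc_4blks: run the 32 SM4 rounds over four blocks in parallel.
// The callers load the blocks word-sliced (ld4), so v4/v5/v6/v7 hold the
// B0/B1/B2/B3 words of all four blocks; x3 points at the round keys and
// the results come back byte-reversed in v0-v3.  Here the S-box lookup
// uses four plain tbl lookups: out-of-range tbl lanes return zero, so
// the partial results can simply be summed.  Reference model of one
// round step (pseudo-C, for documentation only):
//   t  = B1 ^ B2 ^ B3 ^ rk;
//   t  = SBOX(t);                                      // per byte
//   B0 ^= t ^ rol32(t,2) ^ rol32(t,10) ^ rol32(t,18) ^ rol32(t,24);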
_vpsm4_enc_4blks:
AARCH64_VALID_CALL_TARGET
mov x10,x3
mov w11,#8
10:
ldp w7,w8,[x10],8
dup v12.4s,w7
dup v13.4s,w8

// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor v14.16b,v6.16b,v7.16b
eor v12.16b,v5.16b,v12.16b
eor v12.16b,v14.16b,v12.16b
movi v0.16b,#64
movi v1.16b,#128
movi v2.16b,#192
sub v0.16b,v12.16b,v0.16b
sub v1.16b,v12.16b,v1.16b
sub v2.16b,v12.16b,v2.16b
tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v0.2d,v0.2d,v1.2d
add v2.2d,v2.2d,v12.2d
add v12.2d,v0.2d,v2.2d
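// linear transform L: C = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24);
// each rotate is built from a ushr/sli pair (shift right by 32-n, then
// shift-left-insert by n)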

ushr v0.4s,v12.4s,32-2
sli v0.4s,v12.4s,2
ushr v2.4s,v12.4s,32-10
eor v1.16b,v0.16b,v12.16b
sli v2.4s,v12.4s,10
eor v1.16b,v2.16b,v1.16b
ushr v0.4s,v12.4s,32-18
sli v0.4s,v12.4s,18
ushr v2.4s,v12.4s,32-24
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v12.4s,24
eor v12.16b,v2.16b,v1.16b
eor v4.16b,v4.16b,v12.16b

// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor v14.16b,v14.16b,v4.16b
eor v13.16b,v14.16b,v13.16b
movi v0.16b,#64
movi v1.16b,#128
movi v2.16b,#192
sub v0.16b,v13.16b,v0.16b
sub v1.16b,v13.16b,v1.16b
sub v2.16b,v13.16b,v2.16b
tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v0.2d,v0.2d,v1.2d
add v2.2d,v2.2d,v13.2d
add v13.2d,v0.2d,v2.2d

ushr v0.4s,v13.4s,32-2
sli v0.4s,v13.4s,2
ushr v2.4s,v13.4s,32-10
eor v1.16b,v0.16b,v13.16b
sli v2.4s,v13.4s,10
eor v1.16b,v2.16b,v1.16b
ushr v0.4s,v13.4s,32-18
sli v0.4s,v13.4s,18
ushr v2.4s,v13.4s,32-24
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,24
eor v13.16b,v2.16b,v1.16b
ldp w7,w8,[x10],8
eor v5.16b,v5.16b,v13.16b

dup v12.4s,w7
dup v13.4s,w8

// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor v14.16b,v4.16b,v5.16b
eor v12.16b,v7.16b,v12.16b
eor v12.16b,v14.16b,v12.16b
movi v0.16b,#64
movi v1.16b,#128
movi v2.16b,#192
sub v0.16b,v12.16b,v0.16b
sub v1.16b,v12.16b,v1.16b
sub v2.16b,v12.16b,v2.16b
tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v0.2d,v0.2d,v1.2d
add v2.2d,v2.2d,v12.2d
add v12.2d,v0.2d,v2.2d

ushr v0.4s,v12.4s,32-2
sli v0.4s,v12.4s,2
ushr v2.4s,v12.4s,32-10
eor v1.16b,v0.16b,v12.16b
sli v2.4s,v12.4s,10
eor v1.16b,v2.16b,v1.16b
ushr v0.4s,v12.4s,32-18
sli v0.4s,v12.4s,18
ushr v2.4s,v12.4s,32-24
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v12.4s,24
eor v12.16b,v2.16b,v1.16b
eor v6.16b,v6.16b,v12.16b

// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor v14.16b,v14.16b,v6.16b
eor v13.16b,v14.16b,v13.16b
movi v0.16b,#64
movi v1.16b,#128
movi v2.16b,#192
sub v0.16b,v13.16b,v0.16b
sub v1.16b,v13.16b,v1.16b
sub v2.16b,v13.16b,v2.16b
tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v0.2d,v0.2d,v1.2d
add v2.2d,v2.2d,v13.2d
add v13.2d,v0.2d,v2.2d

ushr v0.4s,v13.4s,32-2
sli v0.4s,v13.4s,2
ushr v2.4s,v13.4s,32-10
eor v1.16b,v0.16b,v13.16b
sli v2.4s,v13.4s,10
eor v1.16b,v2.16b,v1.16b
ushr v0.4s,v13.4s,32-18
sli v0.4s,v13.4s,18
ushr v2.4s,v13.4s,32-24
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,24
eor v13.16b,v2.16b,v1.16b
eor v7.16b,v7.16b,v13.16b
subs w11,w11,#1
b.ne 10b
#ifndef __AARCH64EB__
rev32 v3.16b,v4.16b
#else
mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v2.16b,v5.16b
#else
mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v1.16b,v6.16b
#else
mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v0.16b,v7.16b
#else
mov v0.16b,v7.16b
#endif
ret
.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
.type _vpsm4_enc_8blks,%function
.align 4
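// _vpsm4_enc_8blks: same round structure as _vpsm4_enc_4blks but over
// eight blocks, with the second group of four kept in v8-v11 and its
// intermediates in v13/v15 so both groups move through each round
// together.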
_vpsm4_enc_8blks:
AARCH64_VALID_CALL_TARGET
mov x10,x3
mov w11,#8
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
dup v12.4s,w7
eor v14.16b,v6.16b,v7.16b
eor v15.16b,v10.16b,v11.16b
eor v0.16b,v5.16b,v12.16b
eor v1.16b,v9.16b,v12.16b
eor v12.16b,v14.16b,v0.16b
eor v13.16b,v15.16b,v1.16b
movi v3.16b,#64
sub v0.16b,v12.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v12.2d,v2.2d,v12.2d
add v12.2d,v1.2d,v12.2d

sub v0.16b,v13.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v13.2d,v2.2d,v13.2d
add v13.2d,v1.2d,v13.2d

ushr v0.4s,v12.4s,32-2
sli v0.4s,v12.4s,2
ushr v2.4s,v13.4s,32-2
eor v1.16b,v0.16b,v12.16b
sli v2.4s,v13.4s,2

ushr v0.4s,v12.4s,32-10
eor v3.16b,v2.16b,v13.16b
sli v0.4s,v12.4s,10
ushr v2.4s,v13.4s,32-10
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,10

ushr v0.4s,v12.4s,32-18
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,18
ushr v2.4s,v13.4s,32-18
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,18

ushr v0.4s,v12.4s,32-24
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,24
ushr v2.4s,v13.4s,32-24
eor v12.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,24
eor v13.16b,v2.16b,v3.16b
eor v4.16b,v4.16b,v12.16b
eor v8.16b,v8.16b,v13.16b

// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
dup v13.4s,w8
eor v14.16b,v14.16b,v4.16b
eor v15.16b,v15.16b,v8.16b
eor v12.16b,v14.16b,v13.16b
eor v13.16b,v15.16b,v13.16b
movi v3.16b,#64
sub v0.16b,v12.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v12.2d,v2.2d,v12.2d
add v12.2d,v1.2d,v12.2d

sub v0.16b,v13.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v13.2d,v2.2d,v13.2d
add v13.2d,v1.2d,v13.2d

ushr v0.4s,v12.4s,32-2
sli v0.4s,v12.4s,2
ushr v2.4s,v13.4s,32-2
eor v1.16b,v0.16b,v12.16b
sli v2.4s,v13.4s,2

ushr v0.4s,v12.4s,32-10
eor v3.16b,v2.16b,v13.16b
sli v0.4s,v12.4s,10
ushr v2.4s,v13.4s,32-10
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,10

ushr v0.4s,v12.4s,32-18
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,18
ushr v2.4s,v13.4s,32-18
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,18

ushr v0.4s,v12.4s,32-24
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,24
ushr v2.4s,v13.4s,32-24
eor v12.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,24
eor v13.16b,v2.16b,v3.16b
ldp w7,w8,[x10],8
eor v5.16b,v5.16b,v12.16b
eor v9.16b,v9.16b,v13.16b

// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
dup v12.4s,w7
eor v14.16b,v4.16b,v5.16b
eor v15.16b,v8.16b,v9.16b
eor v0.16b,v7.16b,v12.16b
eor v1.16b,v11.16b,v12.16b
eor v12.16b,v14.16b,v0.16b
eor v13.16b,v15.16b,v1.16b
movi v3.16b,#64
sub v0.16b,v12.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v12.2d,v2.2d,v12.2d
add v12.2d,v1.2d,v12.2d

sub v0.16b,v13.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v13.2d,v2.2d,v13.2d
add v13.2d,v1.2d,v13.2d

ushr v0.4s,v12.4s,32-2
sli v0.4s,v12.4s,2
ushr v2.4s,v13.4s,32-2
eor v1.16b,v0.16b,v12.16b
sli v2.4s,v13.4s,2

ushr v0.4s,v12.4s,32-10
eor v3.16b,v2.16b,v13.16b
sli v0.4s,v12.4s,10
ushr v2.4s,v13.4s,32-10
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,10

ushr v0.4s,v12.4s,32-18
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,18
ushr v2.4s,v13.4s,32-18
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,18

ushr v0.4s,v12.4s,32-24
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,24
ushr v2.4s,v13.4s,32-24
eor v12.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,24
eor v13.16b,v2.16b,v3.16b
eor v6.16b,v6.16b,v12.16b
eor v10.16b,v10.16b,v13.16b

// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
dup v13.4s,w8
eor v14.16b,v14.16b,v6.16b
eor v15.16b,v15.16b,v10.16b
eor v12.16b,v14.16b,v13.16b
eor v13.16b,v15.16b,v13.16b
movi v3.16b,#64
sub v0.16b,v12.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v12.2d,v2.2d,v12.2d
add v12.2d,v1.2d,v12.2d

sub v0.16b,v13.16b,v3.16b
sub v1.16b,v0.16b,v3.16b
sub v2.16b,v1.16b,v3.16b
tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
add v1.2d,v0.2d,v1.2d
add v13.2d,v2.2d,v13.2d
add v13.2d,v1.2d,v13.2d

ushr v0.4s,v12.4s,32-2
sli v0.4s,v12.4s,2
ushr v2.4s,v13.4s,32-2
eor v1.16b,v0.16b,v12.16b
sli v2.4s,v13.4s,2

ushr v0.4s,v12.4s,32-10
eor v3.16b,v2.16b,v13.16b
sli v0.4s,v12.4s,10
ushr v2.4s,v13.4s,32-10
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,10

ushr v0.4s,v12.4s,32-18
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,18
ushr v2.4s,v13.4s,32-18
eor v1.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,18

ushr v0.4s,v12.4s,32-24
eor v3.16b,v2.16b,v3.16b
sli v0.4s,v12.4s,24
ushr v2.4s,v13.4s,32-24
eor v12.16b,v0.16b,v1.16b
sli v2.4s,v13.4s,24
eor v13.16b,v2.16b,v3.16b
eor v7.16b,v7.16b,v12.16b
eor v11.16b,v11.16b,v13.16b
subs w11,w11,#1
b.ne 10b
#ifndef __AARCH64EB__
rev32 v3.16b,v4.16b
#else
mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v2.16b,v5.16b
#else
mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v1.16b,v6.16b
#else
mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v0.16b,v7.16b
#else
mov v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v8.16b
#else
mov v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v9.16b
#else
mov v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v10.16b
#else
mov v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v4.16b,v11.16b
#else
mov v4.16b,v11.16b
#endif
ret
.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
.globl vpsm4_set_encrypt_key
.type vpsm4_set_encrypt_key,%function
.align 5
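// Public entry points.  vpsm4_set_encrypt_key stores the schedule
// forward; vpsm4_set_decrypt_key (below) stores it reversed, so both
// share _vpsm4_set_key and differ only in the w2 flag.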
vpsm4_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,1
bl _vpsm4_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key
.globl vpsm4_set_decrypt_key
.type vpsm4_set_decrypt_key,%function
.align 5
vpsm4_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
mov w2,0
bl _vpsm4_set_key
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key
.globl vpsm4_encrypt
.type vpsm4_encrypt,%function
.align 5
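// vpsm4_encrypt: single-block path (x0 - in, x1 - out, x2 - round keys).
// The state is kept in w12-w15 and the 32 rounds run as eight loop
// iterations of four, using one-lane tbl lookups for the S-box.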
vpsm4_encrypt:
AARCH64_VALID_CALL_TARGET
ld1 {v4.4s},[x0]
adrp x10,.Lsbox
add x10,x10,#:lo12:.Lsbox
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x3,x2
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
ret
.size vpsm4_encrypt,.-vpsm4_encrypt
.globl vpsm4_decrypt
.type vpsm4_decrypt,%function
.align 5
vpsm4_decrypt:
AARCH64_VALID_CALL_TARGET
ld1 {v4.4s},[x0]
adrp x10,.Lsbox
add x10,x10,#:lo12:.Lsbox
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x3,x2
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
ret
.size vpsm4_decrypt,.-vpsm4_decrypt
.globl vpsm4_ecb_encrypt
.type vpsm4_ecb_encrypt,%function
.align 5
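// vpsm4_ecb_encrypt: x0 - in, x1 - out, x2 - length in bytes,
// x3 - round keys.  Length is converted to blocks, then processed in
// chunks of 8 and 4 with the parallel helpers; a 1/2/3-block tail is
// handled separately at the end.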
vpsm4_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
// convert length into blocks
lsr x2,x2,4
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
adrp x10,.Lsbox
add x10,x10,#:lo12:.Lsbox
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
.Lecb_8_blocks_process:
cmp w2,#8
b.lt .Lecb_4_blocks_process
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
bl _vpsm4_enc_8blks
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#8
b.gt .Lecb_8_blocks_process
b 100f
.Lecb_4_blocks_process:
cmp w2,#4
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_enc_4blks
st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
sub w2,w2,#4
1:
// process last block
cmp w2,#1
b.lt 100f
b.gt 1f
ld1 {v4.4s},[x0]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
st1 {v4.4s},[x1]
b 100f
1: // process last 2 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
cmp w2,#2
b.gt 1f
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_enc_4blks
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
b 100f
1: // process last 3 blocks
ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
bl _vpsm4_enc_4blks
st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
ldp d14,d15,[sp,#48]
ldp x29,x30,[sp,#64]
ldp d8,d9,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt
.globl vpsm4_cbc_encrypt
.type vpsm4_cbc_encrypt,%function
.align 5
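// vpsm4_cbc_encrypt: x0 - in, x1 - out, x2 - length in bytes,
// x3 - round keys, x4 - IV, w5 - direction (non-zero = encrypt).
// Encryption is inherently serial (each block chains through the
// previous ciphertext), so it processes the chain block by block (the
// 4-block loop just batches loads and stores); decryption at .Ldec can
// batch 8 or 4 blocks because all chaining inputs are already known.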
vpsm4_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
lsr x2,x2,4
adrp x10,.Lsbox
add x10,x10,#:lo12:.Lsbox
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
cbz w5,.Ldec
ld1 {v3.4s},[x4]
.Lcbc_4_blocks_enc:
cmp w2,#4
b.lt 1f
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
eor v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
eor v5.16b,v5.16b,v4.16b
mov x10,x3
mov w11,#8
mov w12,v5.s[0]
mov w13,v5.s[1]
mov w14,v5.s[2]
mov w15,v5.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v5.s[0],w15
mov v5.s[1],w14
mov v5.s[2],w13
mov v5.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v6.16b,v6.16b,v5.16b
mov x10,x3
mov w11,#8
mov w12,v6.s[0]
mov w13,v6.s[1]
mov w14,v6.s[2]
mov w15,v6.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v6.s[0],w15
mov v6.s[1],w14
mov v6.s[2],w13
mov v6.s[3],w12
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
eor v7.16b,v7.16b,v6.16b
mov x10,x3
mov w11,#8
mov w12,v7.s[0]
mov w13,v7.s[1]
mov w14,v7.s[2]
mov w15,v7.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v7.s[0],w15
mov v7.s[1],w14
mov v7.s[2],w13
mov v7.s[3],w12
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
orr v3.16b,v7.16b,v7.16b
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
subs w2,w2,#4
b.ne .Lcbc_4_blocks_enc
b 2f
1:
subs w2,w2,#1
b.lt 2f
ld1 {v4.4s},[x0],#16
eor v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v3.s[0]
mov w13,v3.s[1]
mov w14,v3.s[2]
mov w15,v3.s[3]
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v3.s[0],w15
mov v3.s[1],w14
mov v3.s[2],w13
mov v3.s[3],w12
#ifndef __AARCH64EB__
rev32 v3.16b,v3.16b
#endif
st1 {v3.4s},[x1],#16
b 1b
2:
// save back IV
st1 {v3.4s},[x4]
ret

.Ldec:
// decryption mode starts
AARCH64_SIGN_LINK_REGISTER
stp d8,d9,[sp,#-80]!
stp d10,d11,[sp,#16]
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
cmp w2,#8
b.lt 1f
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
add x10,x0,#64
ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
rev32 v11.16b,v11.16b
#endif
bl _vpsm4_enc_8blks
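// the helper returns word-sliced output; the zip1/zip2 pairs below
// transpose the four 4x32-bit slices back into contiguous blocks
// before the ciphertext XOR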
2001
zip1 v8.4s,v0.4s,v1.4s
2002
zip2 v9.4s,v0.4s,v1.4s
2003
zip1 v10.4s,v2.4s,v3.4s
2004
zip2 v11.4s,v2.4s,v3.4s
2005
zip1 v0.2d,v8.2d,v10.2d
2006
zip2 v1.2d,v8.2d,v10.2d
2007
zip1 v2.2d,v9.2d,v11.2d
2008
zip2 v3.2d,v9.2d,v11.2d
2009
zip1 v8.4s,v4.4s,v5.4s
2010
zip2 v9.4s,v4.4s,v5.4s
2011
zip1 v10.4s,v6.4s,v7.4s
2012
zip2 v11.4s,v6.4s,v7.4s
2013
zip1 v4.2d,v8.2d,v10.2d
2014
zip2 v5.2d,v8.2d,v10.2d
2015
zip1 v6.2d,v9.2d,v11.2d
2016
zip2 v7.2d,v9.2d,v11.2d
2017
ld1 {v15.4s},[x4]
2018
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2019
// note ivec1 and vtmpx[3] are reusing the same register
2020
// care needs to be taken to avoid conflict
2021
eor v0.16b,v0.16b,v15.16b
2022
ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2023
eor v1.16b,v1.16b,v8.16b
2024
eor v2.16b,v2.16b,v9.16b
2025
eor v3.16b,v3.16b,v10.16b
2026
// save back IV
2027
st1 {v15.4s}, [x4]
2028
eor v4.16b,v4.16b,v11.16b
2029
eor v5.16b,v5.16b,v12.16b
2030
eor v6.16b,v6.16b,v13.16b
2031
eor v7.16b,v7.16b,v14.16b
2032
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2033
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2034
subs w2,w2,#8
2035
b.gt .Lcbc_8_blocks_dec
2036
b.eq 100f
2037
1:
2038
ld1 {v15.4s},[x4]
2039
.Lcbc_4_blocks_dec:
2040
cmp w2,#4
2041
b.lt 1f
2042
ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
2043
#ifndef __AARCH64EB__
2044
rev32 v4.16b,v4.16b
2045
#endif
2046
#ifndef __AARCH64EB__
2047
rev32 v5.16b,v5.16b
2048
#endif
2049
#ifndef __AARCH64EB__
2050
rev32 v6.16b,v6.16b
2051
#endif
2052
#ifndef __AARCH64EB__
2053
rev32 v7.16b,v7.16b
2054
#endif
2055
bl _vpsm4_enc_4blks
2056
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2057
zip1 v8.4s,v0.4s,v1.4s
2058
zip2 v9.4s,v0.4s,v1.4s
2059
zip1 v10.4s,v2.4s,v3.4s
2060
zip2 v11.4s,v2.4s,v3.4s
2061
zip1 v0.2d,v8.2d,v10.2d
2062
zip2 v1.2d,v8.2d,v10.2d
2063
zip1 v2.2d,v9.2d,v11.2d
2064
zip2 v3.2d,v9.2d,v11.2d
2065
eor v0.16b,v0.16b,v15.16b
2066
eor v1.16b,v1.16b,v4.16b
2067
orr v15.16b,v7.16b,v7.16b
2068
eor v2.16b,v2.16b,v5.16b
2069
eor v3.16b,v3.16b,v6.16b
2070
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2071
subs w2,w2,#4
2072
b.gt .Lcbc_4_blocks_dec
2073
// save back IV
2074
st1 {v7.4s}, [x4]
2075
b 100f
2076
1: // last block
2077
subs w2,w2,#1
2078
b.lt 100f
2079
b.gt 1f
2080
ld1 {v4.4s},[x0],#16
2081
// save back IV
2082
st1 {v4.4s}, [x4]
2083
#ifndef __AARCH64EB__
2084
rev32 v8.16b,v4.16b
2085
#else
2086
mov v8.16b,v4.16b
2087
#endif
2088
mov x10,x3
2089
mov w11,#8
2090
mov w12,v8.s[0]
2091
mov w13,v8.s[1]
2092
mov w14,v8.s[2]
2093
mov w15,v8.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v8.s[0],w15
	mov	v8.s[1],w14
	mov	v8.s[2],w13
	mov	v8.s[3],w12
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	eor	v8.16b,v8.16b,v15.16b
	st1	{v8.4s},[x1],#16
	b	100f
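// Two or three remaining blocks: ld4 loads each block's four words into
// the same lane of v4..v7, i.e. directly in the columnar layout that
// _vpsm4_enc_4blks expects, so no zip transpose is needed on input.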
1:	// last two blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0]
	add	x10,x0,#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
	subs	w2,w2,1
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	ld1	{v4.4s,v5.4s},[x0],#32
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save back IV
	st1	{v5.4s}, [x4]
	b	100f
1:	// last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	eor	v0.16b,v0.16b,v15.16b
	eor	v1.16b,v1.16b,v4.16b
	eor	v2.16b,v2.16b,v5.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save back IV
	st1	{v6.4s}, [x4]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt
.globl	vpsm4_ctr32_encrypt_blocks
.type	vpsm4_ctr32_encrypt_blocks,%function
.align	5
vpsm4_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.4s},[x4]
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	cmp	w2,#1
	b.ne	1f
	// fast path for a single block, without
	// context-saving overhead
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w5,v3.s[3]
.Lctr32_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	add	w5,w5,#1
	mov	v7.s[2],w5
	add	w5,w5,#1
	mov	v7.s[3],w5
	add	w5,w5,#1
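// Four counter blocks are built directly in columnar form: w12..w14
// broadcast the 96 invariant bits of the counter block into v4..v6,
// while the lanes of v7 take the consecutive 32-bit counter values
// ctr..ctr+3 kept in w5.  Only the low word increments, matching the
// ctr32 convention.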
	cmp	w2,#8
	b.ge	.Lctr32_8_blocks_process
	bl	_vpsm4_enc_4blks
	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs	w2,w2,#4
	b.ne	.Lctr32_4_blocks_process
	b	100f
.Lctr32_8_blocks_process:
	dup	v8.4s,w12
	dup	v9.4s,w13
	dup	v10.4s,w14
	mov	v11.s[0],w5
	add	w5,w5,#1
	mov	v11.s[1],w5
	add	w5,w5,#1
	mov	v11.s[2],w5
	add	w5,w5,#1
	mov	v11.s[3],w5
	add	w5,w5,#1
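// Eight-block variant: v8..v11 hold the second group of four counter
// blocks (counters ctr+4..ctr+7).  The ld4/st4 below transpose the
// plaintext and ciphertext on the fly, so the keystream produced in
// columnar form can be xored in directly.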
	bl	_vpsm4_enc_8blks
	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	eor	v4.16b,v4.16b,v8.16b
	eor	v5.16b,v5.16b,v9.16b
	eor	v6.16b,v6.16b,v10.16b
	eor	v7.16b,v7.16b,v11.16b
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.ne	.Lctr32_4_blocks_process
	b	100f
1:	// last block processing
	subs	w2,w2,#1
	b.lt	100f
	b.gt	1f
	mov	v3.s[0],w12
	mov	v3.s[1],w13
	mov	v3.s[2],w14
	mov	v3.s[3],w5
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	b	100f
1:	// last 2 blocks processing
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	subs	w2,w2,#1
	b.ne	1f
	bl	_vpsm4_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b	100f
1:	// last 3 blocks processing
	add	w5,w5,#1
	mov	v7.s[2],w5
	bl	_vpsm4_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks
.globl	vpsm4_xts_encrypt_gb
.type	vpsm4_xts_encrypt_gb,%function
.align	5
vpsm4_xts_encrypt_gb:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	x26,x3
	mov	x27,x4
	mov	w28,w6
	ld1	{v8.4s}, [x5]
	mov	x3,x27
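// Per XTS, the IV in v8 is first encrypted under the second (tweak) key
// schedule, saved in x27, to produce the initial tweak; afterwards x3
// is switched back to the data key schedule saved in x26.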
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v8.s[0]
	mov	w13,v8.s[1]
	mov	w14,v8.s[2]
	mov	w15,v8.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v8.s[0],w15
	mov	v8.s[1],w14
	mov	v8.s[2],w13
	mov	v8.s[3],w12
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x3,x26
	and	x29,x2,#0x0F
	// convert length into blocks
	lsr	x2,x2,4
	cmp	x2,#1
	b.lt	.return_gb

	cmp	x29,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	b.eq	.xts_encrypt_blocks_gb

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb
	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	subs	x2,x2,#1
	b.eq	.only_2blks_tweak_gb
.xts_encrypt_blocks_gb:
	rbit	v8.16b,v8.16b
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x12,v8.d[0]
	mov	x13,v8.d[1]
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
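// The seven 5-instruction groups above derive the tweaks t*x^1..t*x^7
// in x14..x27 from t in x12:x13, each one a GF(2^128) multiplication by
// x modulo x^128 + x^7 + x^2 + x + 1: extr shifts the 128-bit value
// left by one across the 64-bit halves, and the 0x87 reduction constant
// is masked in when the bit shifted out of the top is set.  A C sketch
// of one step (illustrative only):
//   carry   = (uint64_t)((int64_t)hi >> 63) & 0x87;
//   next_hi = (hi << 1) | (lo >> 63);
//   next_lo = (lo << 1) ^ carry;
// In this _gb variant the arithmetic runs on the bit-reversed (rbit)
// form of the tweak; tweaks are bit-reversed back before use.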
.Lxts_8_blocks_process_gb:
	cmp	x2,#8
	b.lt	.Lxts_4_blocks_process_gb
	mov	v0.d[0],x12
	mov	v0.d[1],x13
#ifdef __AARCH64EB__
	rev32	v0.16b,v0.16b
#endif
	mov	v1.d[0],x14
	mov	v1.d[1],x15
#ifdef __AARCH64EB__
	rev32	v1.16b,v1.16b
#endif
	mov	v2.d[0],x16
	mov	v2.d[1],x17
#ifdef __AARCH64EB__
	rev32	v2.16b,v2.16b
#endif
	mov	v3.d[0],x18
	mov	v3.d[1],x19
#ifdef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	mov	v12.d[0],x20
	mov	v12.d[1],x21
#ifdef __AARCH64EB__
	rev32	v12.16b,v12.16b
#endif
	mov	v13.d[0],x22
	mov	v13.d[1],x23
#ifdef __AARCH64EB__
	rev32	v13.16b,v13.16b
#endif
	mov	v14.d[0],x24
	mov	v14.d[1],x25
#ifdef __AARCH64EB__
	rev32	v14.16b,v14.16b
#endif
	mov	v15.d[0],x26
	mov	v15.d[1],x27
#ifdef __AARCH64EB__
	rev32	v15.16b,v15.16b
#endif
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v0.16b,v0.16b
	rbit	v1.16b,v1.16b
	rbit	v2.16b,v2.16b
	rbit	v3.16b,v3.16b
	eor	v4.16b, v4.16b, v0.16b
	eor	v5.16b, v5.16b, v1.16b
	eor	v6.16b, v6.16b, v2.16b
	eor	v7.16b, v7.16b, v3.16b
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	rbit	v12.16b,v12.16b
	rbit	v13.16b,v13.16b
	rbit	v14.16b,v14.16b
	rbit	v15.16b,v15.16b
	eor	v8.16b, v8.16b, v12.16b
	eor	v9.16b, v9.16b, v13.16b
	eor	v10.16b, v10.16b, v14.16b
	eor	v11.16b, v11.16b, v15.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	zip1	v0.4s,v8.4s,v9.4s
	zip2	v1.4s,v8.4s,v9.4s
	zip1	v2.4s,v10.4s,v11.4s
	zip2	v3.4s,v10.4s,v11.4s
	zip1	v8.2d,v0.2d,v2.2d
	zip2	v9.2d,v0.2d,v2.2d
	zip1	v10.2d,v1.2d,v3.2d
	zip2	v11.2d,v1.2d,v3.2d
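// The two zip stages above form a 4x4 32-bit transpose: v4..v7 and
// v8..v11 go from one-block-per-register to one-word-column-per-
// register, the layout _vpsm4_enc_8blks works on.  The mirror-image
// transpose after the call restores block order before the tweaks are
// xored back in.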
	bl	_vpsm4_enc_8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	mov	v12.d[0],x12
	mov	v12.d[1],x13
#ifdef __AARCH64EB__
	rev32	v12.16b,v12.16b
#endif
	mov	w7,0x87
	extr	x9,x27,x27,#32
	extr	x13,x27,x26,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x26,lsl#1
	mov	v13.d[0],x14
	mov	v13.d[1],x15
#ifdef __AARCH64EB__
	rev32	v13.16b,v13.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v14.d[0],x16
	mov	v14.d[1],x17
#ifdef __AARCH64EB__
	rev32	v14.16b,v14.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v15.d[0],x18
	mov	v15.d[1],x19
#ifdef __AARCH64EB__
	rev32	v15.16b,v15.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	v8.d[0],x20
	mov	v8.d[1],x21
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	v9.d[0],x22
	mov	v9.d[1],x23
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	v10.d[0],x24
	mov	v10.d[1],x25
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	v11.d[0],x26
	mov	v11.d[1],x27
#ifdef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v13.16b
	eor	v2.16b, v2.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v7.16b, v7.16b, v11.16b

	// save the last tweak
	st1	{v11.4s},[x5]
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	x2,x2,#8
	b.gt	.Lxts_8_blocks_process_gb
	b	100f
.Lxts_4_blocks_process_gb:
	mov	v8.d[0],x12
	mov	v8.d[1],x13
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	v9.d[0],x14
	mov	v9.d[1],x15
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	v10.d[0],x16
	mov	v10.d[1],x17
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	mov	v11.d[0],x18
	mov	v11.d[1],x19
#ifdef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	cmp	x2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v8.16b,v8.16b
	rbit	v9.16b,v9.16b
	rbit	v10.16b,v10.16b
	rbit	v11.16b,v11.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v7.16b, v7.16b, v11.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	x2,x2,#4
	mov	v8.d[0],x20
	mov	v8.d[1],x21
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	v9.d[0],x22
	mov	v9.d[1],x23
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	v10.d[0],x24
	mov	v10.d[1],x25
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	// save the last tweak
	st1	{v11.4s},[x5]
1:
	// process last block
	cmp	x2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	rbit	v8.16b,v8.16b
	eor	v4.16b, v4.16b, v8.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v8.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	st1	{v8.4s},[x5]
	b	100f
1:	// process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	rbit	v8.16b,v8.16b
	rbit	v9.16b,v9.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	st1	{v9.4s},[x5]
	b	100f
1:	// process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	rbit	v8.16b,v8.16b
	rbit	v9.16b,v9.16b
	rbit	v10.16b,v10.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	st1	{v10.4s},[x5]
100:
	cmp	x29,0
	b.eq	.return_gb

	// This branch calculates the last two tweaks
	// when the encryption/decryption length is larger than 32
.last_2blks_tweak_gb:
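// Vector form of the tweak update used for the tail: shl #1 shifts
// every byte left, ext/ushr #7 recovers each byte's carry-out from its
// neighbour, and the multiply by .Lxts_magic (0x87 in the low byte,
// 0x01 elsewhere) both propagates those carries and folds the carry
// out of the top byte back in as the reduction.  As elsewhere in the
// _gb variant, this runs on the bit-reversed (rbit) tweak.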
	ld1	{v8.4s},[x5]
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	rbit	v2.16b,v8.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v9.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v9.16b, v9.16b, v1.16b
	rbit	v9.16b,v9.16b
	rbit	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	rbit	v10.16b,v10.16b
	b	.check_dec_gb


	// This branch calculates the last two tweaks
	// when the encryption/decryption length is exactly 32, which only needs two tweaks
.only_2blks_tweak_gb:
	mov	v9.16b,v8.16b
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	rbit	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	rbit	v10.16b,v10.16b
	b	.check_dec_gb


	// Determine whether encryption or decryption is required.
	// The last two tweaks need to be swapped for decryption.
.check_dec_gb:
	// encryption:1 decryption:0
	cmp	w28,1
	b.eq	.process_last_2blks_gb
	mov	v0.16B,v9.16b
	mov	v9.16B,v10.16b
	mov	v10.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v9.16b
	st1	{v4.4s},[x1],#16

	sub	x26,x1,16
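// Ciphertext stealing: x29 holds the residual byte count.  The loop
// below swaps the leading x29 bytes of the last full ciphertext block
// (at x26) with the trailing plaintext bytes, emitting the displaced
// ciphertext bytes as the final partial block, then re-encrypts the
// block at x26 in place under the last tweak (v10).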
.loop_gb:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop_gb
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.4s}, [x26]
.return_gb:
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
.globl	vpsm4_xts_encrypt
.type	vpsm4_xts_encrypt,%function
.align	5
vpsm4_xts_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	x26,x3
	mov	x27,x4
	mov	w28,w6
	ld1	{v8.4s}, [x5]
	mov	x3,x27
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v8.s[0]
	mov	w13,v8.s[1]
	mov	w14,v8.s[2]
	mov	w15,v8.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v8.s[0],w15
	mov	v8.s[1],w14
	mov	v8.s[2],w13
	mov	v8.s[3],w12
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x3,x26
	and	x29,x2,#0x0F
	// convert length into blocks
	lsr	x2,x2,4
	cmp	x2,#1
	b.lt	.return

	cmp	x29,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
	b.eq	.xts_encrypt_blocks

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak
	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks
	subs	x2,x2,#1
	b.eq	.only_2blks_tweak
.xts_encrypt_blocks:
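// Standard XTS tweak schedule: identical to the _gb path above except
// that the GF(2^128) arithmetic runs in the normal bit order, so no
// rbit of the tweaks is needed.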
4079
#ifdef __AARCH64EB__
4080
rev32 v8.16b,v8.16b
4081
#endif
4082
mov x12,v8.d[0]
4083
mov x13,v8.d[1]
4084
mov w7,0x87
4085
extr x9,x13,x13,#32
4086
extr x15,x13,x12,#63
4087
and w8,w7,w9,asr#31
4088
eor x14,x8,x12,lsl#1
4089
mov w7,0x87
4090
extr x9,x15,x15,#32
4091
extr x17,x15,x14,#63
4092
and w8,w7,w9,asr#31
4093
eor x16,x8,x14,lsl#1
4094
mov w7,0x87
4095
extr x9,x17,x17,#32
4096
extr x19,x17,x16,#63
4097
and w8,w7,w9,asr#31
4098
eor x18,x8,x16,lsl#1
4099
mov w7,0x87
4100
extr x9,x19,x19,#32
4101
extr x21,x19,x18,#63
4102
and w8,w7,w9,asr#31
4103
eor x20,x8,x18,lsl#1
4104
mov w7,0x87
4105
extr x9,x21,x21,#32
4106
extr x23,x21,x20,#63
4107
and w8,w7,w9,asr#31
4108
eor x22,x8,x20,lsl#1
4109
mov w7,0x87
4110
extr x9,x23,x23,#32
4111
extr x25,x23,x22,#63
4112
and w8,w7,w9,asr#31
4113
eor x24,x8,x22,lsl#1
4114
mov w7,0x87
4115
extr x9,x25,x25,#32
4116
extr x27,x25,x24,#63
4117
and w8,w7,w9,asr#31
4118
eor x26,x8,x24,lsl#1
4119
.Lxts_8_blocks_process:
4120
cmp x2,#8
4121
b.lt .Lxts_4_blocks_process
4122
mov v0.d[0],x12
4123
mov v0.d[1],x13
4124
#ifdef __AARCH64EB__
4125
rev32 v0.16b,v0.16b
4126
#endif
4127
mov v1.d[0],x14
4128
mov v1.d[1],x15
4129
#ifdef __AARCH64EB__
4130
rev32 v1.16b,v1.16b
4131
#endif
4132
mov v2.d[0],x16
4133
mov v2.d[1],x17
4134
#ifdef __AARCH64EB__
4135
rev32 v2.16b,v2.16b
4136
#endif
4137
mov v3.d[0],x18
4138
mov v3.d[1],x19
4139
#ifdef __AARCH64EB__
4140
rev32 v3.16b,v3.16b
4141
#endif
4142
mov v12.d[0],x20
4143
mov v12.d[1],x21
4144
#ifdef __AARCH64EB__
4145
rev32 v12.16b,v12.16b
4146
#endif
4147
mov v13.d[0],x22
4148
mov v13.d[1],x23
4149
#ifdef __AARCH64EB__
4150
rev32 v13.16b,v13.16b
4151
#endif
4152
mov v14.d[0],x24
4153
mov v14.d[1],x25
4154
#ifdef __AARCH64EB__
4155
rev32 v14.16b,v14.16b
4156
#endif
4157
mov v15.d[0],x26
4158
mov v15.d[1],x27
4159
#ifdef __AARCH64EB__
4160
rev32 v15.16b,v15.16b
4161
#endif
4162
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
4163
eor v4.16b, v4.16b, v0.16b
4164
eor v5.16b, v5.16b, v1.16b
4165
eor v6.16b, v6.16b, v2.16b
4166
eor v7.16b, v7.16b, v3.16b
4167
ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
4168
eor v8.16b, v8.16b, v12.16b
4169
eor v9.16b, v9.16b, v13.16b
4170
eor v10.16b, v10.16b, v14.16b
4171
eor v11.16b, v11.16b, v15.16b
4172
#ifndef __AARCH64EB__
4173
rev32 v4.16b,v4.16b
4174
#endif
4175
#ifndef __AARCH64EB__
4176
rev32 v5.16b,v5.16b
4177
#endif
4178
#ifndef __AARCH64EB__
4179
rev32 v6.16b,v6.16b
4180
#endif
4181
#ifndef __AARCH64EB__
4182
rev32 v7.16b,v7.16b
4183
#endif
4184
#ifndef __AARCH64EB__
4185
rev32 v8.16b,v8.16b
4186
#endif
4187
#ifndef __AARCH64EB__
4188
rev32 v9.16b,v9.16b
4189
#endif
4190
#ifndef __AARCH64EB__
4191
rev32 v10.16b,v10.16b
4192
#endif
4193
#ifndef __AARCH64EB__
4194
rev32 v11.16b,v11.16b
4195
#endif
4196
zip1 v0.4s,v4.4s,v5.4s
4197
zip2 v1.4s,v4.4s,v5.4s
4198
zip1 v2.4s,v6.4s,v7.4s
4199
zip2 v3.4s,v6.4s,v7.4s
4200
zip1 v4.2d,v0.2d,v2.2d
4201
zip2 v5.2d,v0.2d,v2.2d
4202
zip1 v6.2d,v1.2d,v3.2d
4203
zip2 v7.2d,v1.2d,v3.2d
4204
zip1 v0.4s,v8.4s,v9.4s
4205
zip2 v1.4s,v8.4s,v9.4s
4206
zip1 v2.4s,v10.4s,v11.4s
4207
zip2 v3.4s,v10.4s,v11.4s
4208
zip1 v8.2d,v0.2d,v2.2d
4209
zip2 v9.2d,v0.2d,v2.2d
4210
zip1 v10.2d,v1.2d,v3.2d
4211
zip2 v11.2d,v1.2d,v3.2d
4212
bl _vpsm4_enc_8blks
4213
zip1 v8.4s,v0.4s,v1.4s
4214
zip2 v9.4s,v0.4s,v1.4s
4215
zip1 v10.4s,v2.4s,v3.4s
4216
zip2 v11.4s,v2.4s,v3.4s
4217
zip1 v0.2d,v8.2d,v10.2d
4218
zip2 v1.2d,v8.2d,v10.2d
4219
zip1 v2.2d,v9.2d,v11.2d
4220
zip2 v3.2d,v9.2d,v11.2d
4221
zip1 v8.4s,v4.4s,v5.4s
4222
zip2 v9.4s,v4.4s,v5.4s
4223
zip1 v10.4s,v6.4s,v7.4s
4224
zip2 v11.4s,v6.4s,v7.4s
4225
zip1 v4.2d,v8.2d,v10.2d
4226
zip2 v5.2d,v8.2d,v10.2d
4227
zip1 v6.2d,v9.2d,v11.2d
4228
zip2 v7.2d,v9.2d,v11.2d
4229
mov v12.d[0],x12
4230
mov v12.d[1],x13
4231
#ifdef __AARCH64EB__
4232
rev32 v12.16b,v12.16b
4233
#endif
4234
mov w7,0x87
4235
extr x9,x27,x27,#32
4236
extr x13,x27,x26,#63
4237
and w8,w7,w9,asr#31
4238
eor x12,x8,x26,lsl#1
4239
mov v13.d[0],x14
4240
mov v13.d[1],x15
4241
#ifdef __AARCH64EB__
4242
rev32 v13.16b,v13.16b
4243
#endif
4244
mov w7,0x87
4245
extr x9,x13,x13,#32
4246
extr x15,x13,x12,#63
4247
and w8,w7,w9,asr#31
4248
eor x14,x8,x12,lsl#1
4249
mov v14.d[0],x16
4250
mov v14.d[1],x17
4251
#ifdef __AARCH64EB__
4252
rev32 v14.16b,v14.16b
4253
#endif
4254
mov w7,0x87
4255
extr x9,x15,x15,#32
4256
extr x17,x15,x14,#63
4257
and w8,w7,w9,asr#31
4258
eor x16,x8,x14,lsl#1
4259
mov v15.d[0],x18
4260
mov v15.d[1],x19
4261
#ifdef __AARCH64EB__
4262
rev32 v15.16b,v15.16b
4263
#endif
4264
mov w7,0x87
4265
extr x9,x17,x17,#32
4266
extr x19,x17,x16,#63
4267
and w8,w7,w9,asr#31
4268
eor x18,x8,x16,lsl#1
4269
mov v8.d[0],x20
4270
mov v8.d[1],x21
4271
#ifdef __AARCH64EB__
4272
rev32 v8.16b,v8.16b
4273
#endif
4274
mov w7,0x87
4275
extr x9,x19,x19,#32
4276
extr x21,x19,x18,#63
4277
and w8,w7,w9,asr#31
4278
eor x20,x8,x18,lsl#1
4279
mov v9.d[0],x22
4280
mov v9.d[1],x23
4281
#ifdef __AARCH64EB__
4282
rev32 v9.16b,v9.16b
4283
#endif
4284
mov w7,0x87
4285
extr x9,x21,x21,#32
4286
extr x23,x21,x20,#63
4287
and w8,w7,w9,asr#31
4288
eor x22,x8,x20,lsl#1
4289
mov v10.d[0],x24
4290
mov v10.d[1],x25
4291
#ifdef __AARCH64EB__
4292
rev32 v10.16b,v10.16b
4293
#endif
4294
mov w7,0x87
4295
extr x9,x23,x23,#32
4296
extr x25,x23,x22,#63
4297
and w8,w7,w9,asr#31
4298
eor x24,x8,x22,lsl#1
4299
mov v11.d[0],x26
4300
mov v11.d[1],x27
4301
#ifdef __AARCH64EB__
4302
rev32 v11.16b,v11.16b
4303
#endif
4304
mov w7,0x87
4305
extr x9,x25,x25,#32
4306
extr x27,x25,x24,#63
4307
and w8,w7,w9,asr#31
4308
eor x26,x8,x24,lsl#1
4309
eor v0.16b, v0.16b, v12.16b
4310
eor v1.16b, v1.16b, v13.16b
4311
eor v2.16b, v2.16b, v14.16b
4312
eor v3.16b, v3.16b, v15.16b
4313
eor v4.16b, v4.16b, v8.16b
4314
eor v5.16b, v5.16b, v9.16b
4315
eor v6.16b, v6.16b, v10.16b
4316
eor v7.16b, v7.16b, v11.16b
4317
4318
// save the last tweak
4319
st1 {v11.4s},[x5]
4320
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
4321
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
4322
subs x2,x2,#8
4323
b.gt .Lxts_8_blocks_process
4324
b 100f
4325
.Lxts_4_blocks_process:
4326
mov v8.d[0],x12
4327
mov v8.d[1],x13
4328
#ifdef __AARCH64EB__
4329
rev32 v8.16b,v8.16b
4330
#endif
4331
mov v9.d[0],x14
4332
mov v9.d[1],x15
4333
#ifdef __AARCH64EB__
4334
rev32 v9.16b,v9.16b
4335
#endif
4336
mov v10.d[0],x16
4337
mov v10.d[1],x17
4338
#ifdef __AARCH64EB__
4339
rev32 v10.16b,v10.16b
4340
#endif
4341
mov v11.d[0],x18
4342
mov v11.d[1],x19
4343
#ifdef __AARCH64EB__
4344
rev32 v11.16b,v11.16b
4345
#endif
4346
cmp x2,#4
4347
b.lt 1f
4348
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
4349
eor v4.16b, v4.16b, v8.16b
4350
eor v5.16b, v5.16b, v9.16b
4351
eor v6.16b, v6.16b, v10.16b
4352
eor v7.16b, v7.16b, v11.16b
4353
#ifndef __AARCH64EB__
4354
rev32 v4.16b,v4.16b
4355
#endif
4356
#ifndef __AARCH64EB__
4357
rev32 v5.16b,v5.16b
4358
#endif
4359
#ifndef __AARCH64EB__
4360
rev32 v6.16b,v6.16b
4361
#endif
4362
#ifndef __AARCH64EB__
4363
rev32 v7.16b,v7.16b
4364
#endif
4365
zip1 v0.4s,v4.4s,v5.4s
4366
zip2 v1.4s,v4.4s,v5.4s
4367
zip1 v2.4s,v6.4s,v7.4s
4368
zip2 v3.4s,v6.4s,v7.4s
4369
zip1 v4.2d,v0.2d,v2.2d
4370
zip2 v5.2d,v0.2d,v2.2d
4371
zip1 v6.2d,v1.2d,v3.2d
4372
zip2 v7.2d,v1.2d,v3.2d
4373
bl _vpsm4_enc_4blks
4374
zip1 v4.4s,v0.4s,v1.4s
4375
zip2 v5.4s,v0.4s,v1.4s
4376
zip1 v6.4s,v2.4s,v3.4s
4377
zip2 v7.4s,v2.4s,v3.4s
4378
zip1 v0.2d,v4.2d,v6.2d
4379
zip2 v1.2d,v4.2d,v6.2d
4380
zip1 v2.2d,v5.2d,v7.2d
4381
zip2 v3.2d,v5.2d,v7.2d
4382
eor v0.16b, v0.16b, v8.16b
4383
eor v1.16b, v1.16b, v9.16b
4384
eor v2.16b, v2.16b, v10.16b
4385
eor v3.16b, v3.16b, v11.16b
4386
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
4387
sub x2,x2,#4
4388
mov v8.d[0],x20
4389
mov v8.d[1],x21
4390
#ifdef __AARCH64EB__
4391
rev32 v8.16b,v8.16b
4392
#endif
4393
mov v9.d[0],x22
4394
mov v9.d[1],x23
4395
#ifdef __AARCH64EB__
4396
rev32 v9.16b,v9.16b
4397
#endif
4398
mov v10.d[0],x24
4399
mov v10.d[1],x25
4400
#ifdef __AARCH64EB__
4401
rev32 v10.16b,v10.16b
4402
#endif
4403
// save the last tweak
4404
st1 {v11.4s},[x5]
4405
1:
4406
// process last block
4407
cmp x2,#1
4408
b.lt 100f
4409
b.gt 1f
4410
ld1 {v4.4s},[x0],#16
4411
eor v4.16b, v4.16b, v8.16b
4412
#ifndef __AARCH64EB__
4413
rev32 v4.16b,v4.16b
4414
#endif
4415
mov x10,x3
4416
mov w11,#8
4417
mov w12,v4.s[0]
4418
mov w13,v4.s[1]
4419
mov w14,v4.s[2]
4420
mov w15,v4.s[3]
4421
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v8.16b
st1 {v4.4s},[x1],#16
// save the last tweak
st1 {v8.4s},[x5]
b 100f
1: // process last 2 blocks
cmp x2,#2
b.gt 1f
ld1 {v4.4s,v5.4s},[x0],#32
eor v4.16b, v4.16b, v8.16b
eor v5.16b, v5.16b, v9.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
st1 {v0.4s,v1.4s},[x1],#32
// save the last tweak
st1 {v9.4s},[x5]
b 100f
1: // process last 3 blocks
ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
eor v4.16b, v4.16b, v8.16b
eor v5.16b, v5.16b, v9.16b
eor v6.16b, v6.16b, v10.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
rev32 v6.16b,v6.16b
#endif
zip1 v0.4s,v4.4s,v5.4s
zip2 v1.4s,v4.4s,v5.4s
zip1 v2.4s,v6.4s,v7.4s
zip2 v3.4s,v6.4s,v7.4s
zip1 v4.2d,v0.2d,v2.2d
zip2 v5.2d,v0.2d,v2.2d
zip1 v6.2d,v1.2d,v3.2d
zip2 v7.2d,v1.2d,v3.2d
bl _vpsm4_enc_4blks
zip1 v4.4s,v0.4s,v1.4s
zip2 v5.4s,v0.4s,v1.4s
zip1 v6.4s,v2.4s,v3.4s
zip2 v7.4s,v2.4s,v3.4s
zip1 v0.2d,v4.2d,v6.2d
zip2 v1.2d,v4.2d,v6.2d
zip1 v2.2d,v5.2d,v7.2d
zip2 v3.2d,v5.2d,v7.2d
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
st1 {v0.4s,v1.4s,v2.4s},[x1],#48
// save the last tweak
st1 {v10.4s},[x5]
100:
cmp x29,0
b.eq .return

// This branch computes the last two tweaks, used when the
// encryption/decryption length is larger than 32 bytes
.last_2blks_tweak:
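// Vector form of the tweak doubling: shl #1 doubles every byte,
// ext #15 / ushr #7 rotates each byte's carried-out top bit into the
// next byte position, and multiplying by .Lxts_magic (0x87 in byte 0,
// 0x01 elsewhere) leaves the inter-byte carries intact while scaling
// the bit shifted out of byte 15 into the 0x87 reduction folded into
// byte 0.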
ld1 {v8.4s},[x5]
#ifdef __AARCH64EB__
rev32 v8.16b,v8.16b
#endif
mov v2.16b,v8.16b
adrp x10,.Lxts_magic
ldr q0, [x10, #:lo12:.Lxts_magic]
shl v9.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v9.16b, v9.16b, v1.16b
mov v2.16b,v9.16b
adrp x10,.Lxts_magic
ldr q0, [x10, #:lo12:.Lxts_magic]
shl v10.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
b .check_dec


// This branch computes the last two tweaks, used when the
// encryption/decryption length is exactly 32 bytes, which needs only
// two tweaks
.only_2blks_tweak:
mov v9.16b,v8.16b
#ifdef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
mov v2.16b,v9.16b
adrp x10,.Lxts_magic
ldr q0, [x10, #:lo12:.Lxts_magic]
shl v10.16b, v2.16b, #1
ext v1.16b, v2.16b, v2.16b,#15
ushr v1.16b, v1.16b, #7
mul v1.16b, v1.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
b .check_dec


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec:
// encryption:1 decryption:0
cmp w28,1
b.eq .process_last_2blks
mov v0.16b,v9.16b
mov v9.16b,v10.16b
mov v10.16b,v0.16b

.process_last_2blks:
#ifdef __AARCH64EB__
rev32 v9.16b,v9.16b
#endif
#ifdef __AARCH64EB__
rev32 v10.16b,v10.16b
#endif
ld1 {v4.4s},[x0],#16
eor v4.16b, v4.16b, v9.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
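// Encrypt the last full block with tweak v9 (the tweaks were swapped
// above when decrypting), running the same scalar round loop as
// before.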
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v9.16b
st1 {v4.4s},[x1],#16

sub x26,x1,16
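// Ciphertext stealing: x29 holds the count of trailing bytes. The
// byte loop moves the remaining input tail into the front of the
// ciphertext block just written at x26 (x1 - 16), while the displaced
// ciphertext bytes are written out at x1 as the short final block.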
.loop:
subs x29,x29,1
ldrb w7,[x26,x29]
ldrb w8,[x0,x29]
strb w8,[x26,x29]
strb w7,[x1,x29]
b.gt .loop
ld1 {v4.4s}, [x26]
eor v4.16b, v4.16b, v10.16b
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
mov x10,x3
mov w11,#8
mov w12,v4.s[0]
mov w13,v4.s[1]
mov w14,v4.s[2]
mov w15,v4.s[3]
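// Re-encrypt the merged block (input tail plus stolen ciphertext
// bytes) under the final tweak v10 and store it back over the
// previous ciphertext block at x26.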
10:
ldp w7,w8,[x10],8
// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
eor w6,w14,w15
eor w9,w7,w13
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w12,w12,w6
// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
eor w6,w14,w15
eor w9,w12,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
ldp w7,w8,[x10],8
eor w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor w6,w12,w13
eor w9,w7,w15
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor w6,w12,w13
eor w9,w14,w8
eor w6,w6,w9
movi v1.16b,#64
movi v2.16b,#128
movi v3.16b,#192
mov v0.s[0],w6

sub v1.16b,v0.16b,v1.16b
sub v2.16b,v0.16b,v2.16b
sub v3.16b,v0.16b,v3.16b

tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

mov w6,v0.s[0]
mov w7,v1.s[0]
mov w9,v2.s[0]
add w7,w6,w7
mov w6,v3.s[0]
add w7,w7,w9
add w7,w7,w6

eor w6,w7,w7,ror #32-2
eor w6,w6,w7,ror #32-10
eor w6,w6,w7,ror #32-18
eor w6,w6,w7,ror #32-24
eor w15,w15,w6
subs w11,w11,#1
b.ne 10b
mov v4.s[0],w15
mov v4.s[1],w14
mov v4.s[2],w13
mov v4.s[3],w12
#ifndef __AARCH64EB__
rev32 v4.16b,v4.16b
#endif
eor v4.16b, v4.16b, v10.16b
st1 {v4.4s}, [x26]
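// Epilogue: restore the floating-point and general-purpose register
// pairs saved in the prologue, validate the link register, and
// return.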
.return:
ldp d14, d15, [sp], #0x10
ldp d12, d13, [sp], #0x10
ldp d10, d11, [sp], #0x10
ldp d8, d9, [sp], #0x10
ldp x29, x30, [sp], #0x10
ldp x27, x28, [sp], #0x10
ldp x25, x26, [sp], #0x10
ldp x23, x24, [sp], #0x10
ldp x21, x22, [sp], #0x10
ldp x19, x20, [sp], #0x10
ldp x17, x18, [sp], #0x10
ldp x15, x16, [sp], #0x10
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpsm4_xts_encrypt,.-vpsm4_xts_encrypt