// GitHub Repository: freebsd/freebsd-src
// Path: blob/main/sys/crypto/openssl/aarch64/bsaes-armv8.S
/* Do not modify. This file is auto-generated from bsaes-armv8.pl. */
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.

#include "crypto/arm_arch.h"

.text

.type _bsaes_decrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
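//
// The prologue below converts the eight blocks held in v0-v7 into
// bitsliced form: the ushr/shl/and/eor ladder (masks 0x55, 0x33, 0x0f)
// is the standard bit-matrix transposition, after which each vector
// register holds one bit plane of all eight blocks, so the AES S-box
// can be evaluated with plain boolean instructions on the whole batch.
// The inverse transposition is applied at .Ldec_done before returning.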
_bsaes_decrypt8:
ldr q8, [x9], #16
adrp x11, .LM0ISR
add x11, x11, #:lo12:.LM0ISR
movi v9.16b, #0x55
ldr q10, [x11], #16
movi v16.16b, #0x33
movi v17.16b, #0x0f
sub x10, x10, #1
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v8.16b
eor v2.16b, v2.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
tbl v0.16b, {v0.16b}, v10.16b
tbl v1.16b, {v1.16b}, v10.16b
tbl v2.16b, {v2.16b}, v10.16b
tbl v4.16b, {v4.16b}, v10.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
tbl v3.16b, {v3.16b}, v10.16b
tbl v5.16b, {v5.16b}, v10.16b
tbl v6.16b, {v6.16b}, v10.16b
ushr v8.2d, v0.2d, #1
tbl v7.16b, {v7.16b}, v10.16b
ushr v10.2d, v4.2d, #1
ushr v18.2d, v2.2d, #1
eor v8.16b, v8.16b, v1.16b
ushr v19.2d, v6.2d, #1
eor v10.16b, v10.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
and v8.16b, v8.16b, v9.16b
eor v19.16b, v19.16b, v7.16b
and v10.16b, v10.16b, v9.16b
and v18.16b, v18.16b, v9.16b
eor v1.16b, v1.16b, v8.16b
shl v8.2d, v8.2d, #1
and v9.16b, v19.16b, v9.16b
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #1
eor v3.16b, v3.16b, v18.16b
shl v18.2d, v18.2d, #1
eor v0.16b, v0.16b, v8.16b
shl v8.2d, v9.2d, #1
eor v7.16b, v7.16b, v9.16b
eor v4.16b, v4.16b, v10.16b
eor v2.16b, v2.16b, v18.16b
ushr v9.2d, v1.2d, #2
eor v6.16b, v6.16b, v8.16b
ushr v8.2d, v0.2d, #2
ushr v10.2d, v5.2d, #2
ushr v18.2d, v4.2d, #2
eor v9.16b, v9.16b, v3.16b
eor v8.16b, v8.16b, v2.16b
eor v10.16b, v10.16b, v7.16b
eor v18.16b, v18.16b, v6.16b
and v9.16b, v9.16b, v16.16b
and v8.16b, v8.16b, v16.16b
and v10.16b, v10.16b, v16.16b
and v16.16b, v18.16b, v16.16b
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v2.16b, v2.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v7.16b, v7.16b, v10.16b
shl v10.2d, v10.2d, #2
eor v6.16b, v6.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v1.16b, v1.16b, v9.16b
eor v0.16b, v0.16b, v8.16b
eor v5.16b, v5.16b, v10.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v3.2d, #4
ushr v9.2d, v2.2d, #4
ushr v10.2d, v1.2d, #4
ushr v16.2d, v0.2d, #4
eor v8.16b, v8.16b, v7.16b
eor v9.16b, v9.16b, v6.16b
eor v10.16b, v10.16b, v5.16b
eor v16.16b, v16.16b, v4.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v10.16b, v10.16b, v17.16b
and v16.16b, v16.16b, v17.16b
eor v7.16b, v7.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #4
eor v4.16b, v4.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v16.16b
b .Ldec_sbox
.align 4
.Ldec_loop:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
ldp q8, q9, [x9], #32
eor v0.16b, v16.16b, v0.16b
ldr q10, [x9], #16
eor v1.16b, v17.16b, v1.16b
ldr q16, [x9], #16
eor v2.16b, v18.16b, v2.16b
eor v3.16b, v19.16b, v3.16b
eor v4.16b, v8.16b, v4.16b
eor v5.16b, v9.16b, v5.16b
eor v6.16b, v10.16b, v6.16b
eor v7.16b, v16.16b, v7.16b
tbl v0.16b, {v0.16b}, v28.16b
tbl v1.16b, {v1.16b}, v28.16b
tbl v2.16b, {v2.16b}, v28.16b
tbl v3.16b, {v3.16b}, v28.16b
tbl v4.16b, {v4.16b}, v28.16b
tbl v5.16b, {v5.16b}, v28.16b
tbl v6.16b, {v6.16b}, v28.16b
tbl v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
eor v1.16b, v1.16b, v4.16b
eor v3.16b, v3.16b, v4.16b
subs x10, x10, #1
eor v4.16b, v4.16b, v7.16b
eor v2.16b, v2.16b, v7.16b
eor v1.16b, v1.16b, v6.16b
eor v6.16b, v6.16b, v4.16b
eor v2.16b, v2.16b, v5.16b
eor v0.16b, v0.16b, v1.16b
eor v7.16b, v7.16b, v6.16b
eor v8.16b, v6.16b, v2.16b
and v9.16b, v4.16b, v6.16b
eor v10.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v0.16b
eor v5.16b, v5.16b, v0.16b
eor v16.16b, v7.16b, v4.16b
eor v17.16b, v4.16b, v0.16b
and v18.16b, v0.16b, v2.16b
eor v19.16b, v7.16b, v4.16b
eor v1.16b, v1.16b, v3.16b
eor v20.16b, v3.16b, v0.16b
eor v21.16b, v5.16b, v2.16b
eor v22.16b, v3.16b, v7.16b
and v8.16b, v17.16b, v8.16b
orr v17.16b, v3.16b, v5.16b
eor v23.16b, v1.16b, v6.16b
eor v24.16b, v20.16b, v16.16b
eor v25.16b, v1.16b, v5.16b
orr v26.16b, v20.16b, v21.16b
and v20.16b, v20.16b, v21.16b
and v27.16b, v7.16b, v1.16b
eor v21.16b, v21.16b, v23.16b
orr v28.16b, v16.16b, v23.16b
orr v29.16b, v22.16b, v25.16b
eor v26.16b, v26.16b, v8.16b
and v16.16b, v16.16b, v23.16b
and v22.16b, v22.16b, v25.16b
and v21.16b, v24.16b, v21.16b
eor v8.16b, v28.16b, v8.16b
eor v23.16b, v5.16b, v2.16b
eor v24.16b, v1.16b, v6.16b
eor v16.16b, v16.16b, v22.16b
eor v22.16b, v3.16b, v0.16b
eor v25.16b, v29.16b, v21.16b
eor v21.16b, v26.16b, v21.16b
eor v8.16b, v8.16b, v20.16b
eor v26.16b, v23.16b, v24.16b
eor v16.16b, v16.16b, v20.16b
eor v28.16b, v22.16b, v19.16b
eor v20.16b, v25.16b, v20.16b
eor v9.16b, v21.16b, v9.16b
eor v8.16b, v8.16b, v18.16b
eor v18.16b, v5.16b, v1.16b
eor v21.16b, v16.16b, v17.16b
eor v16.16b, v16.16b, v17.16b
eor v17.16b, v20.16b, v27.16b
eor v20.16b, v3.16b, v7.16b
eor v25.16b, v9.16b, v8.16b
eor v27.16b, v0.16b, v4.16b
and v29.16b, v9.16b, v17.16b
eor v30.16b, v8.16b, v29.16b
eor v31.16b, v21.16b, v29.16b
eor v29.16b, v21.16b, v29.16b
bsl v30.16b, v17.16b, v21.16b
bsl v31.16b, v9.16b, v8.16b
bsl v16.16b, v30.16b, v29.16b
bsl v21.16b, v29.16b, v30.16b
eor v8.16b, v31.16b, v30.16b
and v1.16b, v1.16b, v31.16b
and v9.16b, v16.16b, v31.16b
and v6.16b, v6.16b, v30.16b
eor v16.16b, v17.16b, v21.16b
and v4.16b, v4.16b, v30.16b
eor v17.16b, v8.16b, v30.16b
and v21.16b, v24.16b, v8.16b
eor v9.16b, v9.16b, v25.16b
and v19.16b, v19.16b, v8.16b
eor v24.16b, v30.16b, v16.16b
eor v25.16b, v30.16b, v16.16b
and v7.16b, v7.16b, v17.16b
and v10.16b, v10.16b, v16.16b
eor v29.16b, v9.16b, v16.16b
eor v30.16b, v31.16b, v9.16b
and v0.16b, v24.16b, v0.16b
and v9.16b, v18.16b, v9.16b
and v2.16b, v25.16b, v2.16b
eor v10.16b, v10.16b, v6.16b
eor v18.16b, v29.16b, v16.16b
and v5.16b, v30.16b, v5.16b
eor v24.16b, v8.16b, v29.16b
and v25.16b, v26.16b, v29.16b
and v26.16b, v28.16b, v29.16b
eor v8.16b, v8.16b, v29.16b
eor v17.16b, v17.16b, v18.16b
eor v5.16b, v1.16b, v5.16b
and v23.16b, v24.16b, v23.16b
eor v21.16b, v21.16b, v25.16b
eor v19.16b, v19.16b, v26.16b
eor v0.16b, v4.16b, v0.16b
and v3.16b, v17.16b, v3.16b
eor v1.16b, v9.16b, v1.16b
eor v9.16b, v25.16b, v23.16b
eor v5.16b, v5.16b, v21.16b
eor v2.16b, v6.16b, v2.16b
and v6.16b, v8.16b, v22.16b
eor v3.16b, v7.16b, v3.16b
and v8.16b, v20.16b, v18.16b
eor v10.16b, v10.16b, v9.16b
eor v0.16b, v0.16b, v19.16b
eor v9.16b, v1.16b, v9.16b
eor v1.16b, v2.16b, v21.16b
eor v3.16b, v3.16b, v19.16b
and v16.16b, v27.16b, v16.16b
eor v17.16b, v26.16b, v6.16b
eor v6.16b, v8.16b, v7.16b
eor v7.16b, v1.16b, v9.16b
eor v1.16b, v5.16b, v3.16b
eor v2.16b, v10.16b, v3.16b
eor v4.16b, v16.16b, v4.16b
eor v8.16b, v6.16b, v17.16b
eor v5.16b, v9.16b, v3.16b
eor v9.16b, v0.16b, v1.16b
eor v6.16b, v7.16b, v1.16b
eor v0.16b, v4.16b, v17.16b
eor v4.16b, v8.16b, v7.16b
eor v7.16b, v9.16b, v2.16b
eor v8.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v5.16b
eor v3.16b, v4.16b, v7.16b
eor v4.16b, v7.16b, v0.16b
eor v7.16b, v8.16b, v3.16b
bcc .Ldec_done
ext v8.16b, v0.16b, v0.16b, #8
ext v9.16b, v1.16b, v1.16b, #8
ldr q28, [x11] // load from .LISR in common case (x10 > 0)
ext v10.16b, v6.16b, v6.16b, #8
ext v16.16b, v3.16b, v3.16b, #8
ext v17.16b, v5.16b, v5.16b, #8
ext v18.16b, v4.16b, v4.16b, #8
eor v8.16b, v8.16b, v0.16b
eor v9.16b, v9.16b, v1.16b
eor v10.16b, v10.16b, v6.16b
eor v16.16b, v16.16b, v3.16b
eor v17.16b, v17.16b, v5.16b
ext v19.16b, v2.16b, v2.16b, #8
ext v20.16b, v7.16b, v7.16b, #8
eor v18.16b, v18.16b, v4.16b
eor v6.16b, v6.16b, v8.16b
eor v8.16b, v2.16b, v10.16b
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v19.16b, v2.16b
eor v9.16b, v20.16b, v7.16b
eor v0.16b, v0.16b, v16.16b
eor v1.16b, v1.16b, v16.16b
eor v6.16b, v6.16b, v17.16b
eor v8.16b, v8.16b, v16.16b
eor v7.16b, v7.16b, v18.16b
eor v4.16b, v4.16b, v16.16b
eor v2.16b, v3.16b, v2.16b
eor v1.16b, v1.16b, v17.16b
eor v3.16b, v5.16b, v9.16b
eor v5.16b, v8.16b, v17.16b
eor v7.16b, v7.16b, v17.16b
ext v8.16b, v0.16b, v0.16b, #12
ext v9.16b, v6.16b, v6.16b, #12
ext v10.16b, v4.16b, v4.16b, #12
ext v16.16b, v1.16b, v1.16b, #12
ext v17.16b, v5.16b, v5.16b, #12
ext v18.16b, v7.16b, v7.16b, #12
eor v0.16b, v0.16b, v8.16b
eor v6.16b, v6.16b, v9.16b
eor v4.16b, v4.16b, v10.16b
ext v19.16b, v2.16b, v2.16b, #12
ext v20.16b, v3.16b, v3.16b, #12
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v5.16b, v17.16b
eor v7.16b, v7.16b, v18.16b
eor v2.16b, v2.16b, v19.16b
eor v16.16b, v16.16b, v0.16b
eor v3.16b, v3.16b, v20.16b
eor v17.16b, v17.16b, v4.16b
eor v10.16b, v10.16b, v6.16b
ext v0.16b, v0.16b, v0.16b, #8
eor v9.16b, v9.16b, v1.16b
ext v1.16b, v1.16b, v1.16b, #8
eor v8.16b, v8.16b, v3.16b
eor v16.16b, v16.16b, v3.16b
eor v18.16b, v18.16b, v5.16b
eor v19.16b, v19.16b, v7.16b
ext v21.16b, v5.16b, v5.16b, #8
ext v5.16b, v7.16b, v7.16b, #8
eor v7.16b, v20.16b, v2.16b
ext v4.16b, v4.16b, v4.16b, #8
ext v20.16b, v3.16b, v3.16b, #8
eor v17.16b, v17.16b, v3.16b
ext v2.16b, v2.16b, v2.16b, #8
eor v3.16b, v10.16b, v3.16b
ext v10.16b, v6.16b, v6.16b, #8
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v5.16b, v18.16b
eor v3.16b, v3.16b, v4.16b
eor v7.16b, v20.16b, v7.16b
eor v6.16b, v2.16b, v19.16b
eor v4.16b, v21.16b, v17.16b
eor v2.16b, v10.16b, v9.16b
bne .Ldec_loop
ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
b .Ldec_loop
.align 4
.Ldec_done:
ushr v8.2d, v0.2d, #1
movi v9.16b, #0x55
ldr q10, [x9]
ushr v16.2d, v2.2d, #1
movi v17.16b, #0x33
ushr v18.2d, v6.2d, #1
movi v19.16b, #0x0f
eor v8.16b, v8.16b, v1.16b
ushr v20.2d, v3.2d, #1
eor v16.16b, v16.16b, v7.16b
eor v18.16b, v18.16b, v4.16b
and v8.16b, v8.16b, v9.16b
eor v20.16b, v20.16b, v5.16b
and v16.16b, v16.16b, v9.16b
and v18.16b, v18.16b, v9.16b
shl v21.2d, v8.2d, #1
eor v1.16b, v1.16b, v8.16b
and v8.16b, v20.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
shl v9.2d, v16.2d, #1
eor v4.16b, v4.16b, v18.16b
shl v16.2d, v18.2d, #1
eor v0.16b, v0.16b, v21.16b
shl v18.2d, v8.2d, #1
eor v5.16b, v5.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v6.16b, v6.16b, v16.16b
ushr v8.2d, v1.2d, #2
eor v3.16b, v3.16b, v18.16b
ushr v9.2d, v0.2d, #2
ushr v16.2d, v7.2d, #2
ushr v18.2d, v2.2d, #2
eor v8.16b, v8.16b, v4.16b
eor v9.16b, v9.16b, v6.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v16.16b, v16.16b, v17.16b
and v17.16b, v18.16b, v17.16b
eor v4.16b, v4.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v5.16b, v5.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #2
eor v1.16b, v1.16b, v8.16b
eor v0.16b, v0.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
eor v2.16b, v2.16b, v17.16b
ushr v8.2d, v4.2d, #4
ushr v9.2d, v6.2d, #4
ushr v16.2d, v1.2d, #4
ushr v17.2d, v0.2d, #4
eor v8.16b, v8.16b, v5.16b
eor v9.16b, v9.16b, v3.16b
eor v16.16b, v16.16b, v7.16b
eor v17.16b, v17.16b, v2.16b
and v8.16b, v8.16b, v19.16b
and v9.16b, v9.16b, v19.16b
and v16.16b, v16.16b, v19.16b
and v17.16b, v17.16b, v19.16b
eor v5.16b, v5.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v7.16b, v7.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v2.16b, v2.16b, v17.16b
shl v17.2d, v17.2d, #4
eor v4.16b, v4.16b, v8.16b
eor v6.16b, v6.16b, v9.16b
eor v7.16b, v7.16b, v10.16b
eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v4.16b, v4.16b, v10.16b
eor v6.16b, v6.16b, v10.16b
eor v3.16b, v3.16b, v10.16b
eor v5.16b, v5.16b, v10.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v10.16b
ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8

.section .rodata
.type _bsaes_consts,%object
.align 6
_bsaes_consts:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR used with middle round keys
// .LISRM0 used with final round key
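//
// Each pair of quadwords below forms a 16-byte index vector for the
// tbl instruction: entry i selects the source byte that lands in
// position i. The tables therefore implement (Inv)ShiftRows composed
// with the byte ordering expected by the bitslice transforms.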
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02

.align 6
.size _bsaes_consts,.-_bsaes_consts

.previous

.type _bsaes_encrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
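//
// _bsaes_encrypt8_alt is a secondary entry point used by the CTR code:
// it skips the loads below and expects the round 0 key already in v8
// and the initial byte-order permutation (e.g. .LREVM0SR) already in v9.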
_bsaes_encrypt8:
ldr q8, [x9], #16
adrp x11, .LM0SR
add x11, x11, #:lo12:.LM0SR
ldr q9, [x11], #16
_bsaes_encrypt8_alt:
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v8.16b
sub x10, x10, #1
eor v2.16b, v2.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
tbl v0.16b, {v0.16b}, v9.16b
tbl v1.16b, {v1.16b}, v9.16b
tbl v2.16b, {v2.16b}, v9.16b
tbl v4.16b, {v4.16b}, v9.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
tbl v3.16b, {v3.16b}, v9.16b
tbl v5.16b, {v5.16b}, v9.16b
tbl v6.16b, {v6.16b}, v9.16b
ushr v8.2d, v0.2d, #1
movi v10.16b, #0x55
tbl v7.16b, {v7.16b}, v9.16b
ushr v9.2d, v4.2d, #1
movi v16.16b, #0x33
ushr v17.2d, v2.2d, #1
eor v8.16b, v8.16b, v1.16b
movi v18.16b, #0x0f
ushr v19.2d, v6.2d, #1
eor v9.16b, v9.16b, v5.16b
eor v17.16b, v17.16b, v3.16b
and v8.16b, v8.16b, v10.16b
eor v19.16b, v19.16b, v7.16b
and v9.16b, v9.16b, v10.16b
and v17.16b, v17.16b, v10.16b
eor v1.16b, v1.16b, v8.16b
shl v8.2d, v8.2d, #1
and v10.16b, v19.16b, v10.16b
eor v5.16b, v5.16b, v9.16b
shl v9.2d, v9.2d, #1
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #1
eor v0.16b, v0.16b, v8.16b
shl v8.2d, v10.2d, #1
eor v7.16b, v7.16b, v10.16b
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v2.16b, v17.16b
ushr v9.2d, v1.2d, #2
eor v6.16b, v6.16b, v8.16b
ushr v8.2d, v0.2d, #2
ushr v10.2d, v5.2d, #2
ushr v17.2d, v4.2d, #2
eor v9.16b, v9.16b, v3.16b
eor v8.16b, v8.16b, v2.16b
eor v10.16b, v10.16b, v7.16b
eor v17.16b, v17.16b, v6.16b
and v9.16b, v9.16b, v16.16b
and v8.16b, v8.16b, v16.16b
and v10.16b, v10.16b, v16.16b
and v16.16b, v17.16b, v16.16b
eor v3.16b, v3.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v2.16b, v2.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v7.16b, v7.16b, v10.16b
shl v10.2d, v10.2d, #2
eor v6.16b, v6.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v1.16b, v1.16b, v9.16b
eor v0.16b, v0.16b, v8.16b
eor v5.16b, v5.16b, v10.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v3.2d, #4
ushr v9.2d, v2.2d, #4
ushr v10.2d, v1.2d, #4
ushr v16.2d, v0.2d, #4
eor v8.16b, v8.16b, v7.16b
eor v9.16b, v9.16b, v6.16b
eor v10.16b, v10.16b, v5.16b
eor v16.16b, v16.16b, v4.16b
and v8.16b, v8.16b, v18.16b
and v9.16b, v9.16b, v18.16b
and v10.16b, v10.16b, v18.16b
and v16.16b, v16.16b, v18.16b
eor v7.16b, v7.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v6.16b, v6.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v5.16b, v5.16b, v10.16b
shl v10.2d, v10.2d, #4
eor v4.16b, v4.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v16.16b
b .Lenc_sbox
.align 4
.Lenc_loop:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
ldp q8, q9, [x9], #32
eor v0.16b, v16.16b, v0.16b
ldr q10, [x9], #16
eor v1.16b, v17.16b, v1.16b
ldr q16, [x9], #16
eor v2.16b, v18.16b, v2.16b
eor v3.16b, v19.16b, v3.16b
eor v4.16b, v8.16b, v4.16b
eor v5.16b, v9.16b, v5.16b
eor v6.16b, v10.16b, v6.16b
eor v7.16b, v16.16b, v7.16b
tbl v0.16b, {v0.16b}, v28.16b
tbl v1.16b, {v1.16b}, v28.16b
tbl v2.16b, {v2.16b}, v28.16b
tbl v3.16b, {v3.16b}, v28.16b
tbl v4.16b, {v4.16b}, v28.16b
tbl v5.16b, {v5.16b}, v28.16b
tbl v6.16b, {v6.16b}, v28.16b
tbl v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
eor v5.16b, v5.16b, v6.16b
eor v3.16b, v3.16b, v0.16b
subs x10, x10, #1
eor v2.16b, v2.16b, v1.16b
eor v5.16b, v5.16b, v0.16b
eor v8.16b, v3.16b, v7.16b
eor v6.16b, v6.16b, v2.16b
eor v7.16b, v7.16b, v5.16b
eor v8.16b, v8.16b, v4.16b
eor v3.16b, v6.16b, v3.16b
eor v4.16b, v4.16b, v5.16b
eor v6.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v7.16b
eor v1.16b, v8.16b, v1.16b
eor v8.16b, v7.16b, v4.16b
eor v9.16b, v3.16b, v0.16b
eor v10.16b, v7.16b, v6.16b
eor v16.16b, v5.16b, v3.16b
eor v17.16b, v6.16b, v2.16b
eor v18.16b, v5.16b, v1.16b
eor v19.16b, v2.16b, v4.16b
eor v20.16b, v1.16b, v0.16b
orr v21.16b, v8.16b, v9.16b
orr v22.16b, v10.16b, v16.16b
eor v23.16b, v8.16b, v17.16b
eor v24.16b, v9.16b, v18.16b
and v19.16b, v19.16b, v20.16b
orr v20.16b, v17.16b, v18.16b
and v8.16b, v8.16b, v9.16b
and v9.16b, v17.16b, v18.16b
and v17.16b, v23.16b, v24.16b
and v10.16b, v10.16b, v16.16b
eor v16.16b, v21.16b, v19.16b
eor v18.16b, v20.16b, v19.16b
and v19.16b, v2.16b, v1.16b
and v20.16b, v6.16b, v5.16b
eor v21.16b, v22.16b, v17.16b
eor v9.16b, v9.16b, v10.16b
eor v10.16b, v16.16b, v17.16b
eor v16.16b, v18.16b, v8.16b
and v17.16b, v4.16b, v0.16b
orr v18.16b, v7.16b, v3.16b
eor v21.16b, v21.16b, v8.16b
eor v8.16b, v9.16b, v8.16b
eor v9.16b, v10.16b, v19.16b
eor v10.16b, v3.16b, v0.16b
eor v16.16b, v16.16b, v17.16b
eor v17.16b, v5.16b, v1.16b
eor v19.16b, v21.16b, v20.16b
eor v20.16b, v8.16b, v18.16b
eor v8.16b, v8.16b, v18.16b
eor v18.16b, v7.16b, v4.16b
eor v21.16b, v9.16b, v16.16b
eor v22.16b, v6.16b, v2.16b
and v23.16b, v9.16b, v19.16b
eor v24.16b, v10.16b, v17.16b
eor v25.16b, v0.16b, v1.16b
eor v26.16b, v7.16b, v6.16b
eor v27.16b, v18.16b, v22.16b
eor v28.16b, v3.16b, v5.16b
eor v29.16b, v16.16b, v23.16b
eor v30.16b, v20.16b, v23.16b
eor v23.16b, v20.16b, v23.16b
eor v31.16b, v4.16b, v2.16b
bsl v29.16b, v19.16b, v20.16b
bsl v30.16b, v9.16b, v16.16b
bsl v8.16b, v29.16b, v23.16b
bsl v20.16b, v23.16b, v29.16b
eor v9.16b, v30.16b, v29.16b
and v5.16b, v5.16b, v30.16b
and v8.16b, v8.16b, v30.16b
and v1.16b, v1.16b, v29.16b
eor v16.16b, v19.16b, v20.16b
and v2.16b, v2.16b, v29.16b
eor v19.16b, v9.16b, v29.16b
and v17.16b, v17.16b, v9.16b
eor v8.16b, v8.16b, v21.16b
and v20.16b, v22.16b, v9.16b
eor v21.16b, v29.16b, v16.16b
eor v22.16b, v29.16b, v16.16b
and v23.16b, v25.16b, v16.16b
and v6.16b, v6.16b, v19.16b
eor v25.16b, v8.16b, v16.16b
eor v29.16b, v30.16b, v8.16b
and v4.16b, v21.16b, v4.16b
and v8.16b, v28.16b, v8.16b
and v0.16b, v22.16b, v0.16b
eor v21.16b, v23.16b, v1.16b
eor v22.16b, v9.16b, v25.16b
eor v9.16b, v9.16b, v25.16b
eor v23.16b, v25.16b, v16.16b
and v3.16b, v29.16b, v3.16b
and v24.16b, v24.16b, v25.16b
and v25.16b, v27.16b, v25.16b
and v10.16b, v22.16b, v10.16b
and v9.16b, v9.16b, v18.16b
eor v18.16b, v19.16b, v23.16b
and v19.16b, v26.16b, v23.16b
eor v3.16b, v5.16b, v3.16b
eor v17.16b, v17.16b, v24.16b
eor v10.16b, v24.16b, v10.16b
and v16.16b, v31.16b, v16.16b
eor v20.16b, v20.16b, v25.16b
eor v9.16b, v25.16b, v9.16b
eor v4.16b, v2.16b, v4.16b
and v7.16b, v18.16b, v7.16b
eor v18.16b, v19.16b, v6.16b
eor v5.16b, v8.16b, v5.16b
eor v0.16b, v1.16b, v0.16b
eor v1.16b, v21.16b, v10.16b
eor v8.16b, v3.16b, v17.16b
eor v2.16b, v16.16b, v2.16b
eor v3.16b, v6.16b, v7.16b
eor v6.16b, v18.16b, v9.16b
eor v4.16b, v4.16b, v20.16b
eor v10.16b, v5.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v9.16b, v2.16b, v9.16b
eor v3.16b, v3.16b, v20.16b
eor v7.16b, v6.16b, v1.16b
eor v5.16b, v8.16b, v4.16b
eor v6.16b, v10.16b, v1.16b
eor v2.16b, v4.16b, v0.16b
eor v4.16b, v3.16b, v10.16b
eor v9.16b, v9.16b, v7.16b
eor v3.16b, v0.16b, v5.16b
eor v0.16b, v1.16b, v4.16b
eor v1.16b, v4.16b, v8.16b
eor v4.16b, v9.16b, v5.16b
eor v6.16b, v6.16b, v3.16b
bcc .Lenc_done
ext v8.16b, v0.16b, v0.16b, #12
ext v9.16b, v4.16b, v4.16b, #12
ldr q28, [x11]
ext v10.16b, v6.16b, v6.16b, #12
ext v16.16b, v1.16b, v1.16b, #12
ext v17.16b, v3.16b, v3.16b, #12
ext v18.16b, v7.16b, v7.16b, #12
eor v0.16b, v0.16b, v8.16b
eor v4.16b, v4.16b, v9.16b
eor v6.16b, v6.16b, v10.16b
ext v19.16b, v2.16b, v2.16b, #12
ext v20.16b, v5.16b, v5.16b, #12
eor v1.16b, v1.16b, v16.16b
eor v3.16b, v3.16b, v17.16b
eor v7.16b, v7.16b, v18.16b
eor v2.16b, v2.16b, v19.16b
eor v16.16b, v16.16b, v0.16b
eor v5.16b, v5.16b, v20.16b
eor v17.16b, v17.16b, v6.16b
eor v10.16b, v10.16b, v4.16b
ext v0.16b, v0.16b, v0.16b, #8
eor v9.16b, v9.16b, v1.16b
ext v1.16b, v1.16b, v1.16b, #8
eor v8.16b, v8.16b, v5.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v3.16b
eor v19.16b, v19.16b, v7.16b
ext v3.16b, v3.16b, v3.16b, #8
ext v7.16b, v7.16b, v7.16b, #8
eor v20.16b, v20.16b, v2.16b
ext v6.16b, v6.16b, v6.16b, #8
ext v21.16b, v5.16b, v5.16b, #8
eor v17.16b, v17.16b, v5.16b
ext v2.16b, v2.16b, v2.16b, #8
eor v10.16b, v10.16b, v5.16b
ext v22.16b, v4.16b, v4.16b, #8
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v16.16b
eor v5.16b, v7.16b, v18.16b
eor v4.16b, v3.16b, v17.16b
eor v3.16b, v6.16b, v10.16b
eor v7.16b, v21.16b, v20.16b
eor v6.16b, v2.16b, v19.16b
eor v2.16b, v22.16b, v9.16b
bne .Lenc_loop
ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
b .Lenc_loop
.align 4
.Lenc_done:
ushr v8.2d, v0.2d, #1
movi v9.16b, #0x55
ldr q10, [x9]
ushr v16.2d, v3.2d, #1
movi v17.16b, #0x33
ushr v18.2d, v4.2d, #1
movi v19.16b, #0x0f
eor v8.16b, v8.16b, v1.16b
ushr v20.2d, v2.2d, #1
eor v16.16b, v16.16b, v7.16b
eor v18.16b, v18.16b, v6.16b
and v8.16b, v8.16b, v9.16b
eor v20.16b, v20.16b, v5.16b
and v16.16b, v16.16b, v9.16b
and v18.16b, v18.16b, v9.16b
shl v21.2d, v8.2d, #1
eor v1.16b, v1.16b, v8.16b
and v8.16b, v20.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
shl v9.2d, v16.2d, #1
eor v6.16b, v6.16b, v18.16b
shl v16.2d, v18.2d, #1
eor v0.16b, v0.16b, v21.16b
shl v18.2d, v8.2d, #1
eor v5.16b, v5.16b, v8.16b
eor v3.16b, v3.16b, v9.16b
eor v4.16b, v4.16b, v16.16b
ushr v8.2d, v1.2d, #2
eor v2.16b, v2.16b, v18.16b
ushr v9.2d, v0.2d, #2
ushr v16.2d, v7.2d, #2
ushr v18.2d, v3.2d, #2
eor v8.16b, v8.16b, v6.16b
eor v9.16b, v9.16b, v4.16b
eor v16.16b, v16.16b, v5.16b
eor v18.16b, v18.16b, v2.16b
and v8.16b, v8.16b, v17.16b
and v9.16b, v9.16b, v17.16b
and v16.16b, v16.16b, v17.16b
and v17.16b, v18.16b, v17.16b
eor v6.16b, v6.16b, v8.16b
shl v8.2d, v8.2d, #2
eor v4.16b, v4.16b, v9.16b
shl v9.2d, v9.2d, #2
eor v5.16b, v5.16b, v16.16b
shl v16.2d, v16.2d, #2
eor v2.16b, v2.16b, v17.16b
shl v17.2d, v17.2d, #2
eor v1.16b, v1.16b, v8.16b
eor v0.16b, v0.16b, v9.16b
eor v7.16b, v7.16b, v16.16b
eor v3.16b, v3.16b, v17.16b
ushr v8.2d, v6.2d, #4
ushr v9.2d, v4.2d, #4
ushr v16.2d, v1.2d, #4
ushr v17.2d, v0.2d, #4
eor v8.16b, v8.16b, v5.16b
eor v9.16b, v9.16b, v2.16b
eor v16.16b, v16.16b, v7.16b
eor v17.16b, v17.16b, v3.16b
and v8.16b, v8.16b, v19.16b
and v9.16b, v9.16b, v19.16b
and v16.16b, v16.16b, v19.16b
and v17.16b, v17.16b, v19.16b
eor v5.16b, v5.16b, v8.16b
shl v8.2d, v8.2d, #4
eor v2.16b, v2.16b, v9.16b
shl v9.2d, v9.2d, #4
eor v7.16b, v7.16b, v16.16b
shl v16.2d, v16.2d, #4
eor v3.16b, v3.16b, v17.16b
shl v17.2d, v17.2d, #4
eor v6.16b, v6.16b, v8.16b
eor v4.16b, v4.16b, v9.16b
eor v7.16b, v7.16b, v10.16b
eor v1.16b, v1.16b, v16.16b
eor v3.16b, v3.16b, v10.16b
eor v0.16b, v0.16b, v17.16b
eor v6.16b, v6.16b, v10.16b
eor v4.16b, v4.16b, v10.16b
eor v2.16b, v2.16b, v10.16b
eor v5.16b, v5.16b, v10.16b
eor v1.16b, v1.16b, v10.16b
eor v0.16b, v0.16b, v10.16b
ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8

.type _bsaes_key_convert,%function
.align 4
// On entry:
// x9 -> input key (big-endian)
// x10 = number of rounds
// x17 -> output key (native endianness)
// On exit:
// x9, x10 corrupted
// x11 -> .LM0_bigendian
// x17 -> last quadword of output key
// other general-purpose registers preserved
// v2-v6 preserved
// v7.16b[] = 0x63
// v8-v14 preserved
// v15 = last round key (converted to native endianness)
// other SIMD registers corrupted
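//
// The eight cmtst instructions in .Lkey_loop are what bit-slice each
// round key: testing the permuted, 0x63-masked key bytes against the
// single-bit masks 0x01..0x80 expands every key bit into a full byte
// mask, yielding the eight 16-byte vectors (128 bytes) that
// _bsaes_encrypt8/_bsaes_decrypt8 consume per inner round.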
_bsaes_key_convert:
#ifdef __AARCH64EL__
adrp x11, .LM0_littleendian
add x11, x11, #:lo12:.LM0_littleendian
#else
adrp x11, .LM0_bigendian
add x11, x11, #:lo12:.LM0_bigendian
#endif
ldr q0, [x9], #16 // load round 0 key
ldr q1, [x11] // .LM0
ldr q15, [x9], #16 // load round 1 key

movi v7.16b, #0x63 // compose .L63
movi v16.16b, #0x01 // bit masks
movi v17.16b, #0x02
movi v18.16b, #0x04
movi v19.16b, #0x08
movi v20.16b, #0x10
movi v21.16b, #0x20
movi v22.16b, #0x40
movi v23.16b, #0x80

#ifdef __AARCH64EL__
rev32 v0.16b, v0.16b
#endif
sub x10, x10, #1
str q0, [x17], #16 // save round 0 key

.align 4
.Lkey_loop:
tbl v0.16b, {v15.16b}, v1.16b
ldr q15, [x9], #16 // load next round key

eor v0.16b, v0.16b, v7.16b
cmtst v24.16b, v0.16b, v16.16b
cmtst v25.16b, v0.16b, v17.16b
cmtst v26.16b, v0.16b, v18.16b
cmtst v27.16b, v0.16b, v19.16b
cmtst v28.16b, v0.16b, v20.16b
cmtst v29.16b, v0.16b, v21.16b
cmtst v30.16b, v0.16b, v22.16b
cmtst v31.16b, v0.16b, v23.16b
sub x10, x10, #1
st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key
st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64
cbnz x10, .Lkey_loop

// don't save last round key
#ifdef __AARCH64EL__
rev32 v15.16b, v15.16b
adrp x11, .LM0_bigendian
add x11, x11, #:lo12:.LM0_bigendian
#endif
ret
.size _bsaes_key_convert,.-_bsaes_key_convert

.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
// x3 -> key
// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
// w5 must be == 0
// On exit:
// Output plaintext filled in
// Initialisation vector overwritten with last quadword of ciphertext
// No output registers, usual AAPCS64 register preservation
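//
// For reference, the corresponding C-level prototype on the OpenSSL
// side is roughly (a sketch; consult the OpenSSL headers for the
// authoritative declaration):
//
//   void ossl_bsaes_cbc_encrypt(const unsigned char *in,
//                               unsigned char *out, size_t length,
//                               const AES_KEY *key,
//                               unsigned char ivec[16], int enc);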
ossl_bsaes_cbc_encrypt:
AARCH64_VALID_CALL_TARGET
cmp x2, #128
bhs .Lcbc_do_bsaes
b AES_cbc_encrypt
.Lcbc_do_bsaes:

// it is up to the caller to make sure we are called with enc == 0

stp x29, x30, [sp, #-48]!
stp d8, d9, [sp, #16]
stp d10, d15, [sp, #32]
lsr x2, x2, #4 // len in 16 byte blocks

ldr w15, [x3, #240] // get # of rounds
mov x14, sp

// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes

// populate the key schedule
mov x9, x3 // pass key
mov x10, x15 // pass # of rounds
mov sp, x17 // sp is sp
bl _bsaes_key_convert
ldr q6, [sp]
str q15, [x17] // save last round key
eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
str q6, [sp]

ldr q15, [x4] // load IV
b .Lcbc_dec_loop

.align 4
.Lcbc_dec_loop:
subs x2, x2, #0x8
bmi .Lcbc_dec_loop_finish

ldr q0, [x0], #16 // load input
mov x9, sp // pass the key
ldr q1, [x0], #16
mov x10, x15
ldr q2, [x0], #16
ldr q3, [x0], #16
ldr q4, [x0], #16
ldr q5, [x0], #16
ldr q6, [x0], #16
ldr q7, [x0], #-7*16

bl _bsaes_decrypt8

ldr q16, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
eor v1.16b, v1.16b, v16.16b
str q0, [x1], #16 // write output
ldr q0, [x0], #16
str q1, [x1], #16
ldr q1, [x0], #16
eor v1.16b, v4.16b, v1.16b
ldr q4, [x0], #16
eor v2.16b, v2.16b, v4.16b
eor v0.16b, v6.16b, v0.16b
ldr q4, [x0], #16
str q0, [x1], #16
str q1, [x1], #16
eor v0.16b, v7.16b, v4.16b
ldr q1, [x0], #16
str q2, [x1], #16
ldr q2, [x0], #16
ldr q15, [x0], #16
str q0, [x1], #16
eor v0.16b, v5.16b, v2.16b
eor v1.16b, v3.16b, v1.16b
str q1, [x1], #16
str q0, [x1], #16

b .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
adds x2, x2, #8
beq .Lcbc_dec_done

ldr q0, [x0], #16 // load input
cmp x2, #2
blo .Lcbc_dec_one
ldr q1, [x0], #16
mov x9, sp // pass the key
mov x10, x15
beq .Lcbc_dec_two
ldr q2, [x0], #16
cmp x2, #4
blo .Lcbc_dec_three
ldr q3, [x0], #16
beq .Lcbc_dec_four
ldr q4, [x0], #16
cmp x2, #6
blo .Lcbc_dec_five
ldr q5, [x0], #16
beq .Lcbc_dec_six
ldr q6, [x0], #-6*16

bl _bsaes_decrypt8

ldr q5, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q8, [x0], #16
ldr q9, [x0], #16
ldr q10, [x0], #16
str q0, [x1], #16 // write output
ldr q0, [x0], #16
eor v1.16b, v1.16b, v5.16b
ldr q5, [x0], #16
eor v6.16b, v6.16b, v8.16b
ldr q15, [x0]
eor v4.16b, v4.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
str q1, [x1], #16
eor v0.16b, v7.16b, v0.16b
str q6, [x1], #16
eor v1.16b, v3.16b, v5.16b
str q4, [x1], #16
str q2, [x1], #16
str q0, [x1], #16
str q1, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
sub x0, x0, #0x60
bl _bsaes_decrypt8
ldr q3, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q5, [x0], #16
ldr q8, [x0], #16
ldr q9, [x0], #16
str q0, [x1], #16 // write output
ldr q0, [x0], #16
eor v1.16b, v1.16b, v3.16b
ldr q15, [x0]
eor v3.16b, v6.16b, v5.16b
eor v4.16b, v4.16b, v8.16b
eor v2.16b, v2.16b, v9.16b
str q1, [x1], #16
eor v0.16b, v7.16b, v0.16b
str q3, [x1], #16
str q4, [x1], #16
str q2, [x1], #16
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
sub x0, x0, #0x50
bl _bsaes_decrypt8
ldr q3, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q5, [x0], #16
ldr q7, [x0], #16
ldr q8, [x0], #16
str q0, [x1], #16 // write output
ldr q15, [x0]
eor v0.16b, v1.16b, v3.16b
eor v1.16b, v6.16b, v5.16b
eor v3.16b, v4.16b, v7.16b
str q0, [x1], #16
eor v0.16b, v2.16b, v8.16b
str q1, [x1], #16
str q3, [x1], #16
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
sub x0, x0, #0x40
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q3, [x0], #16
ldr q5, [x0], #16
str q0, [x1], #16 // write output
ldr q15, [x0]
eor v0.16b, v1.16b, v2.16b
eor v1.16b, v6.16b, v3.16b
eor v2.16b, v4.16b, v5.16b
str q0, [x1], #16
str q1, [x1], #16
str q2, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
sub x0, x0, #0x30
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q3, [x0], #16
ldr q15, [x0]
str q0, [x1], #16 // write output
eor v0.16b, v1.16b, v2.16b
eor v1.16b, v6.16b, v3.16b
str q0, [x1], #16
str q1, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
sub x0, x0, #0x20
bl _bsaes_decrypt8
ldr q2, [x0], #16 // reload input
eor v0.16b, v0.16b, v15.16b // ^= IV
ldr q15, [x0]
str q0, [x1], #16 // write output
eor v0.16b, v1.16b, v2.16b
str q0, [x1]
b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
sub x0, x0, #0x10
stp x1, x4, [sp, #-32]!
str x14, [sp, #16]
mov v8.16b, v15.16b
mov v15.16b, v0.16b
mov x2, x3
bl AES_decrypt
ldr x14, [sp, #16]
ldp x1, x4, [sp], #32
ldr q0, [x1] // load result
eor v0.16b, v0.16b, v8.16b // ^= IV
str q0, [x1] // write output

.align 4
.Lcbc_dec_done:
movi v0.16b, #0
movi v1.16b, #0
.Lcbc_dec_bzero: // wipe key schedule [if any]
stp q0, q1, [sp], #32
cmp sp, x14
bne .Lcbc_dec_bzero
str q15, [x4] // return IV
ldp d8, d9, [sp, #16]
ldp d10, d15, [sp, #32]
ldp x29, x30, [sp], #48
ret
.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl ossl_bsaes_ctr32_encrypt_blocks
.type ossl_bsaes_ctr32_encrypt_blocks,%function
.align 4
// On entry:
// x0 -> input text (whole 16-byte blocks)
// x1 -> output text (whole 16-byte blocks)
// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
// x3 -> key
// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
// Output text filled in
// No output registers, usual AAPCS64 register preservation
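//
// The corresponding C-level prototype is roughly (a sketch; see the
// OpenSSL headers for the authoritative declaration):
//
//   void ossl_bsaes_ctr32_encrypt_blocks(const unsigned char *in,
//                                        unsigned char *out, size_t len,
//                                        const AES_KEY *key,
//                                        const unsigned char ivec[16]);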
ossl_bsaes_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
cmp x2, #8 // use plain AES for
blo .Lctr_enc_short // small sizes

stp x29, x30, [sp, #-80]!
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
stp d14, d15, [sp, #64]

ldr w15, [x3, #240] // get # of rounds
mov x14, sp

// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes

// populate the key schedule
mov x9, x3 // pass key
mov x10, x15 // pass # of rounds
mov sp, x17 // sp is sp
bl _bsaes_key_convert
eor v7.16b, v7.16b, v15.16b // fix up last round key
str q7, [x17] // save last round key

ldr q0, [x4] // load counter
add x13, x11, #.LREVM0SR-.LM0_bigendian
ldr q4, [sp] // load round0 key

movi v8.4s, #1 // compose 1<<96
movi v9.16b, #0
rev32 v15.16b, v0.16b
rev32 v0.16b, v0.16b
ext v11.16b, v9.16b, v8.16b, #4
rev32 v4.16b, v4.16b
add v12.4s, v11.4s, v11.4s // compose 2<<96
str q4, [sp] // save adjusted round0 key
add v13.4s, v11.4s, v12.4s // compose 3<<96
add v14.4s, v12.4s, v12.4s // compose 4<<96
b .Lctr_enc_loop

.align 4
.Lctr_enc_loop:
// Intermix prologue from _bsaes_encrypt8 to use the opportunity
// to flip byte order in 32-bit counter

add v1.4s, v15.4s, v11.4s // +1
add x9, sp, #0x10 // pass next round key
add v2.4s, v15.4s, v12.4s // +2
ldr q9, [x13] // .LREVM0SR
ldr q8, [sp] // load round0 key
add v3.4s, v15.4s, v13.4s // +3
mov x10, x15 // pass rounds
sub x11, x13, #.LREVM0SR-.LSR // pass constants
add v6.4s, v2.4s, v14.4s
add v4.4s, v15.4s, v14.4s // +4
add v7.4s, v3.4s, v14.4s
add v15.4s, v4.4s, v14.4s // next counter
add v5.4s, v1.4s, v14.4s

bl _bsaes_encrypt8_alt

subs x2, x2, #8
blo .Lctr_enc_loop_done

ldr q16, [x0], #16
ldr q17, [x0], #16
eor v1.16b, v1.16b, v17.16b
ldr q17, [x0], #16
eor v0.16b, v0.16b, v16.16b
eor v4.16b, v4.16b, v17.16b
str q0, [x1], #16
ldr q16, [x0], #16
str q1, [x1], #16
mov v0.16b, v15.16b
str q4, [x1], #16
ldr q1, [x0], #16
eor v4.16b, v6.16b, v16.16b
eor v1.16b, v3.16b, v1.16b
ldr q3, [x0], #16
eor v3.16b, v7.16b, v3.16b
ldr q6, [x0], #16
eor v2.16b, v2.16b, v6.16b
ldr q6, [x0], #16
eor v5.16b, v5.16b, v6.16b
str q4, [x1], #16
str q1, [x1], #16
str q3, [x1], #16
str q2, [x1], #16
str q5, [x1], #16

bne .Lctr_enc_loop
b .Lctr_enc_done

.align 4
.Lctr_enc_loop_done:
add x2, x2, #8
ldr q16, [x0], #16 // load input
eor v0.16b, v0.16b, v16.16b
str q0, [x1], #16 // write output
cmp x2, #2
blo .Lctr_enc_done
ldr q17, [x0], #16
eor v1.16b, v1.16b, v17.16b
str q1, [x1], #16
beq .Lctr_enc_done
ldr q18, [x0], #16
eor v4.16b, v4.16b, v18.16b
str q4, [x1], #16
cmp x2, #4
blo .Lctr_enc_done
ldr q19, [x0], #16
eor v6.16b, v6.16b, v19.16b
str q6, [x1], #16
beq .Lctr_enc_done
ldr q20, [x0], #16
eor v3.16b, v3.16b, v20.16b
str q3, [x1], #16
cmp x2, #6
blo .Lctr_enc_done
ldr q21, [x0], #16
eor v7.16b, v7.16b, v21.16b
str q7, [x1], #16
beq .Lctr_enc_done
ldr q22, [x0]
eor v2.16b, v2.16b, v22.16b
str q2, [x1], #16

.Lctr_enc_done:
movi v0.16b, #0
movi v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
stp q0, q1, [sp], #32
cmp sp, x14
bne .Lctr_enc_bzero

ldp d8, d9, [sp, #16]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #48]
ldp d14, d15, [sp, #64]
ldp x29, x30, [sp], #80
ret

.Lctr_enc_short:
stp x29, x30, [sp, #-96]!
stp x19, x20, [sp, #16]
stp x21, x22, [sp, #32]
str x23, [sp, #48]

mov x19, x0 // copy arguments
mov x20, x1
mov x21, x2
mov x22, x3
ldr w23, [x4, #12] // load counter .LSW
ldr q1, [x4] // load whole counter value
#ifdef __AARCH64EL__
rev w23, w23
#endif
str q1, [sp, #80] // copy counter value

.Lctr_enc_short_loop:
add x0, sp, #80 // input counter value
add x1, sp, #64 // output on the stack
mov x2, x22 // key

bl AES_encrypt

ldr q0, [x19], #16 // load input
ldr q1, [sp, #64] // load encrypted counter
add x23, x23, #1
#ifdef __AARCH64EL__
rev w0, w23
str w0, [sp, #80+12] // next counter value
#else
str w23, [sp, #80+12] // next counter value
#endif
eor v0.16b, v0.16b, v1.16b
str q0, [x20], #16 // store output
subs x21, x21, #1
bne .Lctr_enc_short_loop

movi v0.16b, #0
movi v1.16b, #0
stp q0, q1, [sp, #64]

ldr x23, [sp, #48]
ldp x21, x22, [sp, #32]
ldp x19, x20, [sp, #16]
ldp x29, x30, [sp], #96
ret
.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl ossl_bsaes_xts_encrypt
.type ossl_bsaes_xts_encrypt,%function
.align 4
// On entry:
// x0 -> input plaintext
// x1 -> output ciphertext
// x2 = length of text in bytes (must be at least 16)
// x3 -> key1 (used to encrypt the XORed plaintext blocks)
// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
// x5 -> 16-byte initial vector (typically, sector number)
// On exit:
// Output ciphertext filled in
// No output registers, usual AAPCS64 register preservation
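//
// The corresponding C-level prototype is roughly (a sketch; see the
// OpenSSL headers for the authoritative declaration):
//
//   void ossl_bsaes_xts_encrypt(const unsigned char *inp,
//                               unsigned char *out, size_t len,
//                               const AES_KEY *key1, const AES_KEY *key2,
//                               const unsigned char iv[16]);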
ossl_bsaes_xts_encrypt:
AARCH64_VALID_CALL_TARGET
// Stack layout:
// sp ->
// nrounds*128-96 bytes: key schedule
// x19 ->
// 16 bytes: frame record
// 4*16 bytes: tweak storage across _bsaes_encrypt8
// 6*8 bytes: storage for 5 callee-saved general-purpose registers
// 8*8 bytes: storage for 8 callee-saved SIMD registers
stp x29, x30, [sp, #-192]!
stp x19, x20, [sp, #80]
stp x21, x22, [sp, #96]
str x23, [sp, #112]
stp d8, d9, [sp, #128]
stp d10, d11, [sp, #144]
stp d12, d13, [sp, #160]
stp d14, d15, [sp, #176]

mov x19, sp
mov x20, x0
mov x21, x1
mov x22, x2
mov x23, x3

// generate initial tweak
sub sp, sp, #16
mov x0, x5 // iv[]
mov x1, sp
mov x2, x4 // key2
bl AES_encrypt
ldr q11, [sp], #16

ldr w1, [x23, #240] // get # of rounds
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes

// populate the key schedule
mov x9, x23 // pass key
mov x10, x1 // pass # of rounds
mov sp, x17
bl _bsaes_key_convert
eor v15.16b, v15.16b, v7.16b // fix up last round key
str q15, [x17] // save last round key

subs x22, x22, #0x80
blo .Lxts_enc_short
b .Lxts_enc_loop

.align 4
.Lxts_enc_loop:
ldr q8, .Lxts_magic
mov x10, x1 // pass rounds
add x2, x19, #16
ldr q0, [x20], #16
sshr v1.2d, v11.2d, #63
mov x9, sp // pass key schedule
ldr q6, .Lxts_magic+16
add v2.2d, v11.2d, v11.2d
cmtst v3.2d, v11.2d, v6.2d
and v1.16b, v1.16b, v8.16b
ext v1.16b, v1.16b, v1.16b, #8
and v3.16b, v3.16b, v8.16b
ldr q4, [x20], #16
eor v12.16b, v2.16b, v1.16b
eor v1.16b, v4.16b, v12.16b
eor v0.16b, v0.16b, v11.16b
cmtst v2.2d, v12.2d, v6.2d
add v4.2d, v12.2d, v12.2d
add x0, x19, #16
ext v3.16b, v3.16b, v3.16b, #8
and v2.16b, v2.16b, v8.16b
eor v13.16b, v4.16b, v3.16b
ldr q3, [x20], #16
ext v4.16b, v2.16b, v2.16b, #8
eor v2.16b, v3.16b, v13.16b
ldr q3, [x20], #16
add v5.2d, v13.2d, v13.2d
cmtst v7.2d, v13.2d, v6.2d
and v7.16b, v7.16b, v8.16b
ldr q9, [x20], #16
ext v7.16b, v7.16b, v7.16b, #8
ldr q10, [x20], #16
eor v14.16b, v5.16b, v4.16b
ldr q16, [x20], #16
add v4.2d, v14.2d, v14.2d
eor v3.16b, v3.16b, v14.16b
eor v15.16b, v4.16b, v7.16b
add v5.2d, v15.2d, v15.2d
ldr q7, [x20], #16
cmtst v4.2d, v14.2d, v6.2d
and v17.16b, v4.16b, v8.16b
cmtst v18.2d, v15.2d, v6.2d
eor v4.16b, v9.16b, v15.16b
ext v9.16b, v17.16b, v17.16b, #8
eor v9.16b, v5.16b, v9.16b
add v17.2d, v9.2d, v9.2d
and v18.16b, v18.16b, v8.16b
eor v5.16b, v10.16b, v9.16b
str q9, [x2], #16
ext v10.16b, v18.16b, v18.16b, #8
cmtst v9.2d, v9.2d, v6.2d
and v9.16b, v9.16b, v8.16b
eor v10.16b, v17.16b, v10.16b
cmtst v17.2d, v10.2d, v6.2d
eor v6.16b, v16.16b, v10.16b
str q10, [x2], #16
ext v9.16b, v9.16b, v9.16b, #8
add v10.2d, v10.2d, v10.2d
eor v9.16b, v10.16b, v9.16b
str q9, [x2], #16
eor v7.16b, v7.16b, v9.16b
add v9.2d, v9.2d, v9.2d
and v8.16b, v17.16b, v8.16b
ext v8.16b, v8.16b, v8.16b, #8
eor v8.16b, v9.16b, v8.16b
str q8, [x2] // next round tweak

bl _bsaes_encrypt8

ldr q8, [x0], #16
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
ldr q9, [x0], #16
eor v4.16b, v4.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
ldr q10, [x0], #16
eor v3.16b, v3.16b, v15.16b
subs x22, x22, #0x80
str q0, [x21], #16
ldr q11, [x0] // next round tweak
str q1, [x21], #16
eor v0.16b, v7.16b, v8.16b
eor v1.16b, v2.16b, v9.16b
str q4, [x21], #16
eor v2.16b, v5.16b, v10.16b
str q6, [x21], #16
str q3, [x21], #16
str q0, [x21], #16
str q1, [x21], #16
str q2, [x21], #16
bpl .Lxts_enc_loop

.Lxts_enc_short:
adds x22, x22, #0x70
bmi .Lxts_enc_done

ldr q8, .Lxts_magic
sshr v1.2d, v11.2d, #63
add v2.2d, v11.2d, v11.2d
ldr q9, .Lxts_magic+16
subs x22, x22, #0x10
ldr q0, [x20], #16
and v1.16b, v1.16b, v8.16b
cmtst v3.2d, v11.2d, v9.2d
ext v1.16b, v1.16b, v1.16b, #8
and v3.16b, v3.16b, v8.16b
eor v12.16b, v2.16b, v1.16b
ext v1.16b, v3.16b, v3.16b, #8
add v2.2d, v12.2d, v12.2d
cmtst v3.2d, v12.2d, v9.2d
eor v13.16b, v2.16b, v1.16b
and v22.16b, v3.16b, v8.16b
bmi .Lxts_enc_1

ext v2.16b, v22.16b, v22.16b, #8
add v3.2d, v13.2d, v13.2d
ldr q1, [x20], #16
cmtst v4.2d, v13.2d, v9.2d
subs x22, x22, #0x10
eor v14.16b, v3.16b, v2.16b
and v23.16b, v4.16b, v8.16b
bmi .Lxts_enc_2

ext v3.16b, v23.16b, v23.16b, #8
add v4.2d, v14.2d, v14.2d
ldr q2, [x20], #16
cmtst v5.2d, v14.2d, v9.2d
eor v0.16b, v0.16b, v11.16b
subs x22, x22, #0x10
eor v15.16b, v4.16b, v3.16b
and v24.16b, v5.16b, v8.16b
bmi .Lxts_enc_3

ext v4.16b, v24.16b, v24.16b, #8
add v5.2d, v15.2d, v15.2d
ldr q3, [x20], #16
cmtst v6.2d, v15.2d, v9.2d
eor v1.16b, v1.16b, v12.16b
subs x22, x22, #0x10
eor v16.16b, v5.16b, v4.16b
and v25.16b, v6.16b, v8.16b
bmi .Lxts_enc_4

ext v5.16b, v25.16b, v25.16b, #8
add v6.2d, v16.2d, v16.2d
add x0, x19, #16
cmtst v7.2d, v16.2d, v9.2d
ldr q4, [x20], #16
eor v2.16b, v2.16b, v13.16b
str q16, [x0], #16
subs x22, x22, #0x10
eor v17.16b, v6.16b, v5.16b
and v26.16b, v7.16b, v8.16b
bmi .Lxts_enc_5

ext v7.16b, v26.16b, v26.16b, #8
add v18.2d, v17.2d, v17.2d
ldr q5, [x20], #16
eor v3.16b, v3.16b, v14.16b
str q17, [x0], #16
subs x22, x22, #0x10
eor v18.16b, v18.16b, v7.16b
bmi .Lxts_enc_6

ldr q6, [x20], #16
eor v4.16b, v4.16b, v15.16b
eor v5.16b, v5.16b, v16.16b
str q18, [x0] // next round tweak
mov x9, sp // pass key schedule
mov x10, x1
add x0, x19, #16
sub x22, x22, #0x10
eor v6.16b, v6.16b, v17.16b

bl _bsaes_encrypt8

ldr q16, [x0], #16
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
ldr q17, [x0], #16
eor v4.16b, v4.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v3.16b, v3.16b, v15.16b
ldr q11, [x0] // next round tweak
str q0, [x21], #16
str q1, [x21], #16
eor v0.16b, v7.16b, v16.16b
eor v1.16b, v2.16b, v17.16b
str q4, [x21], #16
str q6, [x21], #16
str q3, [x21], #16
str q0, [x21], #16
str q1, [x21], #16
b .Lxts_enc_done

.align 4
.Lxts_enc_6:
eor v4.16b, v4.16b, v15.16b
eor v5.16b, v5.16b, v16.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_encrypt8

ldr q16, [x0], #16
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
eor v4.16b, v4.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
ldr q11, [x0] // next round tweak
eor v3.16b, v3.16b, v15.16b
str q0, [x21], #16
str q1, [x21], #16
eor v0.16b, v7.16b, v16.16b
str q4, [x21], #16
str q6, [x21], #16
str q3, [x21], #16
str q0, [x21], #16
b .Lxts_enc_done

.align 4
.Lxts_enc_5:
eor v3.16b, v3.16b, v14.16b
eor v4.16b, v4.16b, v15.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_encrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
ldr q11, [x0] // next round tweak
eor v4.16b, v4.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v3.16b, v3.16b, v15.16b
str q0, [x21], #16
str q1, [x21], #16
str q4, [x21], #16
str q6, [x21], #16
str q3, [x21], #16
b .Lxts_enc_done

.align 4
.Lxts_enc_4:
eor v2.16b, v2.16b, v13.16b
eor v3.16b, v3.16b, v14.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_encrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
eor v4.16b, v4.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
mov v11.16b, v15.16b // next round tweak
str q0, [x21], #16
str q1, [x21], #16
str q4, [x21], #16
str q6, [x21], #16
b .Lxts_enc_done

.align 4
.Lxts_enc_3:
eor v1.16b, v1.16b, v12.16b
eor v2.16b, v2.16b, v13.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_encrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
eor v4.16b, v4.16b, v13.16b
mov v11.16b, v14.16b // next round tweak
str q0, [x21], #16
str q1, [x21], #16
str q4, [x21], #16
b .Lxts_enc_done

.align 4
.Lxts_enc_2:
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_encrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
mov v11.16b, v13.16b // next round tweak
str q0, [x21], #16
str q1, [x21], #16
b .Lxts_enc_done

.align 4
.Lxts_enc_1:
eor v0.16b, v0.16b, v11.16b
sub x0, sp, #16
sub x1, sp, #16
mov x2, x23
mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
mov v14.d[0], v12.d[1]
str q0, [sp, #-16]!

bl AES_encrypt

ldr q0, [sp], #16
trn1 v13.2d, v11.2d, v13.2d
trn1 v11.2d, v12.2d, v14.2d // next round tweak
eor v0.16b, v0.16b, v13.16b
str q0, [x21], #16

.Lxts_enc_done:
adds x22, x22, #0x10
beq .Lxts_enc_ret

sub x6, x21, #0x10
// Penultimate plaintext block produces final ciphertext part-block
// plus remaining part of final plaintext block. Move ciphertext part
// to final position and reuse penultimate ciphertext block buffer to
// construct final plaintext block
.Lxts_enc_steal:
ldrb w0, [x20], #1
ldrb w1, [x21, #-0x10]
strb w0, [x21, #-0x10]
strb w1, [x21], #1

subs x22, x22, #1
bhi .Lxts_enc_steal

// Finally encrypt the penultimate ciphertext block using the
// last tweak
ldr q0, [x6]
eor v0.16b, v0.16b, v11.16b
str q0, [sp, #-16]!
mov x0, sp
mov x1, sp
mov x2, x23
mov x21, x6
mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

bl AES_encrypt

trn1 v11.2d, v11.2d, v13.2d
ldr q0, [sp], #16
eor v0.16b, v0.16b, v11.16b
str q0, [x21]

.Lxts_enc_ret:

movi v0.16b, #0
movi v1.16b, #0
.Lxts_enc_bzero: // wipe key schedule
stp q0, q1, [sp], #32
cmp sp, x19
bne .Lxts_enc_bzero

ldp x19, x20, [sp, #80]
ldp x21, x22, [sp, #96]
ldr x23, [sp, #112]
ldp d8, d9, [sp, #128]
ldp d10, d11, [sp, #144]
ldp d12, d13, [sp, #160]
ldp d14, d15, [sp, #176]
ldp x29, x30, [sp], #192
ret
.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align 5
.Lxts_magic:
.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
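// The tweak update performed with these constants is multiplication by
// x in GF(2^128) under the standard XTS reduction polynomial
// x^128 + x^7 + x^2 + x + 1: the {1, 0x87} pair supplies the
// cross-quadword carry bit and the 0x87 reduction term, while the
// 0x4000000000000000 entries appear to let cmtst test, one doubling
// ahead, the bit that will be shifted out of the doubled value.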

.globl ossl_bsaes_xts_decrypt
.type ossl_bsaes_xts_decrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = length of text in bytes (must be at least 16)
// x3 -> key1 (used to decrypt the XORed ciphertext blocks)
// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
// x5 -> 16-byte initial vector (typically, sector number)
// On exit:
// Output plaintext filled in
// No output registers, usual AAPCS64 register preservation
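//
// The corresponding C-level prototype is roughly (a sketch; see the
// OpenSSL headers for the authoritative declaration):
//
//   void ossl_bsaes_xts_decrypt(const unsigned char *inp,
//                               unsigned char *out, size_t len,
//                               const AES_KEY *key1, const AES_KEY *key2,
//                               const unsigned char iv[16]);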
ossl_bsaes_xts_decrypt:
AARCH64_VALID_CALL_TARGET
// Stack layout:
// sp ->
// nrounds*128-96 bytes: key schedule
// x19 ->
// 16 bytes: frame record
// 4*16 bytes: tweak storage across _bsaes_decrypt8
// 6*8 bytes: storage for 5 callee-saved general-purpose registers
// 8*8 bytes: storage for 8 callee-saved SIMD registers
stp x29, x30, [sp, #-192]!
stp x19, x20, [sp, #80]
stp x21, x22, [sp, #96]
str x23, [sp, #112]
stp d8, d9, [sp, #128]
stp d10, d11, [sp, #144]
stp d12, d13, [sp, #160]
stp d14, d15, [sp, #176]

mov x19, sp
mov x20, x0
mov x21, x1
mov x22, x2
mov x23, x3

// generate initial tweak
sub sp, sp, #16
mov x0, x5 // iv[]
mov x1, sp
mov x2, x4 // key2
bl AES_encrypt
ldr q11, [sp], #16

ldr w1, [x23, #240] // get # of rounds
// allocate the key schedule on the stack
add x17, sp, #96
sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
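// (Worked example, for illustration: AES-128 has 10 rounds, so this
// reserves 10*128 - 96 = 1184 bytes of key schedule below x19.)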

// populate the key schedule
mov x9, x23 // pass key
mov x10, x1 // pass # of rounds
mov sp, x17
bl _bsaes_key_convert
ldr q6, [sp]
str q15, [x17] // save last round key
eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
str q6, [sp]

sub x30, x22, #0x10
tst x22, #0xf // if not multiple of 16
csel x22, x30, x22, ne // subtract another 16 bytes
subs x22, x22, #0x80
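// If the length is not a multiple of 16, one extra full block is held
// back above so that the tail can be handled by ciphertext stealing at
// .Lxts_dec_done; the 0x80 subtraction then counts down in 8-block
// (128-byte) chunks for the main loop.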

blo .Lxts_dec_short
b .Lxts_dec_loop

.align 4
.Lxts_dec_loop:
ldr q8, .Lxts_magic
mov x10, x1 // pass rounds
add x2, x19, #16
ldr q0, [x20], #16
sshr v1.2d, v11.2d, #63
mov x9, sp // pass key schedule
ldr q6, .Lxts_magic+16
add v2.2d, v11.2d, v11.2d
cmtst v3.2d, v11.2d, v6.2d
and v1.16b, v1.16b, v8.16b
ext v1.16b, v1.16b, v1.16b, #8
and v3.16b, v3.16b, v8.16b
ldr q4, [x20], #16
eor v12.16b, v2.16b, v1.16b
eor v1.16b, v4.16b, v12.16b
eor v0.16b, v0.16b, v11.16b
cmtst v2.2d, v12.2d, v6.2d
add v4.2d, v12.2d, v12.2d
add x0, x19, #16
ext v3.16b, v3.16b, v3.16b, #8
and v2.16b, v2.16b, v8.16b
eor v13.16b, v4.16b, v3.16b
ldr q3, [x20], #16
ext v4.16b, v2.16b, v2.16b, #8
eor v2.16b, v3.16b, v13.16b
ldr q3, [x20], #16
add v5.2d, v13.2d, v13.2d
cmtst v7.2d, v13.2d, v6.2d
and v7.16b, v7.16b, v8.16b
ldr q9, [x20], #16
ext v7.16b, v7.16b, v7.16b, #8
ldr q10, [x20], #16
eor v14.16b, v5.16b, v4.16b
ldr q16, [x20], #16
add v4.2d, v14.2d, v14.2d
eor v3.16b, v3.16b, v14.16b
eor v15.16b, v4.16b, v7.16b
add v5.2d, v15.2d, v15.2d
ldr q7, [x20], #16
cmtst v4.2d, v14.2d, v6.2d
and v17.16b, v4.16b, v8.16b
cmtst v18.2d, v15.2d, v6.2d
eor v4.16b, v9.16b, v15.16b
ext v9.16b, v17.16b, v17.16b, #8
eor v9.16b, v5.16b, v9.16b
add v17.2d, v9.2d, v9.2d
and v18.16b, v18.16b, v8.16b
eor v5.16b, v10.16b, v9.16b
str q9, [x2], #16
ext v10.16b, v18.16b, v18.16b, #8
cmtst v9.2d, v9.2d, v6.2d
and v9.16b, v9.16b, v8.16b
eor v10.16b, v17.16b, v10.16b
cmtst v17.2d, v10.2d, v6.2d
eor v6.16b, v16.16b, v10.16b
str q10, [x2], #16
ext v9.16b, v9.16b, v9.16b, #8
add v10.2d, v10.2d, v10.2d
eor v9.16b, v10.16b, v9.16b
str q9, [x2], #16
eor v7.16b, v7.16b, v9.16b
add v9.2d, v9.2d, v9.2d
and v8.16b, v17.16b, v8.16b
ext v8.16b, v8.16b, v8.16b, #8
eor v8.16b, v9.16b, v8.16b
str q8, [x2] // next round tweak
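// Tweaks for the first five of the eight blocks just loaded are live in
// v11-v15 (which _bsaes_decrypt8 preserves); the remaining three, plus
// the tweak for the next iteration, have been spilled to the 4*16-byte
// tweak area at x19+16 because the rest of the SIMD file is clobbered.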

bl _bsaes_decrypt8

eor v6.16b, v6.16b, v13.16b
eor v0.16b, v0.16b, v11.16b
ldr q8, [x0], #16
eor v7.16b, v7.16b, v8.16b
str q0, [x21], #16
eor v0.16b, v1.16b, v12.16b
ldr q1, [x0], #16
eor v1.16b, v3.16b, v1.16b
subs x22, x22, #0x80
eor v2.16b, v2.16b, v15.16b
eor v3.16b, v4.16b, v14.16b
ldr q4, [x0], #16
str q0, [x21], #16
ldr q11, [x0] // next round tweak
eor v0.16b, v5.16b, v4.16b
str q6, [x21], #16
str q3, [x21], #16
str q2, [x21], #16
str q7, [x21], #16
str q1, [x21], #16
str q0, [x21], #16
bpl .Lxts_dec_loop

.Lxts_dec_short:
adds x22, x22, #0x70
bmi .Lxts_dec_done

ldr q8, .Lxts_magic
sshr v1.2d, v11.2d, #63
add v2.2d, v11.2d, v11.2d
ldr q9, .Lxts_magic+16
subs x22, x22, #0x10
ldr q0, [x20], #16
and v1.16b, v1.16b, v8.16b
cmtst v3.2d, v11.2d, v9.2d
ext v1.16b, v1.16b, v1.16b, #8
and v3.16b, v3.16b, v8.16b
eor v12.16b, v2.16b, v1.16b
ext v1.16b, v3.16b, v3.16b, #8
add v2.2d, v12.2d, v12.2d
cmtst v3.2d, v12.2d, v9.2d
eor v13.16b, v2.16b, v1.16b
and v22.16b, v3.16b, v8.16b
bmi .Lxts_dec_1

ext v2.16b, v22.16b, v22.16b, #8
add v3.2d, v13.2d, v13.2d
ldr q1, [x20], #16
cmtst v4.2d, v13.2d, v9.2d
subs x22, x22, #0x10
eor v14.16b, v3.16b, v2.16b
and v23.16b, v4.16b, v8.16b
bmi .Lxts_dec_2

ext v3.16b, v23.16b, v23.16b, #8
add v4.2d, v14.2d, v14.2d
ldr q2, [x20], #16
cmtst v5.2d, v14.2d, v9.2d
eor v0.16b, v0.16b, v11.16b
subs x22, x22, #0x10
eor v15.16b, v4.16b, v3.16b
and v24.16b, v5.16b, v8.16b
bmi .Lxts_dec_3

ext v4.16b, v24.16b, v24.16b, #8
add v5.2d, v15.2d, v15.2d
ldr q3, [x20], #16
cmtst v6.2d, v15.2d, v9.2d
eor v1.16b, v1.16b, v12.16b
subs x22, x22, #0x10
eor v16.16b, v5.16b, v4.16b
and v25.16b, v6.16b, v8.16b
bmi .Lxts_dec_4

ext v5.16b, v25.16b, v25.16b, #8
add v6.2d, v16.2d, v16.2d
add x0, x19, #16
cmtst v7.2d, v16.2d, v9.2d
ldr q4, [x20], #16
eor v2.16b, v2.16b, v13.16b
str q16, [x0], #16
subs x22, x22, #0x10
eor v17.16b, v6.16b, v5.16b
and v26.16b, v7.16b, v8.16b
bmi .Lxts_dec_5

ext v7.16b, v26.16b, v26.16b, #8
add v18.2d, v17.2d, v17.2d
ldr q5, [x20], #16
eor v3.16b, v3.16b, v14.16b
str q17, [x0], #16
subs x22, x22, #0x10
eor v18.16b, v18.16b, v7.16b
bmi .Lxts_dec_6

ldr q6, [x20], #16
eor v4.16b, v4.16b, v15.16b
eor v5.16b, v5.16b, v16.16b
str q18, [x0] // next round tweak
mov x9, sp // pass key schedule
mov x10, x1
add x0, x19, #16
sub x22, x22, #0x10
eor v6.16b, v6.16b, v17.16b

bl _bsaes_decrypt8

ldr q16, [x0], #16
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
ldr q17, [x0], #16
eor v6.16b, v6.16b, v13.16b
eor v4.16b, v4.16b, v14.16b
eor v2.16b, v2.16b, v15.16b
ldr q11, [x0] // next round tweak
str q0, [x21], #16
str q1, [x21], #16
eor v0.16b, v7.16b, v16.16b
eor v1.16b, v3.16b, v17.16b
str q6, [x21], #16
str q4, [x21], #16
str q2, [x21], #16
str q0, [x21], #16
str q1, [x21], #16
b .Lxts_dec_done

.align 4
.Lxts_dec_6:
eor v4.16b, v4.16b, v15.16b
eor v5.16b, v5.16b, v16.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_decrypt8

ldr q16, [x0], #16
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
eor v6.16b, v6.16b, v13.16b
eor v4.16b, v4.16b, v14.16b
ldr q11, [x0] // next round tweak
eor v2.16b, v2.16b, v15.16b
str q0, [x21], #16
str q1, [x21], #16
eor v0.16b, v7.16b, v16.16b
str q6, [x21], #16
str q4, [x21], #16
str q2, [x21], #16
str q0, [x21], #16
b .Lxts_dec_done

.align 4
.Lxts_dec_5:
eor v3.16b, v3.16b, v14.16b
eor v4.16b, v4.16b, v15.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_decrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
ldr q11, [x0] // next round tweak
eor v6.16b, v6.16b, v13.16b
eor v4.16b, v4.16b, v14.16b
eor v2.16b, v2.16b, v15.16b
str q0, [x21], #16
str q1, [x21], #16
str q6, [x21], #16
str q4, [x21], #16
str q2, [x21], #16
b .Lxts_dec_done

.align 4
.Lxts_dec_4:
eor v2.16b, v2.16b, v13.16b
eor v3.16b, v3.16b, v14.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_decrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
eor v6.16b, v6.16b, v13.16b
eor v4.16b, v4.16b, v14.16b
mov v11.16b, v15.16b // next round tweak
str q0, [x21], #16
str q1, [x21], #16
str q6, [x21], #16
str q4, [x21], #16
b .Lxts_dec_done

.align 4
.Lxts_dec_3:
eor v1.16b, v1.16b, v12.16b
eor v2.16b, v2.16b, v13.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_decrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
eor v6.16b, v6.16b, v13.16b
mov v11.16b, v14.16b // next round tweak
str q0, [x21], #16
str q1, [x21], #16
str q6, [x21], #16
b .Lxts_dec_done

.align 4
.Lxts_dec_2:
eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
mov x9, sp // pass key schedule
mov x10, x1 // pass rounds
add x0, x19, #16

bl _bsaes_decrypt8

eor v0.16b, v0.16b, v11.16b
eor v1.16b, v1.16b, v12.16b
mov v11.16b, v13.16b // next round tweak
str q0, [x21], #16
str q1, [x21], #16
b .Lxts_dec_done

.align 4
.Lxts_dec_1:
eor v0.16b, v0.16b, v11.16b
sub x0, sp, #16
sub x1, sp, #16
mov x2, x23
mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
mov v14.d[0], v12.d[1]
str q0, [sp, #-16]!

bl AES_decrypt

ldr q0, [sp], #16
trn1 v13.2d, v11.2d, v13.2d
trn1 v11.2d, v12.2d, v14.2d // next round tweak
eor v0.16b, v0.16b, v13.16b
str q0, [x21], #16

.Lxts_dec_done:
adds x22, x22, #0x10
beq .Lxts_dec_ret

// calculate one round of extra tweak for the stolen ciphertext
ldr q8, .Lxts_magic
sshr v6.2d, v11.2d, #63
and v6.16b, v6.16b, v8.16b
add v12.2d, v11.2d, v11.2d
ext v6.16b, v6.16b, v6.16b, #8
eor v12.16b, v12.16b, v6.16b

// perform the final decryption with the last tweak value
ldr q0, [x20], #16
eor v0.16b, v0.16b, v12.16b
str q0, [sp, #-16]!
mov x0, sp
mov x1, sp
mov x2, x23
mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
mov v14.d[0], v12.d[1]

bl AES_decrypt

trn1 v12.2d, v12.2d, v14.2d
trn1 v11.2d, v11.2d, v13.2d
ldr q0, [sp], #16
eor v0.16b, v0.16b, v12.16b
str q0, [x21]

mov x6, x21
// Penultimate ciphertext block produces final plaintext part-block
// plus remaining part of final ciphertext block. Move plaintext part
// to final position and reuse penultimate plaintext block buffer to
// construct final ciphertext block
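// A hedged C sketch of the decrypt-side steal below (illustrative names,
// mirroring the encrypt path: `in` is x20, `last` is x6/x21, `tail` is
// the residual byte count in x22):
//
//	for (size_t i = 0; i < tail; i++) {
//		last[i + 16] = last[i];	/* move plaintext part to final position */
//		last[i] = in[i];	/* splice in remaining ciphertext bytes  */
//	}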
.Lxts_dec_steal:
ldrb w1, [x21]
ldrb w0, [x20], #1
strb w1, [x21, #0x10]
strb w0, [x21], #1

subs x22, x22, #1
bhi .Lxts_dec_steal

// Finally decrypt the penultimate plaintext block using the
// penultimate tweak
ldr q0, [x6]
eor v0.16b, v0.16b, v11.16b
str q0, [sp, #-16]!
mov x0, sp
mov x1, sp
mov x2, x23
mov x21, x6

bl AES_decrypt

trn1 v11.2d, v11.2d, v13.2d
ldr q0, [sp], #16
eor v0.16b, v0.16b, v11.16b
str q0, [x21]

.Lxts_dec_ret:

movi v0.16b, #0
movi v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
stp q0, q1, [sp], #32
cmp sp, x19
bne .Lxts_dec_bzero

ldp x19, x20, [sp, #80]
ldp x21, x22, [sp, #96]
ldr x23, [sp, #112]
ldp d8, d9, [sp, #128]
ldp d10, d11, [sp, #144]
ldp d12, d13, [sp, #160]
ldp d14, d15, [sp, #176]
ldp x29, x30, [sp], #192
ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt