GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm64/chacha-neon-core.S
/*
 * ChaCha/HChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align	6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l	x10, ROT8
	ld1	{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #12
	sri	v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #7
	sri	v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext	v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext	v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext	v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #12
	sri	v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add	v2.4s, v2.4s, v3.4s
	eor	v4.16b, v1.16b, v2.16b
	shl	v1.4s, v4.4s, #7
	sri	v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext	v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext	v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext	v3.16b, v3.16b, v3.16b, #4

	subs	w3, w3, #2
	b.ne	.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)
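
For reference, the scalar logic that chacha_permute vectorizes can be sketched in C as follows. This is an illustration only (the helper names rotl32, quarter_round and chacha_permute_c are not part of this file): each double round applies the quarter round to the four columns and then to the four diagonals of the 4x4 state, which is why the NEON code above uses ext between the two half-rounds to rotate rows 1-3 into diagonal position. The rotations by 16 and 8 are done with rev32 and a tbl byte shuffle (driven by the ROT8 table at the end of this file) rather than the shl/sri pair used for 12 and 7.

#include <stdint.h>

/* Illustrative helpers; not part of the kernel sources. */
static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

static void chacha_permute_c(uint32_t x[16], int nrounds)
{
	for (int i = 0; i < nrounds; i += 2) {
		/* column round: one quarter round per column */
		quarter_round(x, 0, 4, 8, 12);
		quarter_round(x, 1, 5, 9, 13);
		quarter_round(x, 2, 6, 10, 14);
		quarter_round(x, 3, 7, 11, 15);
		/* diagonal round: rows 1-3 viewed rotated by 1, 2, 3 words */
		quarter_round(x, 0, 5, 10, 15);
		quarter_round(x, 1, 6, 11, 12);
		quarter_round(x, 2, 7, 8, 13);
		quarter_round(x, 3, 4, 9, 14);
	}
}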

SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	// x0..3 = s0..3
	ld1	{v0.4s-v3.4s}, [x0]
	ld1	{v8.4s-v11.4s}, [x0]

	bl	chacha_permute

	ld1	{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add	v0.4s, v0.4s, v8.4s
	eor	v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add	v1.4s, v1.4s, v9.4s
	eor	v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add	v2.4s, v2.4s, v10.4s
	eor	v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add	v3.4s, v3.4s, v11.4s
	eor	v3.16b, v3.16b, v7.16b

	st1	{v0.16b-v3.16b}, [x1]

	ldp	x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)
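
In C terms, chacha_block_xor_neon produces one 64-byte keystream block and XORs it into the data: the permuted state is added back to the saved input state (the feedforward kept in v8-v11 above) and the result is XORed with the input block. A rough equivalent, reusing chacha_permute_c from the sketch above and assuming a little-endian host as on arm64 LE, might look like this (illustrative only, not the kernel's generic C code):

#include <stdint.h>
#include <string.h>

static void chacha_block_xor_c(const uint32_t state[16], uint8_t out[64],
			       const uint8_t in[64], int nrounds)
{
	uint32_t x[16];

	memcpy(x, state, sizeof(x));
	chacha_permute_c(x, nrounds);

	for (int i = 0; i < 16; i++) {
		uint32_t ks, w;

		ks = x[i] + state[i];		/* feedforward: o = i ^ (x + s) */
		memcpy(&w, in + 4 * i, sizeof(w));
		w ^= ks;
		memcpy(out + 4 * i, &w, sizeof(w));
	}
}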

SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{v0.4s-v3.4s}, [x0]

	mov	w3, w2
	bl	chacha_permute

	st1	{v0.4s}, [x1], #16
	st1	{v3.4s}, [x1]

	ldp	x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)
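
hchacha_block_neon differs from the block function in that it outputs raw permuted state words rather than keystream: it stores row 0 (words 0-3) and row 3 (words 12-15) of the permuted matrix and skips the feedforward addition, which is the HChaCha construction used for XChaCha-style key derivation. A C sketch under the same assumptions as above (illustrative names only):

static void hchacha_block_c(const uint32_t state[16], uint32_t out[8],
			    int nrounds)
{
	uint32_t x[16];

	memcpy(x, state, sizeof(x));
	chacha_permute_c(x, nrounds);

	memcpy(&out[0], &x[0], 4 * sizeof(uint32_t));	/* words 0..3 */
	memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));	/* words 12..15 */
}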

	a0	.req	w12
	a1	.req	w13
	a2	.req	w14
	a3	.req	w15
	a4	.req	w16
	a5	.req	w17
	a6	.req	w19
	a7	.req	w20
	a8	.req	w21
	a9	.req	w22
	a10	.req	w23
	a11	.req	w24
	a12	.req	w25
	a13	.req	w26
	a14	.req	w27
	a15	.req	w28

	.align	6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l	x10, .Lpermute
	and	x5, x4, #63
	add	x10, x10, x5

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, and hence requires no word shuffling. For the final XOR
	// step we transpose the matrix by interleaving 32- and then 64-bit
	// words, which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
	adr_l	x9, CTRINC		// ... and ROT8
	ld1	{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add	x8, x0, #16
	ld4r	{ v0.4s- v3.4s}, [x0]
	ld4r	{ v4.4s- v7.4s}, [x8], #16
	ld4r	{ v8.4s-v11.4s}, [x8], #16
	ld4r	{v12.4s-v15.4s}, [x8]

	mov	a0, v0.s[0]
	mov	a1, v1.s[0]
	mov	a2, v2.s[0]
	mov	a3, v3.s[0]
	mov	a4, v4.s[0]
	mov	a5, v5.s[0]
	mov	a6, v6.s[0]
	mov	a7, v7.s[0]
	mov	a8, v8.s[0]
	mov	a9, v9.s[0]
	mov	a10, v10.s[0]
	mov	a11, v11.s[0]
	mov	a12, v12.s[0]
	mov	a13, v13.s[0]
	mov	a14, v14.s[0]
	mov	a15, v15.s[0]

	// x12 += counter values 1-4
	add	v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add	v0.4s, v0.4s, v4.4s
	add	a0, a0, a4
	add	v1.4s, v1.4s, v5.4s
	add	a1, a1, a5
	add	v2.4s, v2.4s, v6.4s
	add	a2, a2, a6
	add	v3.4s, v3.4s, v7.4s
	add	a3, a3, a7

	eor	v12.16b, v12.16b, v0.16b
	eor	a12, a12, a0
	eor	v13.16b, v13.16b, v1.16b
	eor	a13, a13, a1
	eor	v14.16b, v14.16b, v2.16b
	eor	a14, a14, a2
	eor	v15.16b, v15.16b, v3.16b
	eor	a15, a15, a3

	rev32	v12.8h, v12.8h
	ror	a12, a12, #16
	rev32	v13.8h, v13.8h
	ror	a13, a13, #16
	rev32	v14.8h, v14.8h
	ror	a14, a14, #16
	rev32	v15.8h, v15.8h
	ror	a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add	v8.4s, v8.4s, v12.4s
	add	a8, a8, a12
	add	v9.4s, v9.4s, v13.4s
	add	a9, a9, a13
	add	v10.4s, v10.4s, v14.4s
	add	a10, a10, a14
	add	v11.4s, v11.4s, v15.4s
	add	a11, a11, a15

	eor	v16.16b, v4.16b, v8.16b
	eor	a4, a4, a8
	eor	v17.16b, v5.16b, v9.16b
	eor	a5, a5, a9
	eor	v18.16b, v6.16b, v10.16b
	eor	a6, a6, a10
	eor	v19.16b, v7.16b, v11.16b
	eor	a7, a7, a11

	shl	v4.4s, v16.4s, #12
	shl	v5.4s, v17.4s, #12
	shl	v6.4s, v18.4s, #12
	shl	v7.4s, v19.4s, #12

	sri	v4.4s, v16.4s, #20
	ror	a4, a4, #20
	sri	v5.4s, v17.4s, #20
	ror	a5, a5, #20
	sri	v6.4s, v18.4s, #20
	ror	a6, a6, #20
	sri	v7.4s, v19.4s, #20
	ror	a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add	v0.4s, v0.4s, v4.4s
	add	a0, a0, a4
	add	v1.4s, v1.4s, v5.4s
	add	a1, a1, a5
	add	v2.4s, v2.4s, v6.4s
	add	a2, a2, a6
	add	v3.4s, v3.4s, v7.4s
	add	a3, a3, a7

	eor	v12.16b, v12.16b, v0.16b
	eor	a12, a12, a0
	eor	v13.16b, v13.16b, v1.16b
	eor	a13, a13, a1
	eor	v14.16b, v14.16b, v2.16b
	eor	a14, a14, a2
	eor	v15.16b, v15.16b, v3.16b
	eor	a15, a15, a3

	tbl	v12.16b, {v12.16b}, v31.16b
	ror	a12, a12, #24
	tbl	v13.16b, {v13.16b}, v31.16b
	ror	a13, a13, #24
	tbl	v14.16b, {v14.16b}, v31.16b
	ror	a14, a14, #24
	tbl	v15.16b, {v15.16b}, v31.16b
	ror	a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add	v8.4s, v8.4s, v12.4s
	add	a8, a8, a12
	add	v9.4s, v9.4s, v13.4s
	add	a9, a9, a13
	add	v10.4s, v10.4s, v14.4s
	add	a10, a10, a14
	add	v11.4s, v11.4s, v15.4s
	add	a11, a11, a15

	eor	v16.16b, v4.16b, v8.16b
	eor	a4, a4, a8
	eor	v17.16b, v5.16b, v9.16b
	eor	a5, a5, a9
	eor	v18.16b, v6.16b, v10.16b
	eor	a6, a6, a10
	eor	v19.16b, v7.16b, v11.16b
	eor	a7, a7, a11

	shl	v4.4s, v16.4s, #7
	shl	v5.4s, v17.4s, #7
	shl	v6.4s, v18.4s, #7
	shl	v7.4s, v19.4s, #7

	sri	v4.4s, v16.4s, #25
	ror	a4, a4, #25
	sri	v5.4s, v17.4s, #25
	ror	a5, a5, #25
	sri	v6.4s, v18.4s, #25
	ror	a6, a6, #25
	sri	v7.4s, v19.4s, #25
	ror	a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add	v0.4s, v0.4s, v5.4s
	add	a0, a0, a5
	add	v1.4s, v1.4s, v6.4s
	add	a1, a1, a6
	add	v2.4s, v2.4s, v7.4s
	add	a2, a2, a7
	add	v3.4s, v3.4s, v4.4s
	add	a3, a3, a4

	eor	v15.16b, v15.16b, v0.16b
	eor	a15, a15, a0
	eor	v12.16b, v12.16b, v1.16b
	eor	a12, a12, a1
	eor	v13.16b, v13.16b, v2.16b
	eor	a13, a13, a2
	eor	v14.16b, v14.16b, v3.16b
	eor	a14, a14, a3

	rev32	v15.8h, v15.8h
	ror	a15, a15, #16
	rev32	v12.8h, v12.8h
	ror	a12, a12, #16
	rev32	v13.8h, v13.8h
	ror	a13, a13, #16
	rev32	v14.8h, v14.8h
	ror	a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add	v10.4s, v10.4s, v15.4s
	add	a10, a10, a15
	add	v11.4s, v11.4s, v12.4s
	add	a11, a11, a12
	add	v8.4s, v8.4s, v13.4s
	add	a8, a8, a13
	add	v9.4s, v9.4s, v14.4s
	add	a9, a9, a14

	eor	v16.16b, v5.16b, v10.16b
	eor	a5, a5, a10
	eor	v17.16b, v6.16b, v11.16b
	eor	a6, a6, a11
	eor	v18.16b, v7.16b, v8.16b
	eor	a7, a7, a8
	eor	v19.16b, v4.16b, v9.16b
	eor	a4, a4, a9

	shl	v5.4s, v16.4s, #12
	shl	v6.4s, v17.4s, #12
	shl	v7.4s, v18.4s, #12
	shl	v4.4s, v19.4s, #12

	sri	v5.4s, v16.4s, #20
	ror	a5, a5, #20
	sri	v6.4s, v17.4s, #20
	ror	a6, a6, #20
	sri	v7.4s, v18.4s, #20
	ror	a7, a7, #20
	sri	v4.4s, v19.4s, #20
	ror	a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add	v0.4s, v0.4s, v5.4s
	add	a0, a0, a5
	add	v1.4s, v1.4s, v6.4s
	add	a1, a1, a6
	add	v2.4s, v2.4s, v7.4s
	add	a2, a2, a7
	add	v3.4s, v3.4s, v4.4s
	add	a3, a3, a4

	eor	v15.16b, v15.16b, v0.16b
	eor	a15, a15, a0
	eor	v12.16b, v12.16b, v1.16b
	eor	a12, a12, a1
	eor	v13.16b, v13.16b, v2.16b
	eor	a13, a13, a2
	eor	v14.16b, v14.16b, v3.16b
	eor	a14, a14, a3

	tbl	v15.16b, {v15.16b}, v31.16b
	ror	a15, a15, #24
	tbl	v12.16b, {v12.16b}, v31.16b
	ror	a12, a12, #24
	tbl	v13.16b, {v13.16b}, v31.16b
	ror	a13, a13, #24
	tbl	v14.16b, {v14.16b}, v31.16b
	ror	a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add	v10.4s, v10.4s, v15.4s
	add	a10, a10, a15
	add	v11.4s, v11.4s, v12.4s
	add	a11, a11, a12
	add	v8.4s, v8.4s, v13.4s
	add	a8, a8, a13
	add	v9.4s, v9.4s, v14.4s
	add	a9, a9, a14

	eor	v16.16b, v5.16b, v10.16b
	eor	a5, a5, a10
	eor	v17.16b, v6.16b, v11.16b
	eor	a6, a6, a11
	eor	v18.16b, v7.16b, v8.16b
	eor	a7, a7, a8
	eor	v19.16b, v4.16b, v9.16b
	eor	a4, a4, a9

	shl	v5.4s, v16.4s, #7
	shl	v6.4s, v17.4s, #7
	shl	v7.4s, v18.4s, #7
	shl	v4.4s, v19.4s, #7

	sri	v5.4s, v16.4s, #25
	ror	a5, a5, #25
	sri	v6.4s, v17.4s, #25
	ror	a6, a6, #25
	sri	v7.4s, v18.4s, #25
	ror	a7, a7, #25
	sri	v4.4s, v19.4s, #25
	ror	a4, a4, #25

	subs	w3, w3, #2
	b.ne	.Ldoubleround4

	ld4r	{v16.4s-v19.4s}, [x0], #16
	ld4r	{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add	v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add	v0.4s, v0.4s, v16.4s
	mov	w6, v16.s[0]
	mov	w7, v17.s[0]
	add	v1.4s, v1.4s, v17.4s
	mov	w8, v18.s[0]
	mov	w9, v19.s[0]
	add	v2.4s, v2.4s, v18.4s
	add	a0, a0, w6
	add	a1, a1, w7
	add	v3.4s, v3.4s, v19.4s
	add	a2, a2, w8
	add	a3, a3, w9
CPU_BE(	rev	a0, a0	)
CPU_BE(	rev	a1, a1	)
CPU_BE(	rev	a2, a2	)
CPU_BE(	rev	a3, a3	)

	ld4r	{v24.4s-v27.4s}, [x0], #16
	ld4r	{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add	v4.4s, v4.4s, v20.4s
	mov	w6, v20.s[0]
	mov	w7, v21.s[0]
	add	v5.4s, v5.4s, v21.4s
	mov	w8, v22.s[0]
	mov	w9, v23.s[0]
	add	v6.4s, v6.4s, v22.4s
	add	a4, a4, w6
	add	a5, a5, w7
	add	v7.4s, v7.4s, v23.4s
	add	a6, a6, w8
	add	a7, a7, w9
CPU_BE(	rev	a4, a4	)
CPU_BE(	rev	a5, a5	)
CPU_BE(	rev	a6, a6	)
CPU_BE(	rev	a7, a7	)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add	v8.4s, v8.4s, v24.4s
	mov	w6, v24.s[0]
	mov	w7, v25.s[0]
	add	v9.4s, v9.4s, v25.4s
	mov	w8, v26.s[0]
	mov	w9, v27.s[0]
	add	v10.4s, v10.4s, v26.4s
	add	a8, a8, w6
	add	a9, a9, w7
	add	v11.4s, v11.4s, v27.4s
	add	a10, a10, w8
	add	a11, a11, w9
CPU_BE(	rev	a8, a8	)
CPU_BE(	rev	a9, a9	)
CPU_BE(	rev	a10, a10	)
CPU_BE(	rev	a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add	v12.4s, v12.4s, v28.4s
	mov	w6, v28.s[0]
	mov	w7, v29.s[0]
	add	v13.4s, v13.4s, v29.4s
	mov	w8, v30.s[0]
	mov	w9, v31.s[0]
	add	v14.4s, v14.4s, v30.4s
	add	a12, a12, w6
	add	a13, a13, w7
	add	v15.4s, v15.4s, v31.4s
	add	a14, a14, w8
	add	a15, a15, w9
CPU_BE(	rev	a12, a12	)
CPU_BE(	rev	a13, a13	)
CPU_BE(	rev	a14, a14	)
CPU_BE(	rev	a15, a15	)

	// interleave 32-bit words in state n, n+1
	ldp	w6, w7, [x2], #64
	zip1	v16.4s, v0.4s, v1.4s
	ldp	w8, w9, [x2, #-56]
	eor	a0, a0, w6
	zip2	v17.4s, v0.4s, v1.4s
	eor	a1, a1, w7
	zip1	v18.4s, v2.4s, v3.4s
	eor	a2, a2, w8
	zip2	v19.4s, v2.4s, v3.4s
	eor	a3, a3, w9
	ldp	w6, w7, [x2, #-48]
	zip1	v20.4s, v4.4s, v5.4s
	ldp	w8, w9, [x2, #-40]
	eor	a4, a4, w6
	zip2	v21.4s, v4.4s, v5.4s
	eor	a5, a5, w7
	zip1	v22.4s, v6.4s, v7.4s
	eor	a6, a6, w8
	zip2	v23.4s, v6.4s, v7.4s
	eor	a7, a7, w9
	ldp	w6, w7, [x2, #-32]
	zip1	v24.4s, v8.4s, v9.4s
	ldp	w8, w9, [x2, #-24]
	eor	a8, a8, w6
	zip2	v25.4s, v8.4s, v9.4s
	eor	a9, a9, w7
	zip1	v26.4s, v10.4s, v11.4s
	eor	a10, a10, w8
	zip2	v27.4s, v10.4s, v11.4s
	eor	a11, a11, w9
	ldp	w6, w7, [x2, #-16]
	zip1	v28.4s, v12.4s, v13.4s
	ldp	w8, w9, [x2, #-8]
	eor	a12, a12, w6
	zip2	v29.4s, v12.4s, v13.4s
	eor	a13, a13, w7
	zip1	v30.4s, v14.4s, v15.4s
	eor	a14, a14, w8
	zip2	v31.4s, v14.4s, v15.4s
	eor	a15, a15, w9

	add	x3, x2, x4
	sub	x3, x3, #128		// start of last block

	subs	x5, x4, #128
	csel	x2, x2, x3, ge

	// interleave 64-bit words in state n, n+2
	zip1	v0.2d, v16.2d, v18.2d
	zip2	v4.2d, v16.2d, v18.2d
	stp	a0, a1, [x1], #64
	zip1	v8.2d, v17.2d, v19.2d
	zip2	v12.2d, v17.2d, v19.2d
	stp	a2, a3, [x1, #-56]

	subs	x6, x4, #192
	ld1	{v16.16b-v19.16b}, [x2], #64
	csel	x2, x2, x3, ge

	zip1	v1.2d, v20.2d, v22.2d
	zip2	v5.2d, v20.2d, v22.2d
	stp	a4, a5, [x1, #-48]
	zip1	v9.2d, v21.2d, v23.2d
	zip2	v13.2d, v21.2d, v23.2d
	stp	a6, a7, [x1, #-40]

	subs	x7, x4, #256
	ld1	{v20.16b-v23.16b}, [x2], #64
	csel	x2, x2, x3, ge

	zip1	v2.2d, v24.2d, v26.2d
	zip2	v6.2d, v24.2d, v26.2d
	stp	a8, a9, [x1, #-32]
	zip1	v10.2d, v25.2d, v27.2d
	zip2	v14.2d, v25.2d, v27.2d
	stp	a10, a11, [x1, #-24]

	subs	x8, x4, #320
	ld1	{v24.16b-v27.16b}, [x2], #64
	csel	x2, x2, x3, ge

	zip1	v3.2d, v28.2d, v30.2d
	zip2	v7.2d, v28.2d, v30.2d
	stp	a12, a13, [x1, #-16]
	zip1	v11.2d, v29.2d, v31.2d
	zip2	v15.2d, v29.2d, v31.2d
	stp	a14, a15, [x1, #-8]

	tbnz	x5, #63, .Lt128
	ld1	{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b

	tbnz	x6, #63, .Lt192

	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b

	st1	{v16.16b-v19.16b}, [x1], #64
	tbnz	x7, #63, .Lt256

	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b

	st1	{v20.16b-v23.16b}, [x1], #64
	tbnz	x8, #63, .Lt320

	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st1	{v24.16b-v27.16b}, [x1], #64
	st1	{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

	// fewer than 192 bytes of in/output
.Lt192:	cbz	x5, 1f			// exactly 128 bytes?
	ld1	{v28.16b-v31.16b}, [x10]
	add	x5, x5, x1
	tbl	v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl	v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl	v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl	v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor	v20.16b, v20.16b, v28.16b
	eor	v21.16b, v21.16b, v29.16b
	eor	v22.16b, v22.16b, v30.16b
	eor	v23.16b, v23.16b, v31.16b
	st1	{v20.16b-v23.16b}, [x5]	// overlapping stores
1:	st1	{v16.16b-v19.16b}, [x1]
	b	.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1	{v28.16b-v31.16b}, [x10]
	add	x5, x5, x1
	sub	x1, x1, #64
	tbl	v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl	v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl	v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl	v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1	{v16.16b-v19.16b}, [x1]	// reload first output block
	b	0b

	// fewer than 256 bytes of in/output
.Lt256:	cbz	x6, 2f			// exactly 192 bytes?
	ld1	{v4.16b-v7.16b}, [x10]
	add	x6, x6, x1
	tbl	v0.16b, {v8.16b-v11.16b}, v4.16b
	tbl	v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl	v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl	v3.16b, {v8.16b-v11.16b}, v7.16b

	eor	v28.16b, v28.16b, v0.16b
	eor	v29.16b, v29.16b, v1.16b
	eor	v30.16b, v30.16b, v2.16b
	eor	v31.16b, v31.16b, v3.16b
	st1	{v28.16b-v31.16b}, [x6]	// overlapping stores
2:	st1	{v20.16b-v23.16b}, [x1]
	b	.Lout

	// fewer than 320 bytes of in/output
.Lt320:	cbz	x7, 3f			// exactly 256 bytes?
	ld1	{v4.16b-v7.16b}, [x10]
	add	x7, x7, x1
	tbl	v0.16b, {v12.16b-v15.16b}, v4.16b
	tbl	v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl	v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl	v3.16b, {v12.16b-v15.16b}, v7.16b

	eor	v28.16b, v28.16b, v0.16b
	eor	v29.16b, v29.16b, v1.16b
	eor	v30.16b, v30.16b, v2.16b
	eor	v31.16b, v31.16b, v3.16b
	st1	{v28.16b-v31.16b}, [x7]	// overlapping stores
3:	st1	{v24.16b-v27.16b}, [x1]
	b	.Lout
SYM_FUNC_END(chacha_4block_xor_neon)
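
As the comment block at the top of chacha_4block_xor_neon explains, each call produces five consecutive keystream blocks: four in NEON registers (whose counter lanes are incremented by 1-4 via CTRINC) and a fifth in scalar registers using the unincremented counter, so a full call covers 320 bytes, with shorter byte counts handled through the .Lpermute table and the overlapping stores at .Lt128/.Lt192/.Lt256/.Lt320. The sketch below shows how a caller might drive it; it is a hypothetical wrapper, not the kernel's actual glue code, and the C prototypes are merely inferred from the register comments in this file (x0 state, x1 out, x2 in, w3 nrounds, x4 byte count). It also assumes the 4+1 routine is only handed more than one block of data, keeping anything that fits in a single block on the one-block routine via a bounce buffer, since both assembly routines read and write whole 64-byte blocks.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Prototypes inferred from the register comments above; illustrative only. */
void chacha_block_xor_neon(uint32_t *state, uint8_t *dst, const uint8_t *src,
			   int nrounds);
void chacha_4block_xor_neon(uint32_t *state, uint8_t *dst, const uint8_t *src,
			    int nrounds, int bytes);

/* Hypothetical streaming wrapper: XOR 'len' bytes of keystream into the data,
 * advancing the block counter in state[12] by the blocks consumed. */
static void chacha_stream_xor_sketch(uint32_t state[16], uint8_t *dst,
				     const uint8_t *src, size_t len,
				     int nrounds)
{
	while (len > 64) {
		int l = len > 320 ? 320 : (int)len;

		chacha_4block_xor_neon(state, dst, src, nrounds, l);
		state[12] += (uint32_t)((l + 63) / 64);	/* round up to blocks */
		src += l;
		dst += l;
		len -= l;
	}
	if (len) {
		uint8_t buf[64] = { 0 };	/* bounce buffer for a partial block */

		memcpy(buf, src, len);
		chacha_block_xor_neon(state, buf, buf, nrounds);
		memcpy(dst, buf, len);
		state[12]++;
	}
}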

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

CTRINC:	.word	1, 2, 3, 4
ROT8:	.word	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
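
The two .rodata tables back the tricks used above. ROT8 is a tbl index vector that rotates each 32-bit lane left by 8 bits by permuting bytes (the rotate by 16 uses rev32 and the rotates by 12 and 7 use shl/sri instead). .Lpermute holds the byte values -64..63, so loading 64 bytes at .Lpermute + (byte count % 64) appears to give a tbl mask whose out-of-range indices produce zero, shifting the final partial block's keystream into position for the 64-byte overlapping stores in the tail paths. The small C program below is an illustration only (nothing in it is kernel code); it checks that the byte indices encoded in ROT8's first word, {3, 0, 1, 2} when 0x02010003 is viewed little-endian, do implement rotl32 by 8:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Emulate "tbl" on one 32-bit lane using the indices encoded by ROT8. */
static uint32_t rotl8_via_tbl(uint32_t v)
{
	static const uint8_t idx[4] = { 3, 0, 1, 2 };	/* bytes of 0x02010003 */
	uint8_t in[4], out[4];
	int i;

	memcpy(in, &v, sizeof(in));		/* little-endian lane bytes */
	for (i = 0; i < 4; i++)
		out[i] = in[idx[i]];
	memcpy(&v, out, sizeof(v));
	return v;
}

int main(void)
{
	uint32_t v = 0x12345678;

	/* Both lines print 34567812. */
	printf("%08x\n", rotl8_via_tbl(v));
	printf("%08x\n", (v << 8) | (v >> 24));
	return 0;
}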