Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/chacha-armv8-sve.S
39536 views
1
/* Do not modify. This file is auto-generated from chacha-armv8-sve.pl. */
2
// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
3
//
4
// Licensed under the Apache License 2.0 (the "License"). You may not use
5
// this file except in compliance with the License. You can obtain a copy
6
// in the file LICENSE in the source distribution or at
7
// https://www.openssl.org/source/license.html
8
//
9
//
10
// ChaCha20 for ARMv8 via SVE
11
//
12
// $output is the last argument if it looks like a file (it has an extension)
13
// $flavour is the first argument if it doesn't look like a file
14
#include "arm_arch.h"
15
16
.arch armv8-a
17
18
19
.hidden OPENSSL_armcap_P
20
21
.text
22
23
// ---------------------------------------------------------------------
// _chacha_sve_consts: read-only constant table used by
// ChaCha20_ctr32_sve.  It lives in .rodata (not .text) and is reached
// from the code with adrp/add :lo12: pairs.
//
// NOTE(review): the bare integers that appear on their own lines in
// this block look like line-number residue from a web code viewer and
// are not assembler statements -- confirm and strip before assembling.
// ---------------------------------------------------------------------
.section .rodata
24
// 2^5 = 32-byte alignment for the start of the table.
.align 5
25
.type _chacha_sve_consts,%object
26
_chacha_sve_consts:
27
// Two 64-bit words holding the four 32-bit ChaCha constant words.
// The function loads them with a single `ldp x23,x24` and then splits
// each register into its low half (wN) and high half (lsr #32), so the
// table is consumed as 64-bit values rather than as a byte string.
// Read as little-endian bytes the values spell "expand 32-byte k".
.Lchacha20_consts:
28
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
29
// On the path taken when ARMV8_SVE2 is not set, the first two words are
// loaded with `ldp w9,w10` and expanded by `index z31.s,w9,w10`
// (base 0x02010003, step 0x04040404), so 32-bit lane i of z31 holds
// 0x02010003 + i*0x04040404.
// NOTE(review): presumably a per-lane byte-shuffle index for an 8-bit
// rotate (per the label name); the consumer of z31 and any use of the
// second pair of words are outside this view -- confirm.
.Lrot8:
30
.word 0x02010003,0x04040404,0x02010003,0x04040404
31
.size _chacha_sve_consts,.-_chacha_sve_consts
32
33
// Return to the section that was active before `.section .rodata`.
.previous
34
35
.globl ChaCha20_ctr32_sve
36
.type ChaCha20_ctr32_sve,%function
37
.align 5
38
ChaCha20_ctr32_sve:
39
AARCH64_VALID_CALL_TARGET
40
.inst 0x04a0e3e5 //cntw x5, ALL, MUL #1
41
cmp x2,x5,lsl #6
42
b.lt .Lreturn
43
mov x7,0
44
adrp x6,OPENSSL_armcap_P
45
ldr w6,[x6,#:lo12:OPENSSL_armcap_P]
46
tst w6,#ARMV8_SVE2
47
b.eq 1f
48
mov x7,1
49
b 2f
50
1:
51
cmp x5,4
52
b.le .Lreturn
53
adrp x6,.Lrot8
54
add x6,x6,#:lo12:.Lrot8
55
ldp w9,w10,[x6]
56
.inst 0x04aa4d3f //index z31.s,w9,w10
57
2:
58
AARCH64_SIGN_LINK_REGISTER
59
stp d8,d9,[sp,-192]!
60
stp d10,d11,[sp,16]
61
stp d12,d13,[sp,32]
62
stp d14,d15,[sp,48]
63
stp x16,x17,[sp,64]
64
stp x18,x19,[sp,80]
65
stp x20,x21,[sp,96]
66
stp x22,x23,[sp,112]
67
stp x24,x25,[sp,128]
68
stp x26,x27,[sp,144]
69
stp x28,x29,[sp,160]
70
str x30,[sp,176]
71
72
adrp x6,.Lchacha20_consts
73
add x6,x6,#:lo12:.Lchacha20_consts
74
ldp x23,x24,[x6]
75
ldp x25,x26,[x3]
76
ldp x27,x28,[x3, 16]
77
ldp x29,x30,[x4]
78
.inst 0x2599e3e0 //ptrues p0.s,ALL
79
#ifdef __AARCH64EB__
80
ror x25,x25,#32
81
ror x26,x26,#32
82
ror x27,x27,#32
83
ror x28,x28,#32
84
ror x29,x29,#32
85
ror x30,x30,#32
86
#endif
87
cbz x7, 1f
88
.align 5
89
100:
90
subs x7,x2,x5,lsl #6
91
b.lt 110f
92
mov x2,x7
93
b.eq 101f
94
cmp x2,64
95
b.lt 101f
96
mixin=1
97
lsr x8,x23,#32
98
.inst 0x05a03ae0 //dup z0.s,w23
99
.inst 0x05a03af9 //dup z25.s,w23
100
.if mixin == 1
101
mov w7,w23
102
.endif
103
.inst 0x05a03904 //dup z4.s,w8
104
.inst 0x05a0391a //dup z26.s,w8
105
lsr x10,x24,#32
106
.inst 0x05a03b08 //dup z8.s,w24
107
.inst 0x05a03b1b //dup z27.s,w24
108
.if mixin == 1
109
mov w9,w24
110
.endif
111
.inst 0x05a0394c //dup z12.s,w10
112
.inst 0x05a0395c //dup z28.s,w10
113
lsr x12,x25,#32
114
.inst 0x05a03b21 //dup z1.s,w25
115
.inst 0x05a03b3d //dup z29.s,w25
116
.if mixin == 1
117
mov w11,w25
118
.endif
119
.inst 0x05a03985 //dup z5.s,w12
120
.inst 0x05a0399e //dup z30.s,w12
121
lsr x14,x26,#32
122
.inst 0x05a03b49 //dup z9.s,w26
123
.inst 0x05a03b55 //dup z21.s,w26
124
.if mixin == 1
125
mov w13,w26
126
.endif
127
.inst 0x05a039cd //dup z13.s,w14
128
.inst 0x05a039d6 //dup z22.s,w14
129
lsr x16,x27,#32
130
.inst 0x05a03b62 //dup z2.s,w27
131
.inst 0x05a03b77 //dup z23.s,w27
132
.if mixin == 1
133
mov w15,w27
134
.endif
135
.inst 0x05a03a06 //dup z6.s,w16
136
.inst 0x05a03a18 //dup z24.s,w16
137
lsr x18,x28,#32
138
.inst 0x05a03b8a //dup z10.s,w28
139
.inst 0x05a03b91 //dup z17.s,w28
140
.if mixin == 1
141
mov w17,w28
142
.endif
143
.inst 0x05a03a4e //dup z14.s,w18
144
.inst 0x05a03a52 //dup z18.s,w18
145
lsr x22,x30,#32
146
.inst 0x05a03bcb //dup z11.s,w30
147
.inst 0x05a03bd4 //dup z20.s,w30
148
.if mixin == 1
149
mov w21,w30
150
.endif
151
.inst 0x05a03acf //dup z15.s,w22
152
.inst 0x05a03adf //dup z31.s,w22
153
.if mixin == 1
154
add w20,w29,#1
155
mov w19,w29
156
.inst 0x04a14690 //index z16.s,w20,1
157
.inst 0x04a14683 //index z3.s,w20,1
158
.else
159
.inst 0x04a147b0 //index z16.s,w29,1
160
.inst 0x04a147a3 //index z3.s,w29,1
161
.endif
162
lsr x20,x29,#32
163
.inst 0x05a03a87 //dup z7.s,w20
164
.inst 0x05a03a93 //dup z19.s,w20
165
mov x6,#10
166
10:
167
.align 5
168
.inst 0x04a10000 //add z0.s,z0.s,z1.s
169
.if mixin == 1
170
add w7,w7,w11
171
.endif
172
.inst 0x04a50084 //add z4.s,z4.s,z5.s
173
.if mixin == 1
174
add w8,w8,w12
175
.endif
176
.inst 0x04a90108 //add z8.s,z8.s,z9.s
177
.if mixin == 1
178
add w9,w9,w13
179
.endif
180
.inst 0x04ad018c //add z12.s,z12.s,z13.s
181
.if mixin == 1
182
add w10,w10,w14
183
.endif
184
.if mixin == 1
185
eor w19,w19,w7
186
.endif
187
.inst 0x04703403 //xar z3.s,z3.s,z0.s,16
188
.if mixin == 1
189
ror w19,w19,16
190
.endif
191
.if mixin == 1
192
eor w20,w20,w8
193
.endif
194
.inst 0x04703487 //xar z7.s,z7.s,z4.s,16
195
.if mixin == 1
196
ror w20,w20,16
197
.endif
198
.if mixin == 1
199
eor w21,w21,w9
200
.endif
201
.inst 0x0470350b //xar z11.s,z11.s,z8.s,16
202
.if mixin == 1
203
ror w21,w21,16
204
.endif
205
.if mixin == 1
206
eor w22,w22,w10
207
.endif
208
.inst 0x0470358f //xar z15.s,z15.s,z12.s,16
209
.if mixin == 1
210
ror w22,w22,16
211
.endif
212
.inst 0x04a30042 //add z2.s,z2.s,z3.s
213
.if mixin == 1
214
add w15,w15,w19
215
.endif
216
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
217
.if mixin == 1
218
add w16,w16,w20
219
.endif
220
.inst 0x04ab014a //add z10.s,z10.s,z11.s
221
.if mixin == 1
222
add w17,w17,w21
223
.endif
224
.inst 0x04af01ce //add z14.s,z14.s,z15.s
225
.if mixin == 1
226
add w18,w18,w22
227
.endif
228
.if mixin == 1
229
eor w11,w11,w15
230
.endif
231
.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20
232
.if mixin == 1
233
ror w11,w11,20
234
.endif
235
.if mixin == 1
236
eor w12,w12,w16
237
.endif
238
.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20
239
.if mixin == 1
240
ror w12,w12,20
241
.endif
242
.if mixin == 1
243
eor w13,w13,w17
244
.endif
245
.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20
246
.if mixin == 1
247
ror w13,w13,20
248
.endif
249
.if mixin == 1
250
eor w14,w14,w18
251
.endif
252
.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20
253
.if mixin == 1
254
ror w14,w14,20
255
.endif
256
.inst 0x04a10000 //add z0.s,z0.s,z1.s
257
.if mixin == 1
258
add w7,w7,w11
259
.endif
260
.inst 0x04a50084 //add z4.s,z4.s,z5.s
261
.if mixin == 1
262
add w8,w8,w12
263
.endif
264
.inst 0x04a90108 //add z8.s,z8.s,z9.s
265
.if mixin == 1
266
add w9,w9,w13
267
.endif
268
.inst 0x04ad018c //add z12.s,z12.s,z13.s
269
.if mixin == 1
270
add w10,w10,w14
271
.endif
272
.if mixin == 1
273
eor w19,w19,w7
274
.endif
275
.inst 0x04683403 //xar z3.s,z3.s,z0.s,24
276
.if mixin == 1
277
ror w19,w19,24
278
.endif
279
.if mixin == 1
280
eor w20,w20,w8
281
.endif
282
.inst 0x04683487 //xar z7.s,z7.s,z4.s,24
283
.if mixin == 1
284
ror w20,w20,24
285
.endif
286
.if mixin == 1
287
eor w21,w21,w9
288
.endif
289
.inst 0x0468350b //xar z11.s,z11.s,z8.s,24
290
.if mixin == 1
291
ror w21,w21,24
292
.endif
293
.if mixin == 1
294
eor w22,w22,w10
295
.endif
296
.inst 0x0468358f //xar z15.s,z15.s,z12.s,24
297
.if mixin == 1
298
ror w22,w22,24
299
.endif
300
.inst 0x04a30042 //add z2.s,z2.s,z3.s
301
.if mixin == 1
302
add w15,w15,w19
303
.endif
304
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
305
.if mixin == 1
306
add w16,w16,w20
307
.endif
308
.inst 0x04ab014a //add z10.s,z10.s,z11.s
309
.if mixin == 1
310
add w17,w17,w21
311
.endif
312
.inst 0x04af01ce //add z14.s,z14.s,z15.s
313
.if mixin == 1
314
add w18,w18,w22
315
.endif
316
.if mixin == 1
317
eor w11,w11,w15
318
.endif
319
.inst 0x04673441 //xar z1.s,z1.s,z2.s,25
320
.if mixin == 1
321
ror w11,w11,25
322
.endif
323
.if mixin == 1
324
eor w12,w12,w16
325
.endif
326
.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25
327
.if mixin == 1
328
ror w12,w12,25
329
.endif
330
.if mixin == 1
331
eor w13,w13,w17
332
.endif
333
.inst 0x04673549 //xar z9.s,z9.s,z10.s,25
334
.if mixin == 1
335
ror w13,w13,25
336
.endif
337
.if mixin == 1
338
eor w14,w14,w18
339
.endif
340
.inst 0x046735cd //xar z13.s,z13.s,z14.s,25
341
.if mixin == 1
342
ror w14,w14,25
343
.endif
344
.inst 0x04a50000 //add z0.s,z0.s,z5.s
345
.if mixin == 1
346
add w7,w7,w12
347
.endif
348
.inst 0x04a90084 //add z4.s,z4.s,z9.s
349
.if mixin == 1
350
add w8,w8,w13
351
.endif
352
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
353
.if mixin == 1
354
add w9,w9,w14
355
.endif
356
.inst 0x04a1018c //add z12.s,z12.s,z1.s
357
.if mixin == 1
358
add w10,w10,w11
359
.endif
360
.if mixin == 1
361
eor w22,w22,w7
362
.endif
363
.inst 0x0470340f //xar z15.s,z15.s,z0.s,16
364
.if mixin == 1
365
ror w22,w22,16
366
.endif
367
.if mixin == 1
368
eor w19,w19,w8
369
.endif
370
.inst 0x04703483 //xar z3.s,z3.s,z4.s,16
371
.if mixin == 1
372
ror w19,w19,16
373
.endif
374
.if mixin == 1
375
eor w20,w20,w9
376
.endif
377
.inst 0x04703507 //xar z7.s,z7.s,z8.s,16
378
.if mixin == 1
379
ror w20,w20,16
380
.endif
381
.if mixin == 1
382
eor w21,w21,w10
383
.endif
384
.inst 0x0470358b //xar z11.s,z11.s,z12.s,16
385
.if mixin == 1
386
ror w21,w21,16
387
.endif
388
.inst 0x04af014a //add z10.s,z10.s,z15.s
389
.if mixin == 1
390
add w17,w17,w22
391
.endif
392
.inst 0x04a301ce //add z14.s,z14.s,z3.s
393
.if mixin == 1
394
add w18,w18,w19
395
.endif
396
.inst 0x04a70042 //add z2.s,z2.s,z7.s
397
.if mixin == 1
398
add w15,w15,w20
399
.endif
400
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
401
.if mixin == 1
402
add w16,w16,w21
403
.endif
404
.if mixin == 1
405
eor w12,w12,w17
406
.endif
407
.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20
408
.if mixin == 1
409
ror w12,w12,20
410
.endif
411
.if mixin == 1
412
eor w13,w13,w18
413
.endif
414
.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20
415
.if mixin == 1
416
ror w13,w13,20
417
.endif
418
.if mixin == 1
419
eor w14,w14,w15
420
.endif
421
.inst 0x046c344d //xar z13.s,z13.s,z2.s,20
422
.if mixin == 1
423
ror w14,w14,20
424
.endif
425
.if mixin == 1
426
eor w11,w11,w16
427
.endif
428
.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20
429
.if mixin == 1
430
ror w11,w11,20
431
.endif
432
.inst 0x04a50000 //add z0.s,z0.s,z5.s
433
.if mixin == 1
434
add w7,w7,w12
435
.endif
436
.inst 0x04a90084 //add z4.s,z4.s,z9.s
437
.if mixin == 1
438
add w8,w8,w13
439
.endif
440
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
441
.if mixin == 1
442
add w9,w9,w14
443
.endif
444
.inst 0x04a1018c //add z12.s,z12.s,z1.s
445
.if mixin == 1
446
add w10,w10,w11
447
.endif
448
.if mixin == 1
449
eor w22,w22,w7
450
.endif
451
.inst 0x0468340f //xar z15.s,z15.s,z0.s,24
452
.if mixin == 1
453
ror w22,w22,24
454
.endif
455
.if mixin == 1
456
eor w19,w19,w8
457
.endif
458
.inst 0x04683483 //xar z3.s,z3.s,z4.s,24
459
.if mixin == 1
460
ror w19,w19,24
461
.endif
462
.if mixin == 1
463
eor w20,w20,w9
464
.endif
465
.inst 0x04683507 //xar z7.s,z7.s,z8.s,24
466
.if mixin == 1
467
ror w20,w20,24
468
.endif
469
.if mixin == 1
470
eor w21,w21,w10
471
.endif
472
.inst 0x0468358b //xar z11.s,z11.s,z12.s,24
473
.if mixin == 1
474
ror w21,w21,24
475
.endif
476
.inst 0x04af014a //add z10.s,z10.s,z15.s
477
.if mixin == 1
478
add w17,w17,w22
479
.endif
480
.inst 0x04a301ce //add z14.s,z14.s,z3.s
481
.if mixin == 1
482
add w18,w18,w19
483
.endif
484
.inst 0x04a70042 //add z2.s,z2.s,z7.s
485
.if mixin == 1
486
add w15,w15,w20
487
.endif
488
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
489
.if mixin == 1
490
add w16,w16,w21
491
.endif
492
.if mixin == 1
493
eor w12,w12,w17
494
.endif
495
.inst 0x04673545 //xar z5.s,z5.s,z10.s,25
496
.if mixin == 1
497
ror w12,w12,25
498
.endif
499
.if mixin == 1
500
eor w13,w13,w18
501
.endif
502
.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25
503
.if mixin == 1
504
ror w13,w13,25
505
.endif
506
.if mixin == 1
507
eor w14,w14,w15
508
.endif
509
.inst 0x0467344d //xar z13.s,z13.s,z2.s,25
510
.if mixin == 1
511
ror w14,w14,25
512
.endif
513
.if mixin == 1
514
eor w11,w11,w16
515
.endif
516
.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25
517
.if mixin == 1
518
ror w11,w11,25
519
.endif
520
sub x6,x6,1
521
cbnz x6,10b
522
.if mixin == 1
523
add w7,w7,w23
524
.endif
525
.inst 0x04b90000 //add z0.s,z0.s,z25.s
526
.if mixin == 1
527
add x8,x8,x23,lsr #32
528
.endif
529
.inst 0x04ba0084 //add z4.s,z4.s,z26.s
530
.if mixin == 1
531
add x7,x7,x8,lsl #32 // pack
532
.endif
533
.if mixin == 1
534
add w9,w9,w24
535
.endif
536
.inst 0x04bb0108 //add z8.s,z8.s,z27.s
537
.if mixin == 1
538
add x10,x10,x24,lsr #32
539
.endif
540
.inst 0x04bc018c //add z12.s,z12.s,z28.s
541
.if mixin == 1
542
add x9,x9,x10,lsl #32 // pack
543
.endif
544
.if mixin == 1
545
ldp x8,x10,[x1],#16
546
.endif
547
.if mixin == 1
548
add w11,w11,w25
549
.endif
550
.inst 0x04bd0021 //add z1.s,z1.s,z29.s
551
.if mixin == 1
552
add x12,x12,x25,lsr #32
553
.endif
554
.inst 0x04be00a5 //add z5.s,z5.s,z30.s
555
.if mixin == 1
556
add x11,x11,x12,lsl #32 // pack
557
.endif
558
.if mixin == 1
559
add w13,w13,w26
560
.endif
561
.inst 0x04b50129 //add z9.s,z9.s,z21.s
562
.if mixin == 1
563
add x14,x14,x26,lsr #32
564
.endif
565
.inst 0x04b601ad //add z13.s,z13.s,z22.s
566
.if mixin == 1
567
add x13,x13,x14,lsl #32 // pack
568
.endif
569
.if mixin == 1
570
ldp x12,x14,[x1],#16
571
.endif
572
.if mixin == 1
573
add w15,w15,w27
574
.endif
575
.inst 0x04b70042 //add z2.s,z2.s,z23.s
576
.if mixin == 1
577
add x16,x16,x27,lsr #32
578
.endif
579
.inst 0x04b800c6 //add z6.s,z6.s,z24.s
580
.if mixin == 1
581
add x15,x15,x16,lsl #32 // pack
582
.endif
583
.if mixin == 1
584
add w17,w17,w28
585
.endif
586
.inst 0x04b1014a //add z10.s,z10.s,z17.s
587
.if mixin == 1
588
add x18,x18,x28,lsr #32
589
.endif
590
.inst 0x04b201ce //add z14.s,z14.s,z18.s
591
.if mixin == 1
592
add x17,x17,x18,lsl #32 // pack
593
.endif
594
.if mixin == 1
595
ldp x16,x18,[x1],#16
596
.endif
597
.if mixin == 1
598
add w19,w19,w29
599
.endif
600
.inst 0x04b00063 //add z3.s,z3.s,z16.s
601
.if mixin == 1
602
add x20,x20,x29,lsr #32
603
.endif
604
.inst 0x04b300e7 //add z7.s,z7.s,z19.s
605
.if mixin == 1
606
add x19,x19,x20,lsl #32 // pack
607
.endif
608
.if mixin == 1
609
add w21,w21,w30
610
.endif
611
.inst 0x04b4016b //add z11.s,z11.s,z20.s
612
.if mixin == 1
613
add x22,x22,x30,lsr #32
614
.endif
615
.inst 0x04bf01ef //add z15.s,z15.s,z31.s
616
.if mixin == 1
617
add x21,x21,x22,lsl #32 // pack
618
.endif
619
.if mixin == 1
620
ldp x20,x22,[x1],#16
621
.endif
622
#ifdef __AARCH64EB__
623
rev x7,x7
624
.inst 0x05a48000 //revb z0.s,p0/m,z0.s
625
.inst 0x05a48084 //revb z4.s,p0/m,z4.s
626
rev x9,x9
627
.inst 0x05a48108 //revb z8.s,p0/m,z8.s
628
.inst 0x05a4818c //revb z12.s,p0/m,z12.s
629
rev x11,x11
630
.inst 0x05a48021 //revb z1.s,p0/m,z1.s
631
.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
632
rev x13,x13
633
.inst 0x05a48129 //revb z9.s,p0/m,z9.s
634
.inst 0x05a481ad //revb z13.s,p0/m,z13.s
635
rev x15,x15
636
.inst 0x05a48042 //revb z2.s,p0/m,z2.s
637
.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
638
rev x17,x17
639
.inst 0x05a4814a //revb z10.s,p0/m,z10.s
640
.inst 0x05a481ce //revb z14.s,p0/m,z14.s
641
rev x19,x19
642
.inst 0x05a48063 //revb z3.s,p0/m,z3.s
643
.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
644
rev x21,x21
645
.inst 0x05a4816b //revb z11.s,p0/m,z11.s
646
.inst 0x05a481ef //revb z15.s,p0/m,z15.s
647
#endif
648
.if mixin == 1
649
add x29,x29,#1
650
.endif
651
cmp x5,4
652
b.ne 200f
653
.if mixin == 1
654
eor x7,x7,x8
655
.endif
656
.if mixin == 1
657
eor x9,x9,x10
658
.endif
659
.if mixin == 1
660
eor x11,x11,x12
661
.endif
662
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
663
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
664
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
665
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
666
667
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
668
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
669
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
670
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
671
672
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
673
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
674
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
675
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
676
677
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
678
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
679
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
680
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
681
.if mixin == 1
682
eor x13,x13,x14
683
.endif
684
.if mixin == 1
685
eor x15,x15,x16
686
.endif
687
.if mixin == 1
688
eor x17,x17,x18
689
.endif
690
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
691
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
692
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
693
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
694
695
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
696
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
697
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
698
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
699
700
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
701
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
702
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
703
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
704
705
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
706
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
707
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
708
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
709
.if mixin == 1
710
eor x19,x19,x20
711
.endif
712
.if mixin == 1
713
eor x21,x21,x22
714
.endif
715
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
716
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
717
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
718
.inst 0x04b23021 //eor z1.d,z1.d,z18.d
719
.inst 0x04b33042 //eor z2.d,z2.d,z19.d
720
.inst 0x04b43063 //eor z3.d,z3.d,z20.d
721
.inst 0x04b53084 //eor z4.d,z4.d,z21.d
722
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
723
.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
724
.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
725
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
726
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
727
.if mixin == 1
728
stp x7,x9,[x0],#16
729
.endif
730
.inst 0x04b13108 //eor z8.d,z8.d,z17.d
731
.inst 0x04b23129 //eor z9.d,z9.d,z18.d
732
.if mixin == 1
733
stp x11,x13,[x0],#16
734
.endif
735
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
736
.inst 0x04b4316b //eor z11.d,z11.d,z20.d
737
.if mixin == 1
738
stp x15,x17,[x0],#16
739
.endif
740
.inst 0x04b5318c //eor z12.d,z12.d,z21.d
741
.inst 0x04b631ad //eor z13.d,z13.d,z22.d
742
.if mixin == 1
743
stp x19,x21,[x0],#16
744
.endif
745
.inst 0x04b731ce //eor z14.d,z14.d,z23.d
746
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
747
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
748
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
749
st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
750
st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
751
b 210f
752
200:
753
.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
754
.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
755
.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
756
.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
757
758
.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
759
.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
760
.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
761
.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
762
763
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
764
.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
765
.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
766
.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
767
768
.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
769
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
770
.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
771
.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
772
.if mixin == 1
773
eor x7,x7,x8
774
.endif
775
.if mixin == 1
776
eor x9,x9,x10
777
.endif
778
.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
779
.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
780
.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
781
.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
782
783
.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
784
.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
785
.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
786
.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
787
788
.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
789
.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
790
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
791
.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
792
793
.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
794
.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
795
.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
796
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
797
.if mixin == 1
798
eor x11,x11,x12
799
.endif
800
.if mixin == 1
801
eor x13,x13,x14
802
.endif
803
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
804
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
805
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
806
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
807
808
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
809
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
810
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
811
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
812
813
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
814
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
815
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
816
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
817
818
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
819
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
820
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
821
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
822
.if mixin == 1
823
eor x15,x15,x16
824
.endif
825
.if mixin == 1
826
eor x17,x17,x18
827
.endif
828
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
829
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
830
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
831
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
832
833
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
834
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
835
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
836
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
837
838
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
839
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
840
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
841
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
842
843
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
844
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
845
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
846
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
847
.if mixin == 1
848
eor x19,x19,x20
849
.endif
850
.if mixin == 1
851
eor x21,x21,x22
852
.endif
853
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
854
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
855
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
856
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
857
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
858
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
859
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
860
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
861
.inst 0x04215101 //addvl x1,x1,8
862
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
863
.inst 0x04b23084 //eor z4.d,z4.d,z18.d
864
.inst 0x04b33108 //eor z8.d,z8.d,z19.d
865
.inst 0x04b4318c //eor z12.d,z12.d,z20.d
866
.inst 0x04b53021 //eor z1.d,z1.d,z21.d
867
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
868
.inst 0x04b73129 //eor z9.d,z9.d,z23.d
869
.inst 0x04b831ad //eor z13.d,z13.d,z24.d
870
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
871
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
872
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
873
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
874
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
875
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
876
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
877
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
878
.inst 0x04215101 //addvl x1,x1,8
879
.if mixin == 1
880
stp x7,x9,[x0],#16
881
.endif
882
.inst 0x04b13042 //eor z2.d,z2.d,z17.d
883
.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
884
.if mixin == 1
885
stp x11,x13,[x0],#16
886
.endif
887
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
888
.inst 0x04b431ce //eor z14.d,z14.d,z20.d
889
.if mixin == 1
890
stp x15,x17,[x0],#16
891
.endif
892
.inst 0x04b53063 //eor z3.d,z3.d,z21.d
893
.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
894
.if mixin == 1
895
stp x19,x21,[x0],#16
896
.endif
897
.inst 0x04b7316b //eor z11.d,z11.d,z23.d
898
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
899
.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
900
.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
901
.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
902
.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
903
.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
904
.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
905
.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
906
.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
907
.inst 0x04205100 //addvl x0,x0,8
908
.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
909
.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
910
.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
911
.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
912
.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
913
.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
914
.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
915
.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
916
.inst 0x04205100 //addvl x0,x0,8
917
210:
918
.inst 0x04b0e3fd //incw x29, ALL, MUL #1
919
subs x2,x2,64
920
b.gt 100b
921
b 110f
922
101:
923
mixin=0
924
lsr x8,x23,#32
925
.inst 0x05a03ae0 //dup z0.s,w23
926
.inst 0x05a03af9 //dup z25.s,w23
927
.if mixin == 1
928
mov w7,w23
929
.endif
930
.inst 0x05a03904 //dup z4.s,w8
931
.inst 0x05a0391a //dup z26.s,w8
932
lsr x10,x24,#32
933
.inst 0x05a03b08 //dup z8.s,w24
934
.inst 0x05a03b1b //dup z27.s,w24
935
.if mixin == 1
936
mov w9,w24
937
.endif
938
.inst 0x05a0394c //dup z12.s,w10
939
.inst 0x05a0395c //dup z28.s,w10
940
lsr x12,x25,#32
941
.inst 0x05a03b21 //dup z1.s,w25
942
.inst 0x05a03b3d //dup z29.s,w25
943
.if mixin == 1
944
mov w11,w25
945
.endif
946
.inst 0x05a03985 //dup z5.s,w12
947
.inst 0x05a0399e //dup z30.s,w12
948
lsr x14,x26,#32
949
.inst 0x05a03b49 //dup z9.s,w26
950
.inst 0x05a03b55 //dup z21.s,w26
951
.if mixin == 1
952
mov w13,w26
953
.endif
954
.inst 0x05a039cd //dup z13.s,w14
955
.inst 0x05a039d6 //dup z22.s,w14
956
lsr x16,x27,#32
957
.inst 0x05a03b62 //dup z2.s,w27
958
.inst 0x05a03b77 //dup z23.s,w27
959
.if mixin == 1
960
mov w15,w27
961
.endif
962
.inst 0x05a03a06 //dup z6.s,w16
963
.inst 0x05a03a18 //dup z24.s,w16
964
lsr x18,x28,#32
965
.inst 0x05a03b8a //dup z10.s,w28
966
.inst 0x05a03b91 //dup z17.s,w28
967
.if mixin == 1
968
mov w17,w28
969
.endif
970
.inst 0x05a03a4e //dup z14.s,w18
971
.inst 0x05a03a52 //dup z18.s,w18
972
lsr x22,x30,#32
973
.inst 0x05a03bcb //dup z11.s,w30
974
.inst 0x05a03bd4 //dup z20.s,w30
975
.if mixin == 1
976
mov w21,w30
977
.endif
978
.inst 0x05a03acf //dup z15.s,w22
979
.inst 0x05a03adf //dup z31.s,w22
980
.if mixin == 1
981
add w20,w29,#1
982
mov w19,w29
983
.inst 0x04a14690 //index z16.s,w20,1
984
.inst 0x04a14683 //index z3.s,w20,1
985
.else
986
.inst 0x04a147b0 //index z16.s,w29,1
987
.inst 0x04a147a3 //index z3.s,w29,1
988
.endif
989
lsr x20,x29,#32
990
.inst 0x05a03a87 //dup z7.s,w20
991
.inst 0x05a03a93 //dup z19.s,w20
992
mov x6,#10
993
10:
994
.align 5
995
.inst 0x04a10000 //add z0.s,z0.s,z1.s
996
.if mixin == 1
997
add w7,w7,w11
998
.endif
999
.inst 0x04a50084 //add z4.s,z4.s,z5.s
1000
.if mixin == 1
1001
add w8,w8,w12
1002
.endif
1003
.inst 0x04a90108 //add z8.s,z8.s,z9.s
1004
.if mixin == 1
1005
add w9,w9,w13
1006
.endif
1007
.inst 0x04ad018c //add z12.s,z12.s,z13.s
1008
.if mixin == 1
1009
add w10,w10,w14
1010
.endif
1011
.if mixin == 1
1012
eor w19,w19,w7
1013
.endif
1014
.inst 0x04703403 //xar z3.s,z3.s,z0.s,16
1015
.if mixin == 1
1016
ror w19,w19,16
1017
.endif
1018
.if mixin == 1
1019
eor w20,w20,w8
1020
.endif
1021
.inst 0x04703487 //xar z7.s,z7.s,z4.s,16
1022
.if mixin == 1
1023
ror w20,w20,16
1024
.endif
1025
.if mixin == 1
1026
eor w21,w21,w9
1027
.endif
1028
.inst 0x0470350b //xar z11.s,z11.s,z8.s,16
1029
.if mixin == 1
1030
ror w21,w21,16
1031
.endif
1032
.if mixin == 1
1033
eor w22,w22,w10
1034
.endif
1035
.inst 0x0470358f //xar z15.s,z15.s,z12.s,16
1036
.if mixin == 1
1037
ror w22,w22,16
1038
.endif
1039
.inst 0x04a30042 //add z2.s,z2.s,z3.s
1040
.if mixin == 1
1041
add w15,w15,w19
1042
.endif
1043
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
1044
.if mixin == 1
1045
add w16,w16,w20
1046
.endif
1047
.inst 0x04ab014a //add z10.s,z10.s,z11.s
1048
.if mixin == 1
1049
add w17,w17,w21
1050
.endif
1051
.inst 0x04af01ce //add z14.s,z14.s,z15.s
1052
.if mixin == 1
1053
add w18,w18,w22
1054
.endif
1055
.if mixin == 1
1056
eor w11,w11,w15
1057
.endif
1058
.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20
1059
.if mixin == 1
1060
ror w11,w11,20
1061
.endif
1062
.if mixin == 1
1063
eor w12,w12,w16
1064
.endif
1065
.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20
1066
.if mixin == 1
1067
ror w12,w12,20
1068
.endif
1069
.if mixin == 1
1070
eor w13,w13,w17
1071
.endif
1072
.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20
1073
.if mixin == 1
1074
ror w13,w13,20
1075
.endif
1076
.if mixin == 1
1077
eor w14,w14,w18
1078
.endif
1079
.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20
1080
.if mixin == 1
1081
ror w14,w14,20
1082
.endif
1083
.inst 0x04a10000 //add z0.s,z0.s,z1.s
1084
.if mixin == 1
1085
add w7,w7,w11
1086
.endif
1087
.inst 0x04a50084 //add z4.s,z4.s,z5.s
1088
.if mixin == 1
1089
add w8,w8,w12
1090
.endif
1091
.inst 0x04a90108 //add z8.s,z8.s,z9.s
1092
.if mixin == 1
1093
add w9,w9,w13
1094
.endif
1095
.inst 0x04ad018c //add z12.s,z12.s,z13.s
1096
.if mixin == 1
1097
add w10,w10,w14
1098
.endif
1099
.if mixin == 1
1100
eor w19,w19,w7
1101
.endif
1102
.inst 0x04683403 //xar z3.s,z3.s,z0.s,24
1103
.if mixin == 1
1104
ror w19,w19,24
1105
.endif
1106
.if mixin == 1
1107
eor w20,w20,w8
1108
.endif
1109
.inst 0x04683487 //xar z7.s,z7.s,z4.s,24
1110
.if mixin == 1
1111
ror w20,w20,24
1112
.endif
1113
.if mixin == 1
1114
eor w21,w21,w9
1115
.endif
1116
.inst 0x0468350b //xar z11.s,z11.s,z8.s,24
1117
.if mixin == 1
1118
ror w21,w21,24
1119
.endif
1120
.if mixin == 1
1121
eor w22,w22,w10
1122
.endif
1123
.inst 0x0468358f //xar z15.s,z15.s,z12.s,24
1124
.if mixin == 1
1125
ror w22,w22,24
1126
.endif
1127
.inst 0x04a30042 //add z2.s,z2.s,z3.s
1128
.if mixin == 1
1129
add w15,w15,w19
1130
.endif
1131
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
1132
.if mixin == 1
1133
add w16,w16,w20
1134
.endif
1135
.inst 0x04ab014a //add z10.s,z10.s,z11.s
1136
.if mixin == 1
1137
add w17,w17,w21
1138
.endif
1139
.inst 0x04af01ce //add z14.s,z14.s,z15.s
1140
.if mixin == 1
1141
add w18,w18,w22
1142
.endif
1143
.if mixin == 1
1144
eor w11,w11,w15
1145
.endif
1146
.inst 0x04673441 //xar z1.s,z1.s,z2.s,25
1147
.if mixin == 1
1148
ror w11,w11,25
1149
.endif
1150
.if mixin == 1
1151
eor w12,w12,w16
1152
.endif
1153
.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25
1154
.if mixin == 1
1155
ror w12,w12,25
1156
.endif
1157
.if mixin == 1
1158
eor w13,w13,w17
1159
.endif
1160
.inst 0x04673549 //xar z9.s,z9.s,z10.s,25
1161
.if mixin == 1
1162
ror w13,w13,25
1163
.endif
1164
.if mixin == 1
1165
eor w14,w14,w18
1166
.endif
1167
.inst 0x046735cd //xar z13.s,z13.s,z14.s,25
1168
.if mixin == 1
1169
ror w14,w14,25
1170
.endif
1171
.inst 0x04a50000 //add z0.s,z0.s,z5.s
1172
.if mixin == 1
1173
add w7,w7,w12
1174
.endif
1175
.inst 0x04a90084 //add z4.s,z4.s,z9.s
1176
.if mixin == 1
1177
add w8,w8,w13
1178
.endif
1179
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
1180
.if mixin == 1
1181
add w9,w9,w14
1182
.endif
1183
.inst 0x04a1018c //add z12.s,z12.s,z1.s
1184
.if mixin == 1
1185
add w10,w10,w11
1186
.endif
1187
.if mixin == 1
1188
eor w22,w22,w7
1189
.endif
1190
.inst 0x0470340f //xar z15.s,z15.s,z0.s,16
1191
.if mixin == 1
1192
ror w22,w22,16
1193
.endif
1194
.if mixin == 1
1195
eor w19,w19,w8
1196
.endif
1197
.inst 0x04703483 //xar z3.s,z3.s,z4.s,16
1198
.if mixin == 1
1199
ror w19,w19,16
1200
.endif
1201
.if mixin == 1
1202
eor w20,w20,w9
1203
.endif
1204
.inst 0x04703507 //xar z7.s,z7.s,z8.s,16
1205
.if mixin == 1
1206
ror w20,w20,16
1207
.endif
1208
.if mixin == 1
1209
eor w21,w21,w10
1210
.endif
1211
.inst 0x0470358b //xar z11.s,z11.s,z12.s,16
1212
.if mixin == 1
1213
ror w21,w21,16
1214
.endif
1215
.inst 0x04af014a //add z10.s,z10.s,z15.s
1216
.if mixin == 1
1217
add w17,w17,w22
1218
.endif
1219
.inst 0x04a301ce //add z14.s,z14.s,z3.s
1220
.if mixin == 1
1221
add w18,w18,w19
1222
.endif
1223
.inst 0x04a70042 //add z2.s,z2.s,z7.s
1224
.if mixin == 1
1225
add w15,w15,w20
1226
.endif
1227
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
1228
.if mixin == 1
1229
add w16,w16,w21
1230
.endif
1231
.if mixin == 1
1232
eor w12,w12,w17
1233
.endif
1234
.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20
1235
.if mixin == 1
1236
ror w12,w12,20
1237
.endif
1238
.if mixin == 1
1239
eor w13,w13,w18
1240
.endif
1241
.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20
1242
.if mixin == 1
1243
ror w13,w13,20
1244
.endif
1245
.if mixin == 1
1246
eor w14,w14,w15
1247
.endif
1248
.inst 0x046c344d //xar z13.s,z13.s,z2.s,20
1249
.if mixin == 1
1250
ror w14,w14,20
1251
.endif
1252
.if mixin == 1
1253
eor w11,w11,w16
1254
.endif
1255
.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20
1256
.if mixin == 1
1257
ror w11,w11,20
1258
.endif
1259
.inst 0x04a50000 //add z0.s,z0.s,z5.s
1260
.if mixin == 1
1261
add w7,w7,w12
1262
.endif
1263
.inst 0x04a90084 //add z4.s,z4.s,z9.s
1264
.if mixin == 1
1265
add w8,w8,w13
1266
.endif
1267
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
1268
.if mixin == 1
1269
add w9,w9,w14
1270
.endif
1271
.inst 0x04a1018c //add z12.s,z12.s,z1.s
1272
.if mixin == 1
1273
add w10,w10,w11
1274
.endif
1275
.if mixin == 1
1276
eor w22,w22,w7
1277
.endif
1278
.inst 0x0468340f //xar z15.s,z15.s,z0.s,24
1279
.if mixin == 1
1280
ror w22,w22,24
1281
.endif
1282
.if mixin == 1
1283
eor w19,w19,w8
1284
.endif
1285
.inst 0x04683483 //xar z3.s,z3.s,z4.s,24
1286
.if mixin == 1
1287
ror w19,w19,24
1288
.endif
1289
.if mixin == 1
1290
eor w20,w20,w9
1291
.endif
1292
.inst 0x04683507 //xar z7.s,z7.s,z8.s,24
1293
.if mixin == 1
1294
ror w20,w20,24
1295
.endif
1296
.if mixin == 1
1297
eor w21,w21,w10
1298
.endif
1299
.inst 0x0468358b //xar z11.s,z11.s,z12.s,24
1300
.if mixin == 1
1301
ror w21,w21,24
1302
.endif
1303
.inst 0x04af014a //add z10.s,z10.s,z15.s
1304
.if mixin == 1
1305
add w17,w17,w22
1306
.endif
1307
.inst 0x04a301ce //add z14.s,z14.s,z3.s
1308
.if mixin == 1
1309
add w18,w18,w19
1310
.endif
1311
.inst 0x04a70042 //add z2.s,z2.s,z7.s
1312
.if mixin == 1
1313
add w15,w15,w20
1314
.endif
1315
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
1316
.if mixin == 1
1317
add w16,w16,w21
1318
.endif
1319
.if mixin == 1
1320
eor w12,w12,w17
1321
.endif
1322
.inst 0x04673545 //xar z5.s,z5.s,z10.s,25
1323
.if mixin == 1
1324
ror w12,w12,25
1325
.endif
1326
.if mixin == 1
1327
eor w13,w13,w18
1328
.endif
1329
.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25
1330
.if mixin == 1
1331
ror w13,w13,25
1332
.endif
1333
.if mixin == 1
1334
eor w14,w14,w15
1335
.endif
1336
.inst 0x0467344d //xar z13.s,z13.s,z2.s,25
1337
.if mixin == 1
1338
ror w14,w14,25
1339
.endif
1340
.if mixin == 1
1341
eor w11,w11,w16
1342
.endif
1343
.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25
1344
.if mixin == 1
1345
ror w11,w11,25
1346
.endif
1347
sub x6,x6,1
1348
cbnz x6,10b
1349
.if mixin == 1
1350
add w7,w7,w23
1351
.endif
1352
.inst 0x04b90000 //add z0.s,z0.s,z25.s
1353
.if mixin == 1
1354
add x8,x8,x23,lsr #32
1355
.endif
1356
.inst 0x04ba0084 //add z4.s,z4.s,z26.s
1357
.if mixin == 1
1358
add x7,x7,x8,lsl #32 // pack
1359
.endif
1360
.if mixin == 1
1361
add w9,w9,w24
1362
.endif
1363
.inst 0x04bb0108 //add z8.s,z8.s,z27.s
1364
.if mixin == 1
1365
add x10,x10,x24,lsr #32
1366
.endif
1367
.inst 0x04bc018c //add z12.s,z12.s,z28.s
1368
.if mixin == 1
1369
add x9,x9,x10,lsl #32 // pack
1370
.endif
1371
.if mixin == 1
1372
ldp x8,x10,[x1],#16
1373
.endif
1374
.if mixin == 1
1375
add w11,w11,w25
1376
.endif
1377
.inst 0x04bd0021 //add z1.s,z1.s,z29.s
1378
.if mixin == 1
1379
add x12,x12,x25,lsr #32
1380
.endif
1381
.inst 0x04be00a5 //add z5.s,z5.s,z30.s
1382
.if mixin == 1
1383
add x11,x11,x12,lsl #32 // pack
1384
.endif
1385
.if mixin == 1
1386
add w13,w13,w26
1387
.endif
1388
.inst 0x04b50129 //add z9.s,z9.s,z21.s
1389
.if mixin == 1
1390
add x14,x14,x26,lsr #32
1391
.endif
1392
.inst 0x04b601ad //add z13.s,z13.s,z22.s
1393
.if mixin == 1
1394
add x13,x13,x14,lsl #32 // pack
1395
.endif
1396
.if mixin == 1
1397
ldp x12,x14,[x1],#16
1398
.endif
1399
.if mixin == 1
1400
add w15,w15,w27
1401
.endif
1402
.inst 0x04b70042 //add z2.s,z2.s,z23.s
1403
.if mixin == 1
1404
add x16,x16,x27,lsr #32
1405
.endif
1406
.inst 0x04b800c6 //add z6.s,z6.s,z24.s
1407
.if mixin == 1
1408
add x15,x15,x16,lsl #32 // pack
1409
.endif
1410
.if mixin == 1
1411
add w17,w17,w28
1412
.endif
1413
.inst 0x04b1014a //add z10.s,z10.s,z17.s
1414
.if mixin == 1
1415
add x18,x18,x28,lsr #32
1416
.endif
1417
.inst 0x04b201ce //add z14.s,z14.s,z18.s
1418
.if mixin == 1
1419
add x17,x17,x18,lsl #32 // pack
1420
.endif
1421
.if mixin == 1
1422
ldp x16,x18,[x1],#16
1423
.endif
1424
.if mixin == 1
1425
add w19,w19,w29
1426
.endif
1427
.inst 0x04b00063 //add z3.s,z3.s,z16.s
1428
.if mixin == 1
1429
add x20,x20,x29,lsr #32
1430
.endif
1431
.inst 0x04b300e7 //add z7.s,z7.s,z19.s
1432
.if mixin == 1
1433
add x19,x19,x20,lsl #32 // pack
1434
.endif
1435
.if mixin == 1
1436
add w21,w21,w30
1437
.endif
1438
.inst 0x04b4016b //add z11.s,z11.s,z20.s
1439
.if mixin == 1
1440
add x22,x22,x30,lsr #32
1441
.endif
1442
.inst 0x04bf01ef //add z15.s,z15.s,z31.s
1443
.if mixin == 1
1444
add x21,x21,x22,lsl #32 // pack
1445
.endif
1446
.if mixin == 1
1447
ldp x20,x22,[x1],#16
1448
.endif
1449
#ifdef __AARCH64EB__
1450
rev x7,x7
1451
.inst 0x05a48000 //revb z0.s,p0/m,z0.s
1452
.inst 0x05a48084 //revb z4.s,p0/m,z4.s
1453
rev x9,x9
1454
.inst 0x05a48108 //revb z8.s,p0/m,z8.s
1455
.inst 0x05a4818c //revb z12.s,p0/m,z12.s
1456
rev x11,x11
1457
.inst 0x05a48021 //revb z1.s,p0/m,z1.s
1458
.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
1459
rev x13,x13
1460
.inst 0x05a48129 //revb z9.s,p0/m,z9.s
1461
.inst 0x05a481ad //revb z13.s,p0/m,z13.s
1462
rev x15,x15
1463
.inst 0x05a48042 //revb z2.s,p0/m,z2.s
1464
.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
1465
rev x17,x17
1466
.inst 0x05a4814a //revb z10.s,p0/m,z10.s
1467
.inst 0x05a481ce //revb z14.s,p0/m,z14.s
1468
rev x19,x19
1469
.inst 0x05a48063 //revb z3.s,p0/m,z3.s
1470
.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
1471
rev x21,x21
1472
.inst 0x05a4816b //revb z11.s,p0/m,z11.s
1473
.inst 0x05a481ef //revb z15.s,p0/m,z15.s
1474
#endif
1475
.if mixin == 1
1476
add x29,x29,#1
1477
.endif
1478
cmp x5,4
1479
b.ne 200f
1480
.if mixin == 1
1481
eor x7,x7,x8
1482
.endif
1483
.if mixin == 1
1484
eor x9,x9,x10
1485
.endif
1486
.if mixin == 1
1487
eor x11,x11,x12
1488
.endif
1489
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
1490
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
1491
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
1492
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
1493
1494
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
1495
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
1496
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
1497
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
1498
1499
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
1500
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
1501
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
1502
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
1503
1504
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
1505
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
1506
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
1507
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
1508
.if mixin == 1
1509
eor x13,x13,x14
1510
.endif
1511
.if mixin == 1
1512
eor x15,x15,x16
1513
.endif
1514
.if mixin == 1
1515
eor x17,x17,x18
1516
.endif
1517
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
1518
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
1519
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
1520
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
1521
1522
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
1523
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
1524
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
1525
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
1526
1527
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
1528
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
1529
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
1530
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
1531
1532
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
1533
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
1534
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
1535
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
1536
.if mixin == 1
1537
eor x19,x19,x20
1538
.endif
1539
.if mixin == 1
1540
eor x21,x21,x22
1541
.endif
1542
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
1543
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
1544
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
1545
.inst 0x04b23021 //eor z1.d,z1.d,z18.d
1546
.inst 0x04b33042 //eor z2.d,z2.d,z19.d
1547
.inst 0x04b43063 //eor z3.d,z3.d,z20.d
1548
.inst 0x04b53084 //eor z4.d,z4.d,z21.d
1549
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
1550
.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
1551
.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
1552
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
1553
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
1554
.if mixin == 1
1555
stp x7,x9,[x0],#16
1556
.endif
1557
.inst 0x04b13108 //eor z8.d,z8.d,z17.d
1558
.inst 0x04b23129 //eor z9.d,z9.d,z18.d
1559
.if mixin == 1
1560
stp x11,x13,[x0],#16
1561
.endif
1562
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
1563
.inst 0x04b4316b //eor z11.d,z11.d,z20.d
1564
.if mixin == 1
1565
stp x15,x17,[x0],#16
1566
.endif
1567
.inst 0x04b5318c //eor z12.d,z12.d,z21.d
1568
.inst 0x04b631ad //eor z13.d,z13.d,z22.d
1569
.if mixin == 1
1570
stp x19,x21,[x0],#16
1571
.endif
1572
.inst 0x04b731ce //eor z14.d,z14.d,z23.d
1573
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
1574
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
1575
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
1576
st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
1577
st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
1578
b 210f
1579
200:
1580
.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
1581
.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
1582
.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
1583
.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
1584
1585
.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
1586
.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
1587
.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
1588
.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
1589
1590
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
1591
.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
1592
.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
1593
.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
1594
1595
.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
1596
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
1597
.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
1598
.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
1599
.if mixin == 1
1600
eor x7,x7,x8
1601
.endif
1602
.if mixin == 1
1603
eor x9,x9,x10
1604
.endif
1605
.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
1606
.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
1607
.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
1608
.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
1609
1610
.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
1611
.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
1612
.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
1613
.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
1614
1615
.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
1616
.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
1617
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
1618
.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
1619
1620
.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
1621
.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
1622
.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
1623
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
1624
.if mixin == 1
1625
eor x11,x11,x12
1626
.endif
1627
.if mixin == 1
1628
eor x13,x13,x14
1629
.endif
1630
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
1631
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
1632
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
1633
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
1634
1635
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
1636
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
1637
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
1638
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
1639
1640
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
1641
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
1642
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
1643
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
1644
1645
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
1646
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
1647
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
1648
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
1649
.if mixin == 1
1650
eor x15,x15,x16
1651
.endif
1652
.if mixin == 1
1653
eor x17,x17,x18
1654
.endif
1655
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
1656
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
1657
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
1658
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
1659
1660
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
1661
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
1662
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
1663
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
1664
1665
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
1666
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
1667
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
1668
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
1669
1670
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
1671
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
1672
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
1673
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
1674
.if mixin == 1
1675
eor x19,x19,x20
1676
.endif
1677
.if mixin == 1
1678
eor x21,x21,x22
1679
.endif
1680
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
1681
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
1682
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
1683
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
1684
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
1685
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
1686
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
1687
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
1688
.inst 0x04215101 //addvl x1,x1,8
1689
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
1690
.inst 0x04b23084 //eor z4.d,z4.d,z18.d
1691
.inst 0x04b33108 //eor z8.d,z8.d,z19.d
1692
.inst 0x04b4318c //eor z12.d,z12.d,z20.d
1693
.inst 0x04b53021 //eor z1.d,z1.d,z21.d
1694
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
1695
.inst 0x04b73129 //eor z9.d,z9.d,z23.d
1696
.inst 0x04b831ad //eor z13.d,z13.d,z24.d
1697
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
1698
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
1699
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
1700
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
1701
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
1702
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
1703
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
1704
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
1705
.inst 0x04215101 //addvl x1,x1,8
1706
.if mixin == 1
1707
stp x7,x9,[x0],#16
1708
.endif
1709
.inst 0x04b13042 //eor z2.d,z2.d,z17.d
1710
.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
1711
.if mixin == 1
1712
stp x11,x13,[x0],#16
1713
.endif
1714
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
1715
.inst 0x04b431ce //eor z14.d,z14.d,z20.d
1716
.if mixin == 1
1717
stp x15,x17,[x0],#16
1718
.endif
1719
.inst 0x04b53063 //eor z3.d,z3.d,z21.d
1720
.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
1721
.if mixin == 1
1722
stp x19,x21,[x0],#16
1723
.endif
1724
.inst 0x04b7316b //eor z11.d,z11.d,z23.d
1725
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
1726
.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
1727
.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
1728
.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
1729
.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
1730
.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
1731
.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
1732
.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
1733
.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
1734
.inst 0x04205100 //addvl x0,x0,8
1735
.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
1736
.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
1737
.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
1738
.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
1739
.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
1740
.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
1741
.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
1742
.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
1743
.inst 0x04205100 //addvl x0,x0,8
1744
210:
1745
.inst 0x04b0e3fd //incw x29, ALL, MUL #1
1746
110:
1747
b 2f
1748
1:
1749
.align 5
1750
100:
1751
subs x7,x2,x5,lsl #6
1752
b.lt 110f
1753
mov x2,x7
1754
b.eq 101f
1755
cmp x2,64
1756
b.lt 101f
1757
mixin=1
1758
lsr x8,x23,#32
1759
.inst 0x05a03ae0 //dup z0.s,w23
1760
.inst 0x05a03af9 //dup z25.s,w23
1761
.if mixin == 1
1762
mov w7,w23
1763
.endif
1764
.inst 0x05a03904 //dup z4.s,w8
1765
.inst 0x05a0391a //dup z26.s,w8
1766
lsr x10,x24,#32
1767
.inst 0x05a03b08 //dup z8.s,w24
1768
.inst 0x05a03b1b //dup z27.s,w24
1769
.if mixin == 1
1770
mov w9,w24
1771
.endif
1772
.inst 0x05a0394c //dup z12.s,w10
1773
.inst 0x05a0395c //dup z28.s,w10
1774
lsr x12,x25,#32
1775
.inst 0x05a03b21 //dup z1.s,w25
1776
.inst 0x05a03b3d //dup z29.s,w25
1777
.if mixin == 1
1778
mov w11,w25
1779
.endif
1780
.inst 0x05a03985 //dup z5.s,w12
1781
.inst 0x05a0399e //dup z30.s,w12
1782
lsr x14,x26,#32
1783
.inst 0x05a03b49 //dup z9.s,w26
1784
.inst 0x05a03b55 //dup z21.s,w26
1785
.if mixin == 1
1786
mov w13,w26
1787
.endif
1788
.inst 0x05a039cd //dup z13.s,w14
1789
.inst 0x05a039d6 //dup z22.s,w14
1790
lsr x16,x27,#32
1791
.inst 0x05a03b62 //dup z2.s,w27
1792
.inst 0x05a03b77 //dup z23.s,w27
1793
.if mixin == 1
1794
mov w15,w27
1795
.endif
1796
.inst 0x05a03a06 //dup z6.s,w16
1797
.inst 0x05a03a18 //dup z24.s,w16
1798
lsr x18,x28,#32
1799
.inst 0x05a03b8a //dup z10.s,w28
1800
.if mixin == 1
1801
mov w17,w28
1802
.endif
1803
.inst 0x05a03a4e //dup z14.s,w18
1804
lsr x22,x30,#32
1805
.inst 0x05a03bcb //dup z11.s,w30
1806
.if mixin == 1
1807
mov w21,w30
1808
.endif
1809
.inst 0x05a03acf //dup z15.s,w22
1810
.if mixin == 1
1811
add w20,w29,#1
1812
mov w19,w29
1813
.inst 0x04a14690 //index z16.s,w20,1
1814
.inst 0x04a14683 //index z3.s,w20,1
1815
.else
1816
.inst 0x04a147b0 //index z16.s,w29,1
1817
.inst 0x04a147a3 //index z3.s,w29,1
1818
.endif
1819
lsr x20,x29,#32
1820
.inst 0x05a03a87 //dup z7.s,w20
1821
mov x6,#10
1822
10:
1823
.align 5
1824
.inst 0x04a10000 //add z0.s,z0.s,z1.s
1825
.if mixin == 1
1826
add w7,w7,w11
1827
.endif
1828
.inst 0x04a50084 //add z4.s,z4.s,z5.s
1829
.if mixin == 1
1830
add w8,w8,w12
1831
.endif
1832
.inst 0x04a90108 //add z8.s,z8.s,z9.s
1833
.if mixin == 1
1834
add w9,w9,w13
1835
.endif
1836
.inst 0x04ad018c //add z12.s,z12.s,z13.s
1837
.if mixin == 1
1838
add w10,w10,w14
1839
.endif
1840
.inst 0x04a03063 //eor z3.d,z3.d,z0.d
1841
.if mixin == 1
1842
eor w19,w19,w7
1843
.endif
1844
.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
1845
.if mixin == 1
1846
eor w20,w20,w8
1847
.endif
1848
.inst 0x04a8316b //eor z11.d,z11.d,z8.d
1849
.if mixin == 1
1850
eor w21,w21,w9
1851
.endif
1852
.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
1853
.if mixin == 1
1854
eor w22,w22,w10
1855
.endif
1856
.inst 0x05a58063 //revh z3.s,p0/m,z3.s
1857
.if mixin == 1
1858
ror w19,w19,#16
1859
.endif
1860
.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
1861
.if mixin == 1
1862
ror w20,w20,#16
1863
.endif
1864
.inst 0x05a5816b //revh z11.s,p0/m,z11.s
1865
.if mixin == 1
1866
ror w21,w21,#16
1867
.endif
1868
.inst 0x05a581ef //revh z15.s,p0/m,z15.s
1869
.if mixin == 1
1870
ror w22,w22,#16
1871
.endif
1872
.inst 0x04a30042 //add z2.s,z2.s,z3.s
1873
.if mixin == 1
1874
add w15,w15,w19
1875
.endif
1876
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
1877
.if mixin == 1
1878
add w16,w16,w20
1879
.endif
1880
.inst 0x04ab014a //add z10.s,z10.s,z11.s
1881
.if mixin == 1
1882
add w17,w17,w21
1883
.endif
1884
.inst 0x04af01ce //add z14.s,z14.s,z15.s
1885
.if mixin == 1
1886
add w18,w18,w22
1887
.endif
1888
.inst 0x04a23021 //eor z1.d,z1.d,z2.d
1889
.if mixin == 1
1890
eor w11,w11,w15
1891
.endif
1892
.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
1893
.if mixin == 1
1894
eor w12,w12,w16
1895
.endif
1896
.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
1897
.if mixin == 1
1898
eor w13,w13,w17
1899
.endif
1900
.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
1901
.if mixin == 1
1902
eor w14,w14,w18
1903
.endif
1904
.inst 0x046c9c31 //lsl z17.s,z1.s,12
1905
.inst 0x046c9cb2 //lsl z18.s,z5.s,12
1906
.inst 0x046c9d33 //lsl z19.s,z9.s,12
1907
.inst 0x046c9db4 //lsl z20.s,z13.s,12
1908
.inst 0x046c9421 //lsr z1.s,z1.s,20
1909
.if mixin == 1
1910
ror w11,w11,20
1911
.endif
1912
.inst 0x046c94a5 //lsr z5.s,z5.s,20
1913
.if mixin == 1
1914
ror w12,w12,20
1915
.endif
1916
.inst 0x046c9529 //lsr z9.s,z9.s,20
1917
.if mixin == 1
1918
ror w13,w13,20
1919
.endif
1920
.inst 0x046c95ad //lsr z13.s,z13.s,20
1921
.if mixin == 1
1922
ror w14,w14,20
1923
.endif
1924
.inst 0x04713021 //orr z1.d,z1.d,z17.d
1925
.inst 0x047230a5 //orr z5.d,z5.d,z18.d
1926
.inst 0x04733129 //orr z9.d,z9.d,z19.d
1927
.inst 0x047431ad //orr z13.d,z13.d,z20.d
1928
.inst 0x04a10000 //add z0.s,z0.s,z1.s
1929
.if mixin == 1
1930
add w7,w7,w11
1931
.endif
1932
.inst 0x04a50084 //add z4.s,z4.s,z5.s
1933
.if mixin == 1
1934
add w8,w8,w12
1935
.endif
1936
.inst 0x04a90108 //add z8.s,z8.s,z9.s
1937
.if mixin == 1
1938
add w9,w9,w13
1939
.endif
1940
.inst 0x04ad018c //add z12.s,z12.s,z13.s
1941
.if mixin == 1
1942
add w10,w10,w14
1943
.endif
1944
.inst 0x04a03063 //eor z3.d,z3.d,z0.d
1945
.if mixin == 1
1946
eor w19,w19,w7
1947
.endif
1948
.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
1949
.if mixin == 1
1950
eor w20,w20,w8
1951
.endif
1952
.inst 0x04a8316b //eor z11.d,z11.d,z8.d
1953
.if mixin == 1
1954
eor w21,w21,w9
1955
.endif
1956
.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
1957
.if mixin == 1
1958
eor w22,w22,w10
1959
.endif
1960
.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
1961
.if mixin == 1
1962
ror w19,w19,#24
1963
.endif
1964
.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
1965
.if mixin == 1
1966
ror w20,w20,#24
1967
.endif
1968
.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
1969
.if mixin == 1
1970
ror w21,w21,#24
1971
.endif
1972
.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
1973
.if mixin == 1
1974
ror w22,w22,#24
1975
.endif
1976
.inst 0x04a30042 //add z2.s,z2.s,z3.s
1977
.if mixin == 1
1978
add w15,w15,w19
1979
.endif
1980
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
1981
.if mixin == 1
1982
add w16,w16,w20
1983
.endif
1984
.inst 0x04ab014a //add z10.s,z10.s,z11.s
1985
.if mixin == 1
1986
add w17,w17,w21
1987
.endif
1988
.inst 0x04af01ce //add z14.s,z14.s,z15.s
1989
.if mixin == 1
1990
add w18,w18,w22
1991
.endif
1992
.inst 0x04a23021 //eor z1.d,z1.d,z2.d
1993
.if mixin == 1
1994
eor w11,w11,w15
1995
.endif
1996
.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
1997
.if mixin == 1
1998
eor w12,w12,w16
1999
.endif
2000
.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
2001
.if mixin == 1
2002
eor w13,w13,w17
2003
.endif
2004
.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
2005
.if mixin == 1
2006
eor w14,w14,w18
2007
.endif
2008
.inst 0x04679c31 //lsl z17.s,z1.s,7
2009
.inst 0x04679cb2 //lsl z18.s,z5.s,7
2010
.inst 0x04679d33 //lsl z19.s,z9.s,7
2011
.inst 0x04679db4 //lsl z20.s,z13.s,7
2012
.inst 0x04679421 //lsr z1.s,z1.s,25
2013
.if mixin == 1
2014
ror w11,w11,25
2015
.endif
2016
.inst 0x046794a5 //lsr z5.s,z5.s,25
2017
.if mixin == 1
2018
ror w12,w12,25
2019
.endif
2020
.inst 0x04679529 //lsr z9.s,z9.s,25
2021
.if mixin == 1
2022
ror w13,w13,25
2023
.endif
2024
.inst 0x046795ad //lsr z13.s,z13.s,25
2025
.if mixin == 1
2026
ror w14,w14,25
2027
.endif
2028
.inst 0x04713021 //orr z1.d,z1.d,z17.d
2029
.inst 0x047230a5 //orr z5.d,z5.d,z18.d
2030
.inst 0x04733129 //orr z9.d,z9.d,z19.d
2031
.inst 0x047431ad //orr z13.d,z13.d,z20.d
2032
.inst 0x04a50000 //add z0.s,z0.s,z5.s
2033
.if mixin == 1
2034
add w7,w7,w12
2035
.endif
2036
.inst 0x04a90084 //add z4.s,z4.s,z9.s
2037
.if mixin == 1
2038
add w8,w8,w13
2039
.endif
2040
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
2041
.if mixin == 1
2042
add w9,w9,w14
2043
.endif
2044
.inst 0x04a1018c //add z12.s,z12.s,z1.s
2045
.if mixin == 1
2046
add w10,w10,w11
2047
.endif
2048
.inst 0x04a031ef //eor z15.d,z15.d,z0.d
2049
.if mixin == 1
2050
eor w22,w22,w7
2051
.endif
2052
.inst 0x04a43063 //eor z3.d,z3.d,z4.d
2053
.if mixin == 1
2054
eor w19,w19,w8
2055
.endif
2056
.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
2057
.if mixin == 1
2058
eor w20,w20,w9
2059
.endif
2060
.inst 0x04ac316b //eor z11.d,z11.d,z12.d
2061
.if mixin == 1
2062
eor w21,w21,w10
2063
.endif
2064
.inst 0x05a581ef //revh z15.s,p0/m,z15.s
2065
.if mixin == 1
2066
ror w22,w22,#16
2067
.endif
2068
.inst 0x05a58063 //revh z3.s,p0/m,z3.s
2069
.if mixin == 1
2070
ror w19,w19,#16
2071
.endif
2072
.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
2073
.if mixin == 1
2074
ror w20,w20,#16
2075
.endif
2076
.inst 0x05a5816b //revh z11.s,p0/m,z11.s
2077
.if mixin == 1
2078
ror w21,w21,#16
2079
.endif
2080
.inst 0x04af014a //add z10.s,z10.s,z15.s
2081
.if mixin == 1
2082
add w17,w17,w22
2083
.endif
2084
.inst 0x04a301ce //add z14.s,z14.s,z3.s
2085
.if mixin == 1
2086
add w18,w18,w19
2087
.endif
2088
.inst 0x04a70042 //add z2.s,z2.s,z7.s
2089
.if mixin == 1
2090
add w15,w15,w20
2091
.endif
2092
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
2093
.if mixin == 1
2094
add w16,w16,w21
2095
.endif
2096
.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
2097
.if mixin == 1
2098
eor w12,w12,w17
2099
.endif
2100
.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
2101
.if mixin == 1
2102
eor w13,w13,w18
2103
.endif
2104
.inst 0x04a231ad //eor z13.d,z13.d,z2.d
2105
.if mixin == 1
2106
eor w14,w14,w15
2107
.endif
2108
.inst 0x04a63021 //eor z1.d,z1.d,z6.d
2109
.if mixin == 1
2110
eor w11,w11,w16
2111
.endif
2112
.inst 0x046c9cb1 //lsl z17.s,z5.s,12
2113
.inst 0x046c9d32 //lsl z18.s,z9.s,12
2114
.inst 0x046c9db3 //lsl z19.s,z13.s,12
2115
.inst 0x046c9c34 //lsl z20.s,z1.s,12
2116
.inst 0x046c94a5 //lsr z5.s,z5.s,20
2117
.if mixin == 1
2118
ror w12,w12,20
2119
.endif
2120
.inst 0x046c9529 //lsr z9.s,z9.s,20
2121
.if mixin == 1
2122
ror w13,w13,20
2123
.endif
2124
.inst 0x046c95ad //lsr z13.s,z13.s,20
2125
.if mixin == 1
2126
ror w14,w14,20
2127
.endif
2128
.inst 0x046c9421 //lsr z1.s,z1.s,20
2129
.if mixin == 1
2130
ror w11,w11,20
2131
.endif
2132
.inst 0x047130a5 //orr z5.d,z5.d,z17.d
2133
.inst 0x04723129 //orr z9.d,z9.d,z18.d
2134
.inst 0x047331ad //orr z13.d,z13.d,z19.d
2135
.inst 0x04743021 //orr z1.d,z1.d,z20.d
2136
.inst 0x04a50000 //add z0.s,z0.s,z5.s
2137
.if mixin == 1
2138
add w7,w7,w12
2139
.endif
2140
.inst 0x04a90084 //add z4.s,z4.s,z9.s
2141
.if mixin == 1
2142
add w8,w8,w13
2143
.endif
2144
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
2145
.if mixin == 1
2146
add w9,w9,w14
2147
.endif
2148
.inst 0x04a1018c //add z12.s,z12.s,z1.s
2149
.if mixin == 1
2150
add w10,w10,w11
2151
.endif
2152
.inst 0x04a031ef //eor z15.d,z15.d,z0.d
2153
.if mixin == 1
2154
eor w22,w22,w7
2155
.endif
2156
.inst 0x04a43063 //eor z3.d,z3.d,z4.d
2157
.if mixin == 1
2158
eor w19,w19,w8
2159
.endif
2160
.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
2161
.if mixin == 1
2162
eor w20,w20,w9
2163
.endif
2164
.inst 0x04ac316b //eor z11.d,z11.d,z12.d
2165
.if mixin == 1
2166
eor w21,w21,w10
2167
.endif
2168
.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
2169
.if mixin == 1
2170
ror w22,w22,#24
2171
.endif
2172
.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
2173
.if mixin == 1
2174
ror w19,w19,#24
2175
.endif
2176
.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
2177
.if mixin == 1
2178
ror w20,w20,#24
2179
.endif
2180
.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
2181
.if mixin == 1
2182
ror w21,w21,#24
2183
.endif
2184
.inst 0x04af014a //add z10.s,z10.s,z15.s
2185
.if mixin == 1
2186
add w17,w17,w22
2187
.endif
2188
.inst 0x04a301ce //add z14.s,z14.s,z3.s
2189
.if mixin == 1
2190
add w18,w18,w19
2191
.endif
2192
.inst 0x04a70042 //add z2.s,z2.s,z7.s
2193
.if mixin == 1
2194
add w15,w15,w20
2195
.endif
2196
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
2197
.if mixin == 1
2198
add w16,w16,w21
2199
.endif
2200
.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
2201
.if mixin == 1
2202
eor w12,w12,w17
2203
.endif
2204
.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
2205
.if mixin == 1
2206
eor w13,w13,w18
2207
.endif
2208
.inst 0x04a231ad //eor z13.d,z13.d,z2.d
2209
.if mixin == 1
2210
eor w14,w14,w15
2211
.endif
2212
.inst 0x04a63021 //eor z1.d,z1.d,z6.d
2213
.if mixin == 1
2214
eor w11,w11,w16
2215
.endif
2216
.inst 0x04679cb1 //lsl z17.s,z5.s,7
2217
.inst 0x04679d32 //lsl z18.s,z9.s,7
2218
.inst 0x04679db3 //lsl z19.s,z13.s,7
2219
.inst 0x04679c34 //lsl z20.s,z1.s,7
2220
.inst 0x046794a5 //lsr z5.s,z5.s,25
2221
.if mixin == 1
2222
ror w12,w12,25
2223
.endif
2224
.inst 0x04679529 //lsr z9.s,z9.s,25
2225
.if mixin == 1
2226
ror w13,w13,25
2227
.endif
2228
.inst 0x046795ad //lsr z13.s,z13.s,25
2229
.if mixin == 1
2230
ror w14,w14,25
2231
.endif
2232
.inst 0x04679421 //lsr z1.s,z1.s,25
2233
.if mixin == 1
2234
ror w11,w11,25
2235
.endif
2236
.inst 0x047130a5 //orr z5.d,z5.d,z17.d
2237
.inst 0x04723129 //orr z9.d,z9.d,z18.d
2238
.inst 0x047331ad //orr z13.d,z13.d,z19.d
2239
.inst 0x04743021 //orr z1.d,z1.d,z20.d
2240
sub x6,x6,1
2241
cbnz x6,10b
2242
lsr x6,x28,#32
2243
.inst 0x05a03b91 //dup z17.s,w28
2244
.inst 0x05a038d2 //dup z18.s,w6
2245
lsr x6,x29,#32
2246
.inst 0x05a038d3 //dup z19.s,w6
2247
lsr x6,x30,#32
2248
.if mixin == 1
2249
add w7,w7,w23
2250
.endif
2251
.inst 0x04b90000 //add z0.s,z0.s,z25.s
2252
.if mixin == 1
2253
add x8,x8,x23,lsr #32
2254
.endif
2255
.inst 0x04ba0084 //add z4.s,z4.s,z26.s
2256
.if mixin == 1
2257
add x7,x7,x8,lsl #32 // pack
2258
.endif
2259
.if mixin == 1
2260
add w9,w9,w24
2261
.endif
2262
.inst 0x04bb0108 //add z8.s,z8.s,z27.s
2263
.if mixin == 1
2264
add x10,x10,x24,lsr #32
2265
.endif
2266
.inst 0x04bc018c //add z12.s,z12.s,z28.s
2267
.if mixin == 1
2268
add x9,x9,x10,lsl #32 // pack
2269
.endif
2270
.if mixin == 1
2271
ldp x8,x10,[x1],#16
2272
.endif
2273
.if mixin == 1
2274
add w11,w11,w25
2275
.endif
2276
.inst 0x04bd0021 //add z1.s,z1.s,z29.s
2277
.if mixin == 1
2278
add x12,x12,x25,lsr #32
2279
.endif
2280
.inst 0x04be00a5 //add z5.s,z5.s,z30.s
2281
.if mixin == 1
2282
add x11,x11,x12,lsl #32 // pack
2283
.endif
2284
.if mixin == 1
2285
add w13,w13,w26
2286
.endif
2287
.inst 0x04b50129 //add z9.s,z9.s,z21.s
2288
.if mixin == 1
2289
add x14,x14,x26,lsr #32
2290
.endif
2291
.inst 0x04b601ad //add z13.s,z13.s,z22.s
2292
.if mixin == 1
2293
add x13,x13,x14,lsl #32 // pack
2294
.endif
2295
.if mixin == 1
2296
ldp x12,x14,[x1],#16
2297
.endif
2298
.if mixin == 1
2299
add w15,w15,w27
2300
.endif
2301
.inst 0x04b70042 //add z2.s,z2.s,z23.s
2302
.if mixin == 1
2303
add x16,x16,x27,lsr #32
2304
.endif
2305
.inst 0x04b800c6 //add z6.s,z6.s,z24.s
2306
.if mixin == 1
2307
add x15,x15,x16,lsl #32 // pack
2308
.endif
2309
.if mixin == 1
2310
add w17,w17,w28
2311
.endif
2312
.inst 0x04b1014a //add z10.s,z10.s,z17.s
2313
.if mixin == 1
2314
add x18,x18,x28,lsr #32
2315
.endif
2316
.inst 0x04b201ce //add z14.s,z14.s,z18.s
2317
.if mixin == 1
2318
add x17,x17,x18,lsl #32 // pack
2319
.endif
2320
.if mixin == 1
2321
ldp x16,x18,[x1],#16
2322
.endif
2323
.inst 0x05a03bd4 //dup z20.s,w30
2324
.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE
2325
.if mixin == 1
2326
add w19,w19,w29
2327
.endif
2328
.inst 0x04b00063 //add z3.s,z3.s,z16.s
2329
.if mixin == 1
2330
add x20,x20,x29,lsr #32
2331
.endif
2332
.inst 0x04b300e7 //add z7.s,z7.s,z19.s
2333
.if mixin == 1
2334
add x19,x19,x20,lsl #32 // pack
2335
.endif
2336
.if mixin == 1
2337
add w21,w21,w30
2338
.endif
2339
.inst 0x04b4016b //add z11.s,z11.s,z20.s
2340
.if mixin == 1
2341
add x22,x22,x30,lsr #32
2342
.endif
2343
.inst 0x04b901ef //add z15.s,z15.s,z25.s
2344
.if mixin == 1
2345
add x21,x21,x22,lsl #32 // pack
2346
.endif
2347
.if mixin == 1
2348
ldp x20,x22,[x1],#16
2349
.endif
2350
#ifdef __AARCH64EB__
2351
rev x7,x7
2352
.inst 0x05a48000 //revb z0.s,p0/m,z0.s
2353
.inst 0x05a48084 //revb z4.s,p0/m,z4.s
2354
rev x9,x9
2355
.inst 0x05a48108 //revb z8.s,p0/m,z8.s
2356
.inst 0x05a4818c //revb z12.s,p0/m,z12.s
2357
rev x11,x11
2358
.inst 0x05a48021 //revb z1.s,p0/m,z1.s
2359
.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
2360
rev x13,x13
2361
.inst 0x05a48129 //revb z9.s,p0/m,z9.s
2362
.inst 0x05a481ad //revb z13.s,p0/m,z13.s
2363
rev x15,x15
2364
.inst 0x05a48042 //revb z2.s,p0/m,z2.s
2365
.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
2366
rev x17,x17
2367
.inst 0x05a4814a //revb z10.s,p0/m,z10.s
2368
.inst 0x05a481ce //revb z14.s,p0/m,z14.s
2369
rev x19,x19
2370
.inst 0x05a48063 //revb z3.s,p0/m,z3.s
2371
.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
2372
rev x21,x21
2373
.inst 0x05a4816b //revb z11.s,p0/m,z11.s
2374
.inst 0x05a481ef //revb z15.s,p0/m,z15.s
2375
#endif
2376
.if mixin == 1
2377
add x29,x29,#1
2378
.endif
2379
// Choose the output path from x5. When x5 == 4 execution falls through to
// the code that loads and stores through the v-register (.4s) ld1/st1 forms;
// any other value branches to 200, which uses ld1w/st1w with MUL VL
// addressing and addvl pointer updates.
// NOTE(review): x5 is written by cntw at function entry; this assumes it has
// not been modified since - confirm against the full function.
cmp x5,4
2380
b.ne 200f
2381
.if mixin == 1
2382
eor x7,x7,x8
2383
.endif
2384
.if mixin == 1
2385
eor x9,x9,x10
2386
.endif
2387
.if mixin == 1
2388
eor x11,x11,x12
2389
.endif
2390
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
2391
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
2392
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
2393
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
2394
2395
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
2396
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
2397
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
2398
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
2399
2400
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
2401
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
2402
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
2403
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
2404
2405
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
2406
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
2407
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
2408
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
2409
.if mixin == 1
2410
eor x13,x13,x14
2411
.endif
2412
.if mixin == 1
2413
eor x15,x15,x16
2414
.endif
2415
.if mixin == 1
2416
eor x17,x17,x18
2417
.endif
2418
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
2419
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
2420
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
2421
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
2422
2423
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
2424
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
2425
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
2426
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
2427
2428
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
2429
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
2430
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
2431
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
2432
2433
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
2434
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
2435
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
2436
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
2437
.if mixin == 1
2438
eor x19,x19,x20
2439
.endif
2440
.if mixin == 1
2441
eor x21,x21,x22
2442
.endif
2443
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
2444
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
2445
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
2446
.inst 0x04b23021 //eor z1.d,z1.d,z18.d
2447
.inst 0x04b33042 //eor z2.d,z2.d,z19.d
2448
.inst 0x04b43063 //eor z3.d,z3.d,z20.d
2449
.inst 0x04b53084 //eor z4.d,z4.d,z21.d
2450
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
2451
.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
2452
.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
2453
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
2454
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
2455
.if mixin == 1
2456
stp x7,x9,[x0],#16
2457
.endif
2458
.inst 0x04b13108 //eor z8.d,z8.d,z17.d
2459
.inst 0x04b23129 //eor z9.d,z9.d,z18.d
2460
.if mixin == 1
2461
stp x11,x13,[x0],#16
2462
.endif
2463
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
2464
.inst 0x04b4316b //eor z11.d,z11.d,z20.d
2465
.if mixin == 1
2466
stp x15,x17,[x0],#16
2467
.endif
2468
.inst 0x04b5318c //eor z12.d,z12.d,z21.d
2469
.inst 0x04b631ad //eor z13.d,z13.d,z22.d
2470
.if mixin == 1
2471
stp x19,x21,[x0],#16
2472
.endif
2473
.inst 0x04b731ce //eor z14.d,z14.d,z23.d
2474
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
2475
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
2476
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2477
st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2478
st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2479
b 210f
2480
200:
2481
.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
2482
.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
2483
.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
2484
.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
2485
2486
.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
2487
.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
2488
.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
2489
.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
2490
2491
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
2492
.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
2493
.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
2494
.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
2495
2496
.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
2497
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
2498
.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
2499
.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
2500
.if mixin == 1
2501
eor x7,x7,x8
2502
.endif
2503
.if mixin == 1
2504
eor x9,x9,x10
2505
.endif
2506
.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
2507
.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
2508
.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
2509
.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
2510
2511
.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
2512
.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
2513
.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
2514
.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
2515
2516
.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
2517
.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
2518
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
2519
.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
2520
2521
.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
2522
.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
2523
.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
2524
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
2525
.if mixin == 1
2526
eor x11,x11,x12
2527
.endif
2528
.if mixin == 1
2529
eor x13,x13,x14
2530
.endif
2531
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
2532
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
2533
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
2534
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
2535
2536
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
2537
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
2538
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
2539
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
2540
2541
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
2542
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
2543
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
2544
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
2545
2546
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
2547
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
2548
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
2549
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
2550
.if mixin == 1
2551
eor x15,x15,x16
2552
.endif
2553
.if mixin == 1
2554
eor x17,x17,x18
2555
.endif
2556
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
2557
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
2558
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
2559
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
2560
2561
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
2562
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
2563
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
2564
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
2565
2566
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
2567
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
2568
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
2569
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
2570
2571
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
2572
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
2573
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
2574
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
2575
.if mixin == 1
2576
eor x19,x19,x20
2577
.endif
2578
.if mixin == 1
2579
eor x21,x21,x22
2580
.endif
2581
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
2582
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
2583
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
2584
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
2585
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
2586
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
2587
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
2588
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
2589
.inst 0x04215101 //addvl x1,x1,8
2590
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
2591
.inst 0x04b23084 //eor z4.d,z4.d,z18.d
2592
.inst 0x04b33108 //eor z8.d,z8.d,z19.d
2593
.inst 0x04b4318c //eor z12.d,z12.d,z20.d
2594
.inst 0x04b53021 //eor z1.d,z1.d,z21.d
2595
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
2596
.inst 0x04b73129 //eor z9.d,z9.d,z23.d
2597
.inst 0x04b831ad //eor z13.d,z13.d,z24.d
2598
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
2599
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
2600
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
2601
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
2602
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
2603
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
2604
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
2605
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
2606
.inst 0x04215101 //addvl x1,x1,8
2607
.if mixin == 1
2608
stp x7,x9,[x0],#16
2609
.endif
2610
.inst 0x04b13042 //eor z2.d,z2.d,z17.d
2611
.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
2612
.if mixin == 1
2613
stp x11,x13,[x0],#16
2614
.endif
2615
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
2616
.inst 0x04b431ce //eor z14.d,z14.d,z20.d
2617
.if mixin == 1
2618
stp x15,x17,[x0],#16
2619
.endif
2620
.inst 0x04b53063 //eor z3.d,z3.d,z21.d
2621
.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
2622
.if mixin == 1
2623
stp x19,x21,[x0],#16
2624
.endif
2625
.inst 0x04b7316b //eor z11.d,z11.d,z23.d
2626
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
2627
.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
2628
.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
2629
.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
2630
.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
2631
.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
2632
.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
2633
.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
2634
.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
2635
.inst 0x04205100 //addvl x0,x0,8
2636
.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
2637
.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
2638
.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
2639
.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
2640
.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
2641
.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
2642
.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
2643
.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
2644
.inst 0x04205100 //addvl x0,x0,8
2645
// 210: common tail reached by both output paths (the x5 == 4 path branches
// here, the 200 path falls through).
210:
2646
// Advance x29 by the number of 32-bit elements in one SVE vector. The low
// word of x29 seeds the per-lane index vectors (index z16.s/z3.s,w29,1).
// NOTE(review): x29 presumably carries the block counter - confirm.
.inst 0x04b0e3fd //incw x29, ALL, MUL #1
2647
// Subtract 64 from x2 and set flags; loop back to 100 while the result is
// still greater than zero (signed), otherwise continue at 110.
// NOTE(review): x2 looks like the remaining length in bytes - confirm.
subs x2,x2,64
2648
b.gt 100b
2649
b 110f
2650
// 101: set up the vector working state with the scalar "mixin" code disabled.
// Setting the assembler symbol mixin to 0 makes every `.if mixin == 1`
// sequence from here on assemble to nothing, so only the SVE instructions
// (and the lsr helpers that feed them) are emitted.
//
// Each of x23..x28 and x30 holds two 32-bit words. The low word is broadcast
// with dup from wN, and the high word is first extracted with lsr #32 into a
// scratch register and then broadcast. Resulting layout:
//   x23 -> z0 (lo), z4 (hi)     x24 -> z8 (lo), z12 (hi)
//   x25 -> z1 (lo), z5 (hi)     x26 -> z9 (lo), z13 (hi)
//   x27 -> z2 (lo), z6 (hi)     x28 -> z10 (lo), z14 (hi)
//   x29 -> z3 (index), z7 (hi)  x30 -> z11 (lo), z15 (hi)
// A second copy of the x23..x27 words is kept in z25-z30 and z21-z24; those
// copies are added back into z0..z9/z12/z13 after the loop at label 10.
101:
2651
mixin=0
2652
lsr x8,x23,#32
2653
.inst 0x05a03ae0 //dup z0.s,w23
2654
.inst 0x05a03af9 //dup z25.s,w23
2655
.if mixin == 1
2656
mov w7,w23
2657
.endif
2658
.inst 0x05a03904 //dup z4.s,w8
2659
.inst 0x05a0391a //dup z26.s,w8
2660
lsr x10,x24,#32
2661
.inst 0x05a03b08 //dup z8.s,w24
2662
.inst 0x05a03b1b //dup z27.s,w24
2663
.if mixin == 1
2664
mov w9,w24
2665
.endif
2666
.inst 0x05a0394c //dup z12.s,w10
2667
.inst 0x05a0395c //dup z28.s,w10
2668
lsr x12,x25,#32
2669
.inst 0x05a03b21 //dup z1.s,w25
2670
.inst 0x05a03b3d //dup z29.s,w25
2671
.if mixin == 1
2672
mov w11,w25
2673
.endif
2674
.inst 0x05a03985 //dup z5.s,w12
2675
.inst 0x05a0399e //dup z30.s,w12
2676
lsr x14,x26,#32
2677
.inst 0x05a03b49 //dup z9.s,w26
2678
.inst 0x05a03b55 //dup z21.s,w26
2679
.if mixin == 1
2680
mov w13,w26
2681
.endif
2682
.inst 0x05a039cd //dup z13.s,w14
2683
.inst 0x05a039d6 //dup z22.s,w14
2684
lsr x16,x27,#32
2685
.inst 0x05a03b62 //dup z2.s,w27
2686
.inst 0x05a03b77 //dup z23.s,w27
2687
.if mixin == 1
2688
mov w15,w27
2689
.endif
2690
.inst 0x05a03a06 //dup z6.s,w16
2691
.inst 0x05a03a18 //dup z24.s,w16
2692
// x28 and x30 are broadcast only into the working vectors here; no second
// copy is made at this point. The values added back after the loop are
// re-created there with fresh dup instructions (z17, z18, z20, z25).
lsr x18,x28,#32
2693
.inst 0x05a03b8a //dup z10.s,w28
2694
.if mixin == 1
2695
mov w17,w28
2696
.endif
2697
.inst 0x05a03a4e //dup z14.s,w18
2698
lsr x22,x30,#32
2699
.inst 0x05a03bcb //dup z11.s,w30
2700
.if mixin == 1
2701
mov w21,w30
2702
.endif
2703
.inst 0x05a03acf //dup z15.s,w22
2704
// Per-lane increasing values: with mixin == 0 the .else arm is assembled, so
// lane i of z16 and z3 holds w29 + i. z16 keeps the untouched copy that is
// added to z3 after the loop. In the mixin == 1 arm the scalar registers take
// w29 itself and the vector lanes start at w29 + 1.
// NOTE(review): the low word of x29 is presumably the block counter - confirm.
.if mixin == 1
2705
add w20,w29,#1
2706
mov w19,w29
2707
.inst 0x04a14690 //index z16.s,w20,1
2708
.inst 0x04a14683 //index z3.s,w20,1
2709
.else
2710
.inst 0x04a147b0 //index z16.s,w29,1
2711
.inst 0x04a147a3 //index z3.s,w29,1
2712
.endif
2713
lsr x20,x29,#32
2714
.inst 0x05a03a87 //dup z7.s,w20
2715
// x6 = iteration count for the loop at label 10, which ends with
// `sub x6,x6,1` / `cbnz x6,10b`.
mov x6,#10
2716
10:
2717
.align 5
2718
.inst 0x04a10000 //add z0.s,z0.s,z1.s
2719
.if mixin == 1
2720
add w7,w7,w11
2721
.endif
2722
.inst 0x04a50084 //add z4.s,z4.s,z5.s
2723
.if mixin == 1
2724
add w8,w8,w12
2725
.endif
2726
.inst 0x04a90108 //add z8.s,z8.s,z9.s
2727
.if mixin == 1
2728
add w9,w9,w13
2729
.endif
2730
.inst 0x04ad018c //add z12.s,z12.s,z13.s
2731
.if mixin == 1
2732
add w10,w10,w14
2733
.endif
2734
.inst 0x04a03063 //eor z3.d,z3.d,z0.d
2735
.if mixin == 1
2736
eor w19,w19,w7
2737
.endif
2738
.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
2739
.if mixin == 1
2740
eor w20,w20,w8
2741
.endif
2742
.inst 0x04a8316b //eor z11.d,z11.d,z8.d
2743
.if mixin == 1
2744
eor w21,w21,w9
2745
.endif
2746
.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
2747
.if mixin == 1
2748
eor w22,w22,w10
2749
.endif
2750
.inst 0x05a58063 //revh z3.s,p0/m,z3.s
2751
.if mixin == 1
2752
ror w19,w19,#16
2753
.endif
2754
.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
2755
.if mixin == 1
2756
ror w20,w20,#16
2757
.endif
2758
.inst 0x05a5816b //revh z11.s,p0/m,z11.s
2759
.if mixin == 1
2760
ror w21,w21,#16
2761
.endif
2762
.inst 0x05a581ef //revh z15.s,p0/m,z15.s
2763
.if mixin == 1
2764
ror w22,w22,#16
2765
.endif
2766
.inst 0x04a30042 //add z2.s,z2.s,z3.s
2767
.if mixin == 1
2768
add w15,w15,w19
2769
.endif
2770
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
2771
.if mixin == 1
2772
add w16,w16,w20
2773
.endif
2774
.inst 0x04ab014a //add z10.s,z10.s,z11.s
2775
.if mixin == 1
2776
add w17,w17,w21
2777
.endif
2778
.inst 0x04af01ce //add z14.s,z14.s,z15.s
2779
.if mixin == 1
2780
add w18,w18,w22
2781
.endif
2782
.inst 0x04a23021 //eor z1.d,z1.d,z2.d
2783
.if mixin == 1
2784
eor w11,w11,w15
2785
.endif
2786
.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
2787
.if mixin == 1
2788
eor w12,w12,w16
2789
.endif
2790
.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
2791
.if mixin == 1
2792
eor w13,w13,w17
2793
.endif
2794
.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
2795
.if mixin == 1
2796
eor w14,w14,w18
2797
.endif
2798
.inst 0x046c9c31 //lsl z17.s,z1.s,12
2799
.inst 0x046c9cb2 //lsl z18.s,z5.s,12
2800
.inst 0x046c9d33 //lsl z19.s,z9.s,12
2801
.inst 0x046c9db4 //lsl z20.s,z13.s,12
2802
.inst 0x046c9421 //lsr z1.s,z1.s,20
2803
.if mixin == 1
2804
ror w11,w11,20
2805
.endif
2806
.inst 0x046c94a5 //lsr z5.s,z5.s,20
2807
.if mixin == 1
2808
ror w12,w12,20
2809
.endif
2810
.inst 0x046c9529 //lsr z9.s,z9.s,20
2811
.if mixin == 1
2812
ror w13,w13,20
2813
.endif
2814
.inst 0x046c95ad //lsr z13.s,z13.s,20
2815
.if mixin == 1
2816
ror w14,w14,20
2817
.endif
2818
.inst 0x04713021 //orr z1.d,z1.d,z17.d
2819
.inst 0x047230a5 //orr z5.d,z5.d,z18.d
2820
.inst 0x04733129 //orr z9.d,z9.d,z19.d
2821
.inst 0x047431ad //orr z13.d,z13.d,z20.d
2822
.inst 0x04a10000 //add z0.s,z0.s,z1.s
2823
.if mixin == 1
2824
add w7,w7,w11
2825
.endif
2826
.inst 0x04a50084 //add z4.s,z4.s,z5.s
2827
.if mixin == 1
2828
add w8,w8,w12
2829
.endif
2830
.inst 0x04a90108 //add z8.s,z8.s,z9.s
2831
.if mixin == 1
2832
add w9,w9,w13
2833
.endif
2834
.inst 0x04ad018c //add z12.s,z12.s,z13.s
2835
.if mixin == 1
2836
add w10,w10,w14
2837
.endif
2838
.inst 0x04a03063 //eor z3.d,z3.d,z0.d
2839
.if mixin == 1
2840
eor w19,w19,w7
2841
.endif
2842
.inst 0x04a430e7 //eor z7.d,z7.d,z4.d
2843
.if mixin == 1
2844
eor w20,w20,w8
2845
.endif
2846
.inst 0x04a8316b //eor z11.d,z11.d,z8.d
2847
.if mixin == 1
2848
eor w21,w21,w9
2849
.endif
2850
.inst 0x04ac31ef //eor z15.d,z15.d,z12.d
2851
.if mixin == 1
2852
eor w22,w22,w10
2853
.endif
2854
.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
2855
.if mixin == 1
2856
ror w19,w19,#24
2857
.endif
2858
.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
2859
.if mixin == 1
2860
ror w20,w20,#24
2861
.endif
2862
.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
2863
.if mixin == 1
2864
ror w21,w21,#24
2865
.endif
2866
.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
2867
.if mixin == 1
2868
ror w22,w22,#24
2869
.endif
2870
.inst 0x04a30042 //add z2.s,z2.s,z3.s
2871
.if mixin == 1
2872
add w15,w15,w19
2873
.endif
2874
.inst 0x04a700c6 //add z6.s,z6.s,z7.s
2875
.if mixin == 1
2876
add w16,w16,w20
2877
.endif
2878
.inst 0x04ab014a //add z10.s,z10.s,z11.s
2879
.if mixin == 1
2880
add w17,w17,w21
2881
.endif
2882
.inst 0x04af01ce //add z14.s,z14.s,z15.s
2883
.if mixin == 1
2884
add w18,w18,w22
2885
.endif
2886
.inst 0x04a23021 //eor z1.d,z1.d,z2.d
2887
.if mixin == 1
2888
eor w11,w11,w15
2889
.endif
2890
.inst 0x04a630a5 //eor z5.d,z5.d,z6.d
2891
.if mixin == 1
2892
eor w12,w12,w16
2893
.endif
2894
.inst 0x04aa3129 //eor z9.d,z9.d,z10.d
2895
.if mixin == 1
2896
eor w13,w13,w17
2897
.endif
2898
.inst 0x04ae31ad //eor z13.d,z13.d,z14.d
2899
.if mixin == 1
2900
eor w14,w14,w18
2901
.endif
2902
.inst 0x04679c31 //lsl z17.s,z1.s,7
2903
.inst 0x04679cb2 //lsl z18.s,z5.s,7
2904
.inst 0x04679d33 //lsl z19.s,z9.s,7
2905
.inst 0x04679db4 //lsl z20.s,z13.s,7
2906
.inst 0x04679421 //lsr z1.s,z1.s,25
2907
.if mixin == 1
2908
ror w11,w11,25
2909
.endif
2910
.inst 0x046794a5 //lsr z5.s,z5.s,25
2911
.if mixin == 1
2912
ror w12,w12,25
2913
.endif
2914
.inst 0x04679529 //lsr z9.s,z9.s,25
2915
.if mixin == 1
2916
ror w13,w13,25
2917
.endif
2918
.inst 0x046795ad //lsr z13.s,z13.s,25
2919
.if mixin == 1
2920
ror w14,w14,25
2921
.endif
2922
.inst 0x04713021 //orr z1.d,z1.d,z17.d
2923
.inst 0x047230a5 //orr z5.d,z5.d,z18.d
2924
.inst 0x04733129 //orr z9.d,z9.d,z19.d
2925
.inst 0x047431ad //orr z13.d,z13.d,z20.d
2926
.inst 0x04a50000 //add z0.s,z0.s,z5.s
2927
.if mixin == 1
2928
add w7,w7,w12
2929
.endif
2930
.inst 0x04a90084 //add z4.s,z4.s,z9.s
2931
.if mixin == 1
2932
add w8,w8,w13
2933
.endif
2934
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
2935
.if mixin == 1
2936
add w9,w9,w14
2937
.endif
2938
.inst 0x04a1018c //add z12.s,z12.s,z1.s
2939
.if mixin == 1
2940
add w10,w10,w11
2941
.endif
2942
.inst 0x04a031ef //eor z15.d,z15.d,z0.d
2943
.if mixin == 1
2944
eor w22,w22,w7
2945
.endif
2946
.inst 0x04a43063 //eor z3.d,z3.d,z4.d
2947
.if mixin == 1
2948
eor w19,w19,w8
2949
.endif
2950
.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
2951
.if mixin == 1
2952
eor w20,w20,w9
2953
.endif
2954
.inst 0x04ac316b //eor z11.d,z11.d,z12.d
2955
.if mixin == 1
2956
eor w21,w21,w10
2957
.endif
2958
.inst 0x05a581ef //revh z15.s,p0/m,z15.s
2959
.if mixin == 1
2960
ror w22,w22,#16
2961
.endif
2962
.inst 0x05a58063 //revh z3.s,p0/m,z3.s
2963
.if mixin == 1
2964
ror w19,w19,#16
2965
.endif
2966
.inst 0x05a580e7 //revh z7.s,p0/m,z7.s
2967
.if mixin == 1
2968
ror w20,w20,#16
2969
.endif
2970
.inst 0x05a5816b //revh z11.s,p0/m,z11.s
2971
.if mixin == 1
2972
ror w21,w21,#16
2973
.endif
2974
.inst 0x04af014a //add z10.s,z10.s,z15.s
2975
.if mixin == 1
2976
add w17,w17,w22
2977
.endif
2978
.inst 0x04a301ce //add z14.s,z14.s,z3.s
2979
.if mixin == 1
2980
add w18,w18,w19
2981
.endif
2982
.inst 0x04a70042 //add z2.s,z2.s,z7.s
2983
.if mixin == 1
2984
add w15,w15,w20
2985
.endif
2986
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
2987
.if mixin == 1
2988
add w16,w16,w21
2989
.endif
2990
.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
2991
.if mixin == 1
2992
eor w12,w12,w17
2993
.endif
2994
.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
2995
.if mixin == 1
2996
eor w13,w13,w18
2997
.endif
2998
.inst 0x04a231ad //eor z13.d,z13.d,z2.d
2999
.if mixin == 1
3000
eor w14,w14,w15
3001
.endif
3002
.inst 0x04a63021 //eor z1.d,z1.d,z6.d
3003
.if mixin == 1
3004
eor w11,w11,w16
3005
.endif
3006
.inst 0x046c9cb1 //lsl z17.s,z5.s,12
3007
.inst 0x046c9d32 //lsl z18.s,z9.s,12
3008
.inst 0x046c9db3 //lsl z19.s,z13.s,12
3009
.inst 0x046c9c34 //lsl z20.s,z1.s,12
3010
.inst 0x046c94a5 //lsr z5.s,z5.s,20
3011
.if mixin == 1
3012
ror w12,w12,20
3013
.endif
3014
.inst 0x046c9529 //lsr z9.s,z9.s,20
3015
.if mixin == 1
3016
ror w13,w13,20
3017
.endif
3018
.inst 0x046c95ad //lsr z13.s,z13.s,20
3019
.if mixin == 1
3020
ror w14,w14,20
3021
.endif
3022
.inst 0x046c9421 //lsr z1.s,z1.s,20
3023
.if mixin == 1
3024
ror w11,w11,20
3025
.endif
3026
.inst 0x047130a5 //orr z5.d,z5.d,z17.d
3027
.inst 0x04723129 //orr z9.d,z9.d,z18.d
3028
.inst 0x047331ad //orr z13.d,z13.d,z19.d
3029
.inst 0x04743021 //orr z1.d,z1.d,z20.d
3030
.inst 0x04a50000 //add z0.s,z0.s,z5.s
3031
.if mixin == 1
3032
add w7,w7,w12
3033
.endif
3034
.inst 0x04a90084 //add z4.s,z4.s,z9.s
3035
.if mixin == 1
3036
add w8,w8,w13
3037
.endif
3038
.inst 0x04ad0108 //add z8.s,z8.s,z13.s
3039
.if mixin == 1
3040
add w9,w9,w14
3041
.endif
3042
.inst 0x04a1018c //add z12.s,z12.s,z1.s
3043
.if mixin == 1
3044
add w10,w10,w11
3045
.endif
3046
.inst 0x04a031ef //eor z15.d,z15.d,z0.d
3047
.if mixin == 1
3048
eor w22,w22,w7
3049
.endif
3050
.inst 0x04a43063 //eor z3.d,z3.d,z4.d
3051
.if mixin == 1
3052
eor w19,w19,w8
3053
.endif
3054
.inst 0x04a830e7 //eor z7.d,z7.d,z8.d
3055
.if mixin == 1
3056
eor w20,w20,w9
3057
.endif
3058
.inst 0x04ac316b //eor z11.d,z11.d,z12.d
3059
.if mixin == 1
3060
eor w21,w21,w10
3061
.endif
3062
.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b
3063
.if mixin == 1
3064
ror w22,w22,#24
3065
.endif
3066
.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b
3067
.if mixin == 1
3068
ror w19,w19,#24
3069
.endif
3070
.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b
3071
.if mixin == 1
3072
ror w20,w20,#24
3073
.endif
3074
.inst 0x053f316b //tbl z11.b,{z11.b},z31.b
3075
.if mixin == 1
3076
ror w21,w21,#24
3077
.endif
3078
.inst 0x04af014a //add z10.s,z10.s,z15.s
3079
.if mixin == 1
3080
add w17,w17,w22
3081
.endif
3082
.inst 0x04a301ce //add z14.s,z14.s,z3.s
3083
.if mixin == 1
3084
add w18,w18,w19
3085
.endif
3086
.inst 0x04a70042 //add z2.s,z2.s,z7.s
3087
.if mixin == 1
3088
add w15,w15,w20
3089
.endif
3090
.inst 0x04ab00c6 //add z6.s,z6.s,z11.s
3091
.if mixin == 1
3092
add w16,w16,w21
3093
.endif
3094
.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d
3095
.if mixin == 1
3096
eor w12,w12,w17
3097
.endif
3098
.inst 0x04ae3129 //eor z9.d,z9.d,z14.d
3099
.if mixin == 1
3100
eor w13,w13,w18
3101
.endif
3102
.inst 0x04a231ad //eor z13.d,z13.d,z2.d
3103
.if mixin == 1
3104
eor w14,w14,w15
3105
.endif
3106
.inst 0x04a63021 //eor z1.d,z1.d,z6.d
3107
.if mixin == 1
3108
eor w11,w11,w16
3109
.endif
3110
.inst 0x04679cb1 //lsl z17.s,z5.s,7
3111
.inst 0x04679d32 //lsl z18.s,z9.s,7
3112
.inst 0x04679db3 //lsl z19.s,z13.s,7
3113
.inst 0x04679c34 //lsl z20.s,z1.s,7
3114
.inst 0x046794a5 //lsr z5.s,z5.s,25
3115
.if mixin == 1
3116
ror w12,w12,25
3117
.endif
3118
.inst 0x04679529 //lsr z9.s,z9.s,25
3119
.if mixin == 1
3120
ror w13,w13,25
3121
.endif
3122
.inst 0x046795ad //lsr z13.s,z13.s,25
3123
.if mixin == 1
3124
ror w14,w14,25
3125
.endif
3126
.inst 0x04679421 //lsr z1.s,z1.s,25
3127
.if mixin == 1
3128
ror w11,w11,25
3129
.endif
3130
.inst 0x047130a5 //orr z5.d,z5.d,z17.d
3131
.inst 0x04723129 //orr z9.d,z9.d,z18.d
3132
.inst 0x047331ad //orr z13.d,z13.d,z19.d
3133
.inst 0x04743021 //orr z1.d,z1.d,z20.d
3134
sub x6,x6,1
3135
cbnz x6,10b
3136
lsr x6,x28,#32
3137
.inst 0x05a03b91 //dup z17.s,w28
3138
.inst 0x05a038d2 //dup z18.s,w6
3139
lsr x6,x29,#32
3140
.inst 0x05a038d3 //dup z19.s,w6
3141
lsr x6,x30,#32
3142
.if mixin == 1
3143
add w7,w7,w23
3144
.endif
3145
.inst 0x04b90000 //add z0.s,z0.s,z25.s
3146
.if mixin == 1
3147
add x8,x8,x23,lsr #32
3148
.endif
3149
.inst 0x04ba0084 //add z4.s,z4.s,z26.s
3150
.if mixin == 1
3151
add x7,x7,x8,lsl #32 // pack
3152
.endif
3153
.if mixin == 1
3154
add w9,w9,w24
3155
.endif
3156
.inst 0x04bb0108 //add z8.s,z8.s,z27.s
3157
.if mixin == 1
3158
add x10,x10,x24,lsr #32
3159
.endif
3160
.inst 0x04bc018c //add z12.s,z12.s,z28.s
3161
.if mixin == 1
3162
add x9,x9,x10,lsl #32 // pack
3163
.endif
3164
.if mixin == 1
3165
ldp x8,x10,[x1],#16
3166
.endif
3167
.if mixin == 1
3168
add w11,w11,w25
3169
.endif
3170
.inst 0x04bd0021 //add z1.s,z1.s,z29.s
3171
.if mixin == 1
3172
add x12,x12,x25,lsr #32
3173
.endif
3174
.inst 0x04be00a5 //add z5.s,z5.s,z30.s
3175
.if mixin == 1
3176
add x11,x11,x12,lsl #32 // pack
3177
.endif
3178
.if mixin == 1
3179
add w13,w13,w26
3180
.endif
3181
.inst 0x04b50129 //add z9.s,z9.s,z21.s
3182
.if mixin == 1
3183
add x14,x14,x26,lsr #32
3184
.endif
3185
.inst 0x04b601ad //add z13.s,z13.s,z22.s
3186
.if mixin == 1
3187
add x13,x13,x14,lsl #32 // pack
3188
.endif
3189
.if mixin == 1
3190
ldp x12,x14,[x1],#16
3191
.endif
3192
.if mixin == 1
3193
add w15,w15,w27
3194
.endif
3195
.inst 0x04b70042 //add z2.s,z2.s,z23.s
3196
.if mixin == 1
3197
add x16,x16,x27,lsr #32
3198
.endif
3199
.inst 0x04b800c6 //add z6.s,z6.s,z24.s
3200
.if mixin == 1
3201
add x15,x15,x16,lsl #32 // pack
3202
.endif
3203
.if mixin == 1
3204
add w17,w17,w28
3205
.endif
3206
.inst 0x04b1014a //add z10.s,z10.s,z17.s
3207
.if mixin == 1
3208
add x18,x18,x28,lsr #32
3209
.endif
3210
.inst 0x04b201ce //add z14.s,z14.s,z18.s
3211
.if mixin == 1
3212
add x17,x17,x18,lsl #32 // pack
3213
.endif
3214
.if mixin == 1
3215
ldp x16,x18,[x1],#16
3216
.endif
3217
.inst 0x05a03bd4 //dup z20.s,w30
3218
.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE
3219
.if mixin == 1
3220
add w19,w19,w29
3221
.endif
3222
.inst 0x04b00063 //add z3.s,z3.s,z16.s
3223
.if mixin == 1
3224
add x20,x20,x29,lsr #32
3225
.endif
3226
.inst 0x04b300e7 //add z7.s,z7.s,z19.s
3227
.if mixin == 1
3228
add x19,x19,x20,lsl #32 // pack
3229
.endif
3230
.if mixin == 1
3231
add w21,w21,w30
3232
.endif
3233
.inst 0x04b4016b //add z11.s,z11.s,z20.s
3234
.if mixin == 1
3235
add x22,x22,x30,lsr #32
3236
.endif
3237
.inst 0x04b901ef //add z15.s,z15.s,z25.s
3238
.if mixin == 1
3239
add x21,x21,x22,lsl #32 // pack
3240
.endif
3241
.if mixin == 1
3242
ldp x20,x22,[x1],#16
3243
.endif
3244
#ifdef __AARCH64EB__
3245
rev x7,x7
3246
.inst 0x05a48000 //revb z0.s,p0/m,z0.s
3247
.inst 0x05a48084 //revb z4.s,p0/m,z4.s
3248
rev x9,x9
3249
.inst 0x05a48108 //revb z8.s,p0/m,z8.s
3250
.inst 0x05a4818c //revb z12.s,p0/m,z12.s
3251
rev x11,x11
3252
.inst 0x05a48021 //revb z1.s,p0/m,z1.s
3253
.inst 0x05a480a5 //revb z5.s,p0/m,z5.s
3254
rev x13,x13
3255
.inst 0x05a48129 //revb z9.s,p0/m,z9.s
3256
.inst 0x05a481ad //revb z13.s,p0/m,z13.s
3257
rev x15,x15
3258
.inst 0x05a48042 //revb z2.s,p0/m,z2.s
3259
.inst 0x05a480c6 //revb z6.s,p0/m,z6.s
3260
rev x17,x17
3261
.inst 0x05a4814a //revb z10.s,p0/m,z10.s
3262
.inst 0x05a481ce //revb z14.s,p0/m,z14.s
3263
rev x19,x19
3264
.inst 0x05a48063 //revb z3.s,p0/m,z3.s
3265
.inst 0x05a480e7 //revb z7.s,p0/m,z7.s
3266
rev x21,x21
3267
.inst 0x05a4816b //revb z11.s,p0/m,z11.s
3268
.inst 0x05a481ef //revb z15.s,p0/m,z15.s
3269
#endif
3270
.if mixin == 1
3271
add x29,x29,#1
3272
.endif
3273
cmp x5,4
3274
b.ne 200f
3275
.if mixin == 1
3276
eor x7,x7,x8
3277
.endif
3278
.if mixin == 1
3279
eor x9,x9,x10
3280
.endif
3281
.if mixin == 1
3282
eor x11,x11,x12
3283
.endif
3284
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
3285
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
3286
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
3287
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
3288
3289
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
3290
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
3291
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
3292
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
3293
3294
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
3295
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
3296
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
3297
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
3298
3299
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
3300
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
3301
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
3302
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
3303
.if mixin == 1
3304
eor x13,x13,x14
3305
.endif
3306
.if mixin == 1
3307
eor x15,x15,x16
3308
.endif
3309
.if mixin == 1
3310
eor x17,x17,x18
3311
.endif
3312
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
3313
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
3314
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
3315
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
3316
3317
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
3318
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
3319
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
3320
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
3321
3322
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
3323
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
3324
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
3325
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
3326
3327
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
3328
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
3329
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
3330
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
3331
.if mixin == 1
3332
eor x19,x19,x20
3333
.endif
3334
.if mixin == 1
3335
eor x21,x21,x22
3336
.endif
3337
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
3338
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
3339
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
3340
.inst 0x04b23021 //eor z1.d,z1.d,z18.d
3341
.inst 0x04b33042 //eor z2.d,z2.d,z19.d
3342
.inst 0x04b43063 //eor z3.d,z3.d,z20.d
3343
.inst 0x04b53084 //eor z4.d,z4.d,z21.d
3344
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
3345
.inst 0x04b730c6 //eor z6.d,z6.d,z23.d
3346
.inst 0x04b830e7 //eor z7.d,z7.d,z24.d
3347
ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64
3348
ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64
3349
.if mixin == 1
3350
stp x7,x9,[x0],#16
3351
.endif
3352
.inst 0x04b13108 //eor z8.d,z8.d,z17.d
3353
.inst 0x04b23129 //eor z9.d,z9.d,z18.d
3354
.if mixin == 1
3355
stp x11,x13,[x0],#16
3356
.endif
3357
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
3358
.inst 0x04b4316b //eor z11.d,z11.d,z20.d
3359
.if mixin == 1
3360
stp x15,x17,[x0],#16
3361
.endif
3362
.inst 0x04b5318c //eor z12.d,z12.d,z21.d
3363
.inst 0x04b631ad //eor z13.d,z13.d,z22.d
3364
.if mixin == 1
3365
stp x19,x21,[x0],#16
3366
.endif
3367
.inst 0x04b731ce //eor z14.d,z14.d,z23.d
3368
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
3369
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
3370
st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3371
st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
3372
st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
3373
b 210f
3374
200:
3375
.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s
3376
.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s
3377
.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s
3378
.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s
3379
3380
.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s
3381
.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s
3382
.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s
3383
.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s
3384
3385
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
3386
.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d
3387
.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d
3388
.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d
3389
3390
.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d
3391
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
3392
.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d
3393
.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d
3394
.if mixin == 1
3395
eor x7,x7,x8
3396
.endif
3397
.if mixin == 1
3398
eor x9,x9,x10
3399
.endif
3400
.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s
3401
.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s
3402
.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s
3403
.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s
3404
3405
.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s
3406
.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s
3407
.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s
3408
.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s
3409
3410
.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d
3411
.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d
3412
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
3413
.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d
3414
3415
.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d
3416
.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d
3417
.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d
3418
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
3419
.if mixin == 1
3420
eor x11,x11,x12
3421
.endif
3422
.if mixin == 1
3423
eor x13,x13,x14
3424
.endif
3425
.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s
3426
.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s
3427
.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s
3428
.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s
3429
3430
.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s
3431
.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s
3432
.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s
3433
.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s
3434
3435
.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d
3436
.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d
3437
.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d
3438
.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d
3439
3440
.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d
3441
.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d
3442
.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d
3443
.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d
3444
.if mixin == 1
3445
eor x15,x15,x16
3446
.endif
3447
.if mixin == 1
3448
eor x17,x17,x18
3449
.endif
3450
.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s
3451
.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s
3452
.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s
3453
.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s
3454
3455
.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s
3456
.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s
3457
.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s
3458
.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s
3459
3460
.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d
3461
.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d
3462
.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d
3463
.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d
3464
3465
.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d
3466
.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d
3467
.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d
3468
.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d
3469
.if mixin == 1
3470
eor x19,x19,x20
3471
.endif
3472
.if mixin == 1
3473
eor x21,x21,x22
3474
.endif
3475
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
3476
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
3477
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
3478
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
3479
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
3480
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
3481
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
3482
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
3483
.inst 0x04215101 //addvl x1,x1,8
3484
.inst 0x04b13000 //eor z0.d,z0.d,z17.d
3485
.inst 0x04b23084 //eor z4.d,z4.d,z18.d
3486
.inst 0x04b33108 //eor z8.d,z8.d,z19.d
3487
.inst 0x04b4318c //eor z12.d,z12.d,z20.d
3488
.inst 0x04b53021 //eor z1.d,z1.d,z21.d
3489
.inst 0x04b630a5 //eor z5.d,z5.d,z22.d
3490
.inst 0x04b73129 //eor z9.d,z9.d,z23.d
3491
.inst 0x04b831ad //eor z13.d,z13.d,z24.d
3492
.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL]
3493
.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL]
3494
.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL]
3495
.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL]
3496
.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL]
3497
.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL]
3498
.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL]
3499
.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL]
3500
.inst 0x04215101 //addvl x1,x1,8
3501
.if mixin == 1
3502
stp x7,x9,[x0],#16
3503
.endif
3504
.inst 0x04b13042 //eor z2.d,z2.d,z17.d
3505
.inst 0x04b230c6 //eor z6.d,z6.d,z18.d
3506
.if mixin == 1
3507
stp x11,x13,[x0],#16
3508
.endif
3509
.inst 0x04b3314a //eor z10.d,z10.d,z19.d
3510
.inst 0x04b431ce //eor z14.d,z14.d,z20.d
3511
.if mixin == 1
3512
stp x15,x17,[x0],#16
3513
.endif
3514
.inst 0x04b53063 //eor z3.d,z3.d,z21.d
3515
.inst 0x04b630e7 //eor z7.d,z7.d,z22.d
3516
.if mixin == 1
3517
stp x19,x21,[x0],#16
3518
.endif
3519
.inst 0x04b7316b //eor z11.d,z11.d,z23.d
3520
.inst 0x04b831ef //eor z15.d,z15.d,z24.d
3521
.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL]
3522
.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL]
3523
.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL]
3524
.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL]
3525
.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL]
3526
.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL]
3527
.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL]
3528
.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL]
3529
.inst 0x04205100 //addvl x0,x0,8
3530
.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL]
3531
.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL]
3532
.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL]
3533
.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL]
3534
.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL]
3535
.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL]
3536
.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL]
3537
.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL]
3538
.inst 0x04205100 //addvl x0,x0,8
3539
210:
3540
.inst 0x04b0e3fd //incw x29, ALL, MUL #1
3541
110:
3542
2:
3543
str w29,[x4]
3544
ldp d10,d11,[sp,16]
3545
ldp d12,d13,[sp,32]
3546
ldp d14,d15,[sp,48]
3547
ldp x16,x17,[sp,64]
3548
ldp x18,x19,[sp,80]
3549
ldp x20,x21,[sp,96]
3550
ldp x22,x23,[sp,112]
3551
ldp x24,x25,[sp,128]
3552
ldp x26,x27,[sp,144]
3553
ldp x28,x29,[sp,160]
3554
ldr x30,[sp,176]
3555
ldp d8,d9,[sp],192
3556
AARCH64_VALIDATE_LINK_REGISTER
3557
.Lreturn:
3558
ret
3559
.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
3560
3561