GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/powerpc/chacha-p10le-8x.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated chacha20 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <[email protected]>
#
# do rounds, 8 quarter rounds
# 1. a += b; d ^= a; d <<<= 16;
# 2. c += d; b ^= c; b <<<= 12;
# 3. a += b; d ^= a; d <<<= 8;
# 4. c += d; b ^= c; b <<<= 7
#
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
#
# 4 blocks (a b c d)
#
# a0 b0 c0 d0
# a1 b1 c1 d1
# ...
# a4 b4 c4 d4
# ...
# a8 b8 c8 d8
# ...
# a12 b12 c12 d12
# a13 ...
# a14 ...
# a15 b15 c15 d15
#
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
#

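# For reference, the scalar quarter round that this file vectorizes, as a
# C sketch (illustrative only; not part of this file):
#
#	#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
#
#	static void quarter_round(u32 *a, u32 *b, u32 *c, u32 *d)
#	{
#		*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
#		*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
#		*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
#		*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
#	}
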
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"
.text

.macro SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

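# SAVE_REGS/RESTORE_REGS spill and reload all non-volatile state in a
# 752-byte stack frame: r14-r31 at sp+112..248, v20-v31 starting at sp+256,
# and vs14-vs31 above those.  Note that the VRS/VSX helpers above clobber
# r16 as a scratch index register.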
.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi 9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi 9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi 1, 1, 752
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS

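# QT_loop_8x: two interleaved 4-block batches (v0-v15 and v16-v31) go
# through one column round and one diagonal round.  All 32 VRs hold state,
# so the rotate masks/counts staged in VSX regs 20-23 are swapped into
# v25/v28 around each use via xxlor, with vs0 as the temporary.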
.macro QT_loop_8x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	xxlor 0, 32+25, 32+25
	xxlor 32+25, 20, 20
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vadduwm 16, 16, 20
	vadduwm 17, 17, 21
	vadduwm 18, 18, 22
	vadduwm 19, 19, 23

	vpermxor 12, 12, 0, 25
	vpermxor 13, 13, 1, 25
	vpermxor 14, 14, 2, 25
	vpermxor 15, 15, 3, 25
	vpermxor 28, 28, 16, 25
	vpermxor 29, 29, 17, 25
	vpermxor 30, 30, 18, 25
	vpermxor 31, 31, 19, 25
	xxlor 32+25, 0, 0
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vadduwm 24, 24, 28
	vadduwm 25, 25, 29
	vadduwm 26, 26, 30
	vadduwm 27, 27, 31
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vxor 20, 20, 24
	vxor 21, 21, 25
	vxor 22, 22, 26
	vxor 23, 23, 27

	xxlor 0, 32+25, 32+25
	xxlor 32+25, 21, 21
	vrlw 4, 4, 25 #
	vrlw 5, 5, 25
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 20, 20, 25 #
	vrlw 21, 21, 25
	vrlw 22, 22, 25
	vrlw 23, 23, 25
	xxlor 32+25, 0, 0
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vadduwm 16, 16, 20
	vadduwm 17, 17, 21
	vadduwm 18, 18, 22
	vadduwm 19, 19, 23

	xxlor 0, 32+25, 32+25
	xxlor 32+25, 22, 22
	vpermxor 12, 12, 0, 25
	vpermxor 13, 13, 1, 25
	vpermxor 14, 14, 2, 25
	vpermxor 15, 15, 3, 25
	vpermxor 28, 28, 16, 25
	vpermxor 29, 29, 17, 25
	vpermxor 30, 30, 18, 25
	vpermxor 31, 31, 19, 25
	xxlor 32+25, 0, 0
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vadduwm 24, 24, 28
	vadduwm 25, 25, 29
	vadduwm 26, 26, 30
	vadduwm 27, 27, 31
	xxlor 0, 32+28, 32+28
	xxlor 32+28, 23, 23
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vxor 20, 20, 24
	vxor 21, 21, 25
	vxor 22, 22, 26
	vxor 23, 23, 27
	vrlw 4, 4, 28 #
	vrlw 5, 5, 28
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 20, 20, 28 #
	vrlw 21, 21, 28
	vrlw 22, 22, 28
	vrlw 23, 23, 28
	xxlor 32+28, 0, 0

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	xxlor 0, 32+25, 32+25
	xxlor 32+25, 20, 20
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vadduwm 16, 16, 21
	vadduwm 17, 17, 22
	vadduwm 18, 18, 23
	vadduwm 19, 19, 20

	vpermxor 15, 15, 0, 25
	vpermxor 12, 12, 1, 25
	vpermxor 13, 13, 2, 25
	vpermxor 14, 14, 3, 25
	vpermxor 31, 31, 16, 25
	vpermxor 28, 28, 17, 25
	vpermxor 29, 29, 18, 25
	vpermxor 30, 30, 19, 25

	xxlor 32+25, 0, 0
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vadduwm 26, 26, 31
	vadduwm 27, 27, 28
	vadduwm 24, 24, 29
	vadduwm 25, 25, 30
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vxor 21, 21, 26
	vxor 22, 22, 27
	vxor 23, 23, 24
	vxor 20, 20, 25

	xxlor 0, 32+25, 32+25
	xxlor 32+25, 21, 21
	vrlw 5, 5, 25
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 4, 4, 25
	vrlw 21, 21, 25
	vrlw 22, 22, 25
	vrlw 23, 23, 25
	vrlw 20, 20, 25
	xxlor 32+25, 0, 0

	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vadduwm 16, 16, 21
	vadduwm 17, 17, 22
	vadduwm 18, 18, 23
	vadduwm 19, 19, 20

	xxlor 0, 32+25, 32+25
	xxlor 32+25, 22, 22
	vpermxor 15, 15, 0, 25
	vpermxor 12, 12, 1, 25
	vpermxor 13, 13, 2, 25
	vpermxor 14, 14, 3, 25
	vpermxor 31, 31, 16, 25
	vpermxor 28, 28, 17, 25
	vpermxor 29, 29, 18, 25
	vpermxor 30, 30, 19, 25
	xxlor 32+25, 0, 0

	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vadduwm 26, 26, 31
	vadduwm 27, 27, 28
	vadduwm 24, 24, 29
	vadduwm 25, 25, 30

	xxlor 0, 32+28, 32+28
	xxlor 32+28, 23, 23
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vxor 21, 21, 26
	vxor 22, 22, 27
	vxor 23, 23, 24
	vxor 20, 20, 25
	vrlw 5, 5, 28
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 4, 4, 28
	vrlw 21, 21, 28
	vrlw 22, 22, 28
	vrlw 23, 23, 28
	vrlw 20, 20, 28
	xxlor 32+28, 0, 0
.endm

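# QT_loop_4x: a single 4-block batch in v0-v15.  Here the rotate masks and
# shift counts stay resident in v20-v23, so no xxlor staging is needed.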
.macro QT_loop_4x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vpermxor 12, 12, 0, 20
	vpermxor 13, 13, 1, 20
	vpermxor 14, 14, 2, 20
	vpermxor 15, 15, 3, 20
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 21
	vrlw 5, 5, 21
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vpermxor 12, 12, 0, 22
	vpermxor 13, 13, 1, 22
	vpermxor 14, 14, 2, 22
	vpermxor 15, 15, 3, 22
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 23
	vrlw 5, 5, 23
	vrlw 6, 6, 23
	vrlw 7, 7, 23

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vpermxor 15, 15, 0, 20
	vpermxor 12, 12, 1, 20
	vpermxor 13, 13, 2, 20
	vpermxor 14, 14, 3, 20
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 21
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vrlw 4, 4, 21
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vpermxor 15, 15, 0, 22
	vpermxor 12, 12, 1, 22
	vpermxor 13, 13, 2, 22
	vpermxor 14, 14, 3, 22
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 23
	vrlw 6, 6, 23
	vrlw 7, 7, 23
	vrlw 4, 4, 23
.endm

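# TP_4x transposes a 4x4 matrix of 32-bit words across four vector
# registers: word i of each input register gathers into output register i,
# turning the lane-sliced state back into four contiguous blocks.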
# Transpose
.macro TP_4x a0 a1 a2 a3
	xxmrghw  10, 32+\a0, 32+\a1	# a0, a1, b0, b1
	xxmrghw  11, 32+\a2, 32+\a3	# a2, a3, b2, b3
	xxmrglw  12, 32+\a0, 32+\a1	# c0, c1, d0, d1
	xxmrglw  13, 32+\a2, 32+\a3	# c2, c3, d2, d3
	xxpermdi 32+\a0, 10, 11, 0	# a0, a1, a2, a3
	xxpermdi 32+\a1, 10, 11, 3	# b0, b1, b2, b3
	xxpermdi 32+\a2, 12, 13, 0	# c0, c1, c2, c3
	xxpermdi 32+\a3, 12, 13, 3	# d0, d1, d2, d3
.endm

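# Add_state implements the ChaCha feed-forward: the original input state is
# added back into the working state.  With \S=0 the saved state rows sit in
# v16-v19; with \S=16 they sit in v0-v3, which is what the 16-\S, 17-\S, ...
# operands select.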
# key stream = working state + state
.macro Add_state S
	vadduwm \S+0, \S+0, 16-\S
	vadduwm \S+4, \S+4, 17-\S
	vadduwm \S+8, \S+8, 18-\S
	vadduwm \S+12, \S+12, 19-\S

	vadduwm \S+1, \S+1, 16-\S
	vadduwm \S+5, \S+5, 17-\S
	vadduwm \S+9, \S+9, 18-\S
	vadduwm \S+13, \S+13, 19-\S

	vadduwm \S+2, \S+2, 16-\S
	vadduwm \S+6, \S+6, 17-\S
	vadduwm \S+10, \S+10, 18-\S
	vadduwm \S+14, \S+14, 19-\S

	vadduwm \S+3, \S+3, 16-\S
	vadduwm \S+7, \S+7, 17-\S
	vadduwm \S+11, \S+11, 18-\S
	vadduwm \S+15, \S+15, 19-\S
.endm

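# Write_256 loads 256 bytes of input from src+offset (r5+r14), XORs them
# with 16 key-stream vectors, and stores the result to dst+offset (r4+r14).
# r17-r31 hold the constant byte offsets 16, 32, ..., 240 used as index
# registers.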
#
# write 256 bytes
#
.macro Write_256 S
	add 9, 14, 5
	add 16, 14, 4
	lxvw4x 0, 0, 9
	lxvw4x 1, 17, 9
	lxvw4x 2, 18, 9
	lxvw4x 3, 19, 9
	lxvw4x 4, 20, 9
	lxvw4x 5, 21, 9
	lxvw4x 6, 22, 9
	lxvw4x 7, 23, 9
	lxvw4x 8, 24, 9
	lxvw4x 9, 25, 9
	lxvw4x 10, 26, 9
	lxvw4x 11, 27, 9
	lxvw4x 12, 28, 9
	lxvw4x 13, 29, 9
	lxvw4x 14, 30, 9
	lxvw4x 15, 31, 9

	xxlxor \S+32, \S+32, 0
	xxlxor \S+36, \S+36, 1
	xxlxor \S+40, \S+40, 2
	xxlxor \S+44, \S+44, 3
	xxlxor \S+33, \S+33, 4
	xxlxor \S+37, \S+37, 5
	xxlxor \S+41, \S+41, 6
	xxlxor \S+45, \S+45, 7
	xxlxor \S+34, \S+34, 8
	xxlxor \S+38, \S+38, 9
	xxlxor \S+42, \S+42, 10
	xxlxor \S+46, \S+46, 11
	xxlxor \S+35, \S+35, 12
	xxlxor \S+39, \S+39, 13
	xxlxor \S+43, \S+43, 14
	xxlxor \S+47, \S+47, 15

	stxvw4x \S+32, 0, 16
	stxvw4x \S+36, 17, 16
	stxvw4x \S+40, 18, 16
	stxvw4x \S+44, 19, 16

	stxvw4x \S+33, 20, 16
	stxvw4x \S+37, 21, 16
	stxvw4x \S+41, 22, 16
	stxvw4x \S+45, 23, 16

	stxvw4x \S+34, 24, 16
	stxvw4x \S+38, 25, 16
	stxvw4x \S+42, 26, 16
	stxvw4x \S+46, 27, 16

	stxvw4x \S+35, 28, 16
	stxvw4x \S+39, 29, 16
	stxvw4x \S+43, 30, 16
	stxvw4x \S+47, 31, 16

.endm

#
# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
#		       unsigned int len, int nrounds);
#
SYM_FUNC_START(chacha_p10le_8x)
.align 5
	cmpdi	6, 0
	ble	Out_no_chacha

	SAVE_REGS

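	# Per the ELFv2 ABI: r3 = state, r4 = dst, r5 = src, r6 = len,
	# r7 = nrounds.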
	# r17 - r31 mainly for Write_256 macro.
	li	17, 16
	li	18, 32
	li	19, 48
	li	20, 64
	li	21, 80
	li	22, 96
	li	23, 112
	li	24, 128
	li	25, 144
	li	26, 160
	li	27, 176
	li	28, 192
	li	29, 208
	li	30, 224
	li	31, 240

	mr	15, 6			# len
	li	14, 0			# offset to inp and outp

	lxvw4x	48, 0, 3		# vr16, constants
	lxvw4x	49, 17, 3		# vr17, key 1
	lxvw4x	50, 18, 3		# vr18, key 2
	lxvw4x	51, 19, 3		# vr19, counter, nonce

	# create (0, 1, 2, 3) counters
	vspltisw 0, 0
	vspltisw 1, 1
	vspltisw 2, 2
	vspltisw 3, 3
	vmrghw	4, 0, 1
	vmrglw	5, 2, 3
	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)

	vspltisw 21, 12
	vspltisw 23, 7

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

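	# r8 = nrounds / 2: each pass through the quarter-round loop below
	# is one column round plus one diagonal round.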
	sradi	8, 7, 1

	mtctr	8

	# save constants to vsx
	xxlor	16, 48, 48
	xxlor	17, 49, 49
	xxlor	18, 50, 50
	xxlor	19, 51, 51

	vspltisw 25, 4
	vspltisw 26, 8

	xxlor	25, 32+26, 32+26
	xxlor	24, 32+25, 32+25

	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	xxlor	20, 32+20, 32+20
	xxlor	21, 32+21, 32+21
	xxlor	22, 32+22, 32+22
	xxlor	23, 32+23, 32+23

	cmpdi	6, 512
	blt	Loop_last

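# Each Loop_8x iteration produces eight 64-byte blocks (512 bytes): the four
# state rows are splatted word-by-word into v0-v15 and v16-v31, and the
# counter lanes are advanced by (0,1,2,3) for the first batch and (4,5,6,7)
# for the second.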
Loop_8x:
	xxspltw	32+0, 16, 0
	xxspltw	32+1, 16, 1
	xxspltw	32+2, 16, 2
	xxspltw	32+3, 16, 3

	xxspltw	32+4, 17, 0
	xxspltw	32+5, 17, 1
	xxspltw	32+6, 17, 2
	xxspltw	32+7, 17, 3
	xxspltw	32+8, 18, 0
	xxspltw	32+9, 18, 1
	xxspltw	32+10, 18, 2
	xxspltw	32+11, 18, 3
	xxspltw	32+12, 19, 0
	xxspltw	32+13, 19, 1
	xxspltw	32+14, 19, 2
	xxspltw	32+15, 19, 3
	vadduwm	12, 12, 30		# increase counter

	xxspltw	32+16, 16, 0
	xxspltw	32+17, 16, 1
	xxspltw	32+18, 16, 2
	xxspltw	32+19, 16, 3

	xxspltw	32+20, 17, 0
	xxspltw	32+21, 17, 1
	xxspltw	32+22, 17, 2
	xxspltw	32+23, 17, 3
	xxspltw	32+24, 18, 0
	xxspltw	32+25, 18, 1
	xxspltw	32+26, 18, 2
	xxspltw	32+27, 18, 3
	xxspltw	32+28, 19, 0
	xxspltw	32+29, 19, 1
	vadduwm	28, 28, 31		# increase counter
	xxspltw	32+30, 19, 2
	xxspltw	32+31, 19, 3

.align 5
quarter_loop_8x:
	QT_loop_8x

	bdnz	quarter_loop_8x

	xxlor	0, 32+30, 32+30
	xxlor	32+30, 30, 30
	vadduwm	12, 12, 30
	xxlor	32+30, 0, 0
	TP_4x	0, 1, 2, 3
	TP_4x	4, 5, 6, 7
	TP_4x	8, 9, 10, 11
	TP_4x	12, 13, 14, 15

	xxlor	0, 48, 48
	xxlor	1, 49, 49
	xxlor	2, 50, 50
	xxlor	3, 51, 51
	xxlor	48, 16, 16
	xxlor	49, 17, 17
	xxlor	50, 18, 18
	xxlor	51, 19, 19
	Add_state 0
	xxlor	48, 0, 0
	xxlor	49, 1, 1
	xxlor	50, 2, 2
	xxlor	51, 3, 3
	Write_256 0
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	xxlor	5, 32+31, 32+31
	xxlor	32+31, 31, 31
	vadduwm	28, 28, 31
	xxlor	32+31, 5, 5
	TP_4x	16+0, 16+1, 16+2, 16+3
	TP_4x	16+4, 16+5, 16+6, 16+7
	TP_4x	16+8, 16+9, 16+10, 16+11
	TP_4x	16+12, 16+13, 16+14, 16+15

	xxlor	32, 16, 16
	xxlor	33, 17, 17
	xxlor	34, 18, 18
	xxlor	35, 19, 19
	Add_state 16
	Write_256 16
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	xxlor	32+24, 24, 24
	xxlor	32+25, 25, 25
	xxlor	32+30, 30, 30
	vadduwm	30, 30, 25
	vadduwm	31, 30, 24
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	cmpdi	15, 0
	beq	Out_loop

	cmpdi	15, 512
	blt	Loop_last

	mtctr	8
	b	Loop_8x

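# Tail path: fewer than 512 bytes remain.  Reload the state rows and rotate
# masks (clobbered by the 8x path) and generate 256 bytes at a time.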
Loop_last:
	lxvw4x	48, 0, 3		# vr16, constants
	lxvw4x	49, 17, 3		# vr17, key 1
	lxvw4x	50, 18, 3		# vr18, key 2
	lxvw4x	51, 19, 3		# vr19, counter, nonce

	vspltisw 21, 12
	vspltisw 23, 7
	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1
	mtctr	8

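# Each Loop_4x iteration produces four 64-byte blocks (256 bytes); the state
# is splatted into v0-v15 only.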
Loop_4x:
	vspltw	0, 16, 0
	vspltw	1, 16, 1
	vspltw	2, 16, 2
	vspltw	3, 16, 3

	vspltw	4, 17, 0
	vspltw	5, 17, 1
	vspltw	6, 17, 2
	vspltw	7, 17, 3
	vspltw	8, 18, 0
	vspltw	9, 18, 1
	vspltw	10, 18, 2
	vspltw	11, 18, 3
	vspltw	12, 19, 0
	vadduwm	12, 12, 30		# increase counter
	vspltw	13, 19, 1
	vspltw	14, 19, 2
	vspltw	15, 19, 3

.align 5
quarter_loop:
	QT_loop_4x

	bdnz	quarter_loop

	vadduwm	12, 12, 30
	TP_4x	0, 1, 2, 3
	TP_4x	4, 5, 6, 7
	TP_4x	8, 9, 10, 11
	TP_4x	12, 13, 14, 15

	Add_state 0
	Write_256 0
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# Update state counter
	vspltisw 25, 4
	vadduwm	30, 30, 25

	cmpdi	15, 0
	beq	Out_loop
	cmpdi	15, 256
	blt	Out_loop

	mtctr	8
	b	Loop_4x

Out_loop:
	RESTORE_REGS
	blr

Out_no_chacha:
	li	3, 0
	blr
SYM_FUNC_END(chacha_p10le_8x)

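# vpermxor control vectors: the first quadword implements the xor plus
# rotate-left-16 of each 32-bit word, the second the xor plus rotate-left-8
# (the 12- and 7-bit rotations use vrlw instead).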
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
	.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
	.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)