GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/powerpc/poly1305-p10le_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <[email protected]>
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
#  - 26-bit limbs
#  - handles multiple 64-byte blocks
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (r * a) % p
# a += s
#
# Performance is improved by breaking the polynomial down into a sum of products:
# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# 07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
# in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
# vs [r^1, r^3, r^2, r^4]
# vs0 = [r0,.....]
# vs1 = [r1,.....]
# vs2 = [r2,.....]
# vs3 = [r3,.....]
# vs4 = [r4,.....]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# Each word in a vector holds one "r/s" member of the products [a * r/s]:
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = mlen
# r5 = m
#
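#
# For reference, a minimal C sketch of the clamp step above (illustrative
# only; the names are ours, and this function actually loads r from
# offsets 24/32 of the buffer in r3):
#
#   #include <stdint.h>
#   #include <string.h>
#
#   static void poly1305_clamp_r(const uint8_t rkey[16], uint64_t r[2])
#   {
#       memcpy(r, rkey, 16);            /* little-endian halves of r */
#       r[0] &= 0x0FFFFFFC0FFFFFFFULL;  /* clamp low 64 bits  */
#       r[1] &= 0x0FFFFFFC0FFFFFFCULL;  /* clamp high 64 bits */
#   }
#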
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text
.macro SAVE_GPR GPR OFFSET FRAME
std \GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
li 16, \OFFSET
stvx \VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
li 16, \OFFSET
stxvx \VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
ld \GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
li 16, \OFFSET
lvx \VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
li 16, \OFFSET
lxvx \VSX, 16, \FRAME
.endm

.macro SAVE_REGS
mflr 0
std 0, 16(1)
stdu 1,-752(1)

SAVE_GPR 14, 112, 1
SAVE_GPR 15, 120, 1
SAVE_GPR 16, 128, 1
SAVE_GPR 17, 136, 1
SAVE_GPR 18, 144, 1
SAVE_GPR 19, 152, 1
SAVE_GPR 20, 160, 1
SAVE_GPR 21, 168, 1
SAVE_GPR 22, 176, 1
SAVE_GPR 23, 184, 1
SAVE_GPR 24, 192, 1
SAVE_GPR 25, 200, 1
SAVE_GPR 26, 208, 1
SAVE_GPR 27, 216, 1
SAVE_GPR 28, 224, 1
SAVE_GPR 29, 232, 1
SAVE_GPR 30, 240, 1
SAVE_GPR 31, 248, 1

addi 9, 1, 256
SAVE_VRS 20, 0, 9
SAVE_VRS 21, 16, 9
SAVE_VRS 22, 32, 9
SAVE_VRS 23, 48, 9
SAVE_VRS 24, 64, 9
SAVE_VRS 25, 80, 9
SAVE_VRS 26, 96, 9
SAVE_VRS 27, 112, 9
SAVE_VRS 28, 128, 9
SAVE_VRS 29, 144, 9
SAVE_VRS 30, 160, 9
SAVE_VRS 31, 176, 9

SAVE_VSX 14, 192, 9
SAVE_VSX 15, 208, 9
SAVE_VSX 16, 224, 9
SAVE_VSX 17, 240, 9
SAVE_VSX 18, 256, 9
SAVE_VSX 19, 272, 9
SAVE_VSX 20, 288, 9
SAVE_VSX 21, 304, 9
SAVE_VSX 22, 320, 9
SAVE_VSX 23, 336, 9
SAVE_VSX 24, 352, 9
SAVE_VSX 25, 368, 9
SAVE_VSX 26, 384, 9
SAVE_VSX 27, 400, 9
SAVE_VSX 28, 416, 9
SAVE_VSX 29, 432, 9
SAVE_VSX 30, 448, 9
SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
addi 9, 1, 256
RESTORE_VRS 20, 0, 9
RESTORE_VRS 21, 16, 9
RESTORE_VRS 22, 32, 9
RESTORE_VRS 23, 48, 9
RESTORE_VRS 24, 64, 9
RESTORE_VRS 25, 80, 9
RESTORE_VRS 26, 96, 9
RESTORE_VRS 27, 112, 9
RESTORE_VRS 28, 128, 9
RESTORE_VRS 29, 144, 9
RESTORE_VRS 30, 160, 9
RESTORE_VRS 31, 176, 9

RESTORE_VSX 14, 192, 9
RESTORE_VSX 15, 208, 9
RESTORE_VSX 16, 224, 9
RESTORE_VSX 17, 240, 9
RESTORE_VSX 18, 256, 9
RESTORE_VSX 19, 272, 9
RESTORE_VSX 20, 288, 9
RESTORE_VSX 21, 304, 9
RESTORE_VSX 22, 320, 9
RESTORE_VSX 23, 336, 9
RESTORE_VSX 24, 352, 9
RESTORE_VSX 25, 368, 9
RESTORE_VSX 26, 384, 9
RESTORE_VSX 27, 400, 9
RESTORE_VSX 28, 416, 9
RESTORE_VSX 29, 432, 9
RESTORE_VSX 30, 448, 9
RESTORE_VSX 31, 464, 9

RESTORE_GPR 14, 112, 1
RESTORE_GPR 15, 120, 1
RESTORE_GPR 16, 128, 1
RESTORE_GPR 17, 136, 1
RESTORE_GPR 18, 144, 1
RESTORE_GPR 19, 152, 1
RESTORE_GPR 20, 160, 1
RESTORE_GPR 21, 168, 1
RESTORE_GPR 22, 176, 1
RESTORE_GPR 23, 184, 1
RESTORE_GPR 24, 192, 1
RESTORE_GPR 25, 200, 1
RESTORE_GPR 26, 208, 1
RESTORE_GPR 27, 216, 1
RESTORE_GPR 28, 224, 1
RESTORE_GPR 29, 232, 1
RESTORE_GPR 30, 240, 1
RESTORE_GPR 31, 248, 1

addi 1, 1, 752
ld 0, 16(1)
mtlr 0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
#
# [r^2, r^3, r^1, r^4]
# [m3, m2, m4, m1]
#
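# As a C sketch (illustrative; the names are ours), the table above is:
#
#   #include <stdint.h>
#
#   /* a[] and r[] are 26-bit limbs; each product is at most ~57 bits,
#    * so the five-term sums fit comfortably in 64 bits. */
#   static void poly1305_mul_26(const uint64_t a[5], const uint64_t r[5],
#                               uint64_t p[5])
#   {
#       uint64_t r5[5];   /* r[i]*5: the folded 2^130 == 5 (mod p) terms */
#       for (int i = 0; i < 5; i++)
#           r5[i] = r[i] * 5;
#
#       p[0] = a[0]*r[0] + a[1]*r5[4] + a[2]*r5[3] + a[3]*r5[2] + a[4]*r5[1];
#       p[1] = a[0]*r[1] + a[1]*r[0]  + a[2]*r5[4] + a[3]*r5[3] + a[4]*r5[2];
#       p[2] = a[0]*r[2] + a[1]*r[1]  + a[2]*r[0]  + a[3]*r5[4] + a[4]*r5[3];
#       p[3] = a[0]*r[3] + a[1]*r[2]  + a[2]*r[1]  + a[3]*r[0]  + a[4]*r5[4];
#       p[4] = a[0]*r[4] + a[1]*r[3]  + a[2]*r[2]  + a[3]*r[1]  + a[4]*r[0];
#   }
#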
# multiply odd and even words
.macro mul_odd
vmulouw 14, 4, 26
vmulouw 10, 5, 3
vmulouw 11, 6, 2
vmulouw 12, 7, 1
vmulouw 13, 8, 0
vmulouw 15, 4, 27
vaddudm 14, 14, 10
vaddudm 14, 14, 11
vmulouw 10, 5, 26
vmulouw 11, 6, 3
vaddudm 14, 14, 12
vaddudm 14, 14, 13 # x0
vaddudm 15, 15, 10
vaddudm 15, 15, 11
vmulouw 12, 7, 2
vmulouw 13, 8, 1
vaddudm 15, 15, 12
vaddudm 15, 15, 13 # x1
vmulouw 16, 4, 28
vmulouw 10, 5, 27
vmulouw 11, 6, 26
vaddudm 16, 16, 10
vaddudm 16, 16, 11
vmulouw 12, 7, 3
vmulouw 13, 8, 2
vaddudm 16, 16, 12
vaddudm 16, 16, 13 # x2
vmulouw 17, 4, 29
vmulouw 10, 5, 28
vmulouw 11, 6, 27
vaddudm 17, 17, 10
vaddudm 17, 17, 11
vmulouw 12, 7, 26
vmulouw 13, 8, 3
vaddudm 17, 17, 12
vaddudm 17, 17, 13 # x3
vmulouw 18, 4, 30
vmulouw 10, 5, 29
vmulouw 11, 6, 28
vaddudm 18, 18, 10
vaddudm 18, 18, 11
vmulouw 12, 7, 27
vmulouw 13, 8, 26
vaddudm 18, 18, 12
vaddudm 18, 18, 13 # x4
.endm

.macro mul_even
vmuleuw 9, 4, 26
vmuleuw 10, 5, 3
vmuleuw 11, 6, 2
vmuleuw 12, 7, 1
vmuleuw 13, 8, 0
vaddudm 14, 14, 9
vaddudm 14, 14, 10
vaddudm 14, 14, 11
vaddudm 14, 14, 12
vaddudm 14, 14, 13 # x0

vmuleuw 9, 4, 27
vmuleuw 10, 5, 26
vmuleuw 11, 6, 3
vmuleuw 12, 7, 2
vmuleuw 13, 8, 1
vaddudm 15, 15, 9
vaddudm 15, 15, 10
vaddudm 15, 15, 11
vaddudm 15, 15, 12
vaddudm 15, 15, 13 # x1

vmuleuw 9, 4, 28
vmuleuw 10, 5, 27
vmuleuw 11, 6, 26
vmuleuw 12, 7, 3
vmuleuw 13, 8, 2
vaddudm 16, 16, 9
vaddudm 16, 16, 10
vaddudm 16, 16, 11
vaddudm 16, 16, 12
vaddudm 16, 16, 13 # x2

vmuleuw 9, 4, 29
vmuleuw 10, 5, 28
vmuleuw 11, 6, 27
vmuleuw 12, 7, 26
vmuleuw 13, 8, 3
vaddudm 17, 17, 9
vaddudm 17, 17, 10
vaddudm 17, 17, 11
vaddudm 17, 17, 12
vaddudm 17, 17, 13 # x3

vmuleuw 9, 4, 30
vmuleuw 10, 5, 29
vmuleuw 11, 6, 28
vmuleuw 12, 7, 27
vmuleuw 13, 8, 26
vaddudm 18, 18, 9
vaddudm 18, 18, 10
vaddudm 18, 18, 11
vaddudm 18, 18, 12
vaddudm 18, 18, 13 # x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
# [r, r^3, r^2, r^4]
# vs0 = [r0,...]
# vs1 = [r1,...]
# vs2 = [r2,...]
# vs3 = [r3,...]
# vs4 = [r4,...]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
.macro poly1305_setup_r

# save r
xxlor 26, 58, 58
xxlor 27, 59, 59
xxlor 28, 60, 60
xxlor 29, 61, 61
xxlor 30, 62, 62

xxlxor 31, 31, 31

# [r, r^3, r^2, r^4]
# compute r^2
vmr 4, 26
vmr 5, 27
vmr 6, 28
vmr 7, 29
vmr 8, 30
bl do_mul # r^2 r^1
xxpermdi 58, 58, 36, 0x3 # r0
xxpermdi 59, 59, 37, 0x3 # r1
xxpermdi 60, 60, 38, 0x3 # r2
xxpermdi 61, 61, 39, 0x3 # r3
xxpermdi 62, 62, 40, 0x3 # r4
xxpermdi 36, 36, 36, 0x3
xxpermdi 37, 37, 37, 0x3
xxpermdi 38, 38, 38, 0x3
xxpermdi 39, 39, 39, 0x3
xxpermdi 40, 40, 40, 0x3
vspltisb 13, 2
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30

bl do_mul # r^4 r^3
vmrgow 26, 26, 4
vmrgow 27, 27, 5
vmrgow 28, 28, 6
vmrgow 29, 29, 7
vmrgow 30, 30, 8
vspltisb 13, 2
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30

# r^2 r^4
xxlor 0, 58, 58
xxlor 1, 59, 59
xxlor 2, 60, 60
xxlor 3, 61, 61
xxlor 4, 62, 62
xxlor 5, 32, 32
xxlor 6, 33, 33
xxlor 7, 34, 34
xxlor 8, 35, 35

vspltw 9, 26, 3
vspltw 10, 26, 2
vmrgow 26, 10, 9
vspltw 9, 27, 3
vspltw 10, 27, 2
vmrgow 27, 10, 9
vspltw 9, 28, 3
vspltw 10, 28, 2
vmrgow 28, 10, 9
vspltw 9, 29, 3
vspltw 10, 29, 2
vmrgow 29, 10, 9
vspltw 9, 30, 3
vspltw 10, 30, 2
vmrgow 30, 10, 9

vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
mul_odd

# do reduction ( h %= p )
# carry reduction
vspltisb 9, 2
vsrd 10, 14, 31
vsrd 11, 17, 31
vand 7, 17, 25
vand 4, 14, 25
vaddudm 18, 18, 11
vsrd 12, 18, 31
vaddudm 15, 15, 10

vsrd 11, 15, 31
vand 8, 18, 25
vand 5, 15, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 16, 11

vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13

vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
blr
SYM_FUNC_END(do_mul)
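
#
# The reduction above, as a C sketch (illustrative; the assembly interleaves
# two carry chains, starting at limbs 0 and 3, to shorten the dependency
# chain, but the arithmetic is the same):
#
#   #include <stdint.h>
#
#   static void poly1305_carry_26(uint64_t h[5])
#   {
#       const uint64_t M = 0x3ffffff;   /* 26-bit mask */
#       uint64_t c;
#
#       c = h[0] >> 26; h[0] &= M; h[1] += c;
#       c = h[1] >> 26; h[1] &= M; h[2] += c;
#       c = h[2] >> 26; h[2] &= M; h[3] += c;
#       c = h[3] >> 26; h[3] &= M; h[4] += c;
#       c = h[4] >> 26; h[4] &= M; h[0] += c * 5; /* 2^130 == 5 (mod p) */
#       c = h[0] >> 26; h[0] &= M; h[1] += c;
#   }
#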

#
# init key
#
.macro do_poly1305_init
addis 10, 2, rmask@toc@ha
addi 10, 10, rmask@toc@l

ld 11, 0(10)
ld 12, 8(10)

li 14, 16
li 15, 32
addis 10, 2, cnum@toc@ha
addi 10, 10, cnum@toc@l
lvx 25, 0, 10 # v25 - mask
lvx 31, 14, 10 # v31 = 26 (0x1a)
lvx 19, 15, 10 # v19 = 1 << 24
lxv 24, 48(10) # vs24
lxv 25, 64(10) # vs25

# initialize
# load key from r3 to vectors
ld 9, 24(3)
ld 10, 32(3)
and. 9, 9, 11
and. 10, 10, 12

# break r into 26-bit limbs
extrdi 14, 9, 26, 38
extrdi 15, 9, 26, 12
extrdi 16, 9, 12, 0
mtvsrdd 58, 0, 14
insrdi 16, 10, 14, 38
mtvsrdd 59, 0, 15
extrdi 17, 10, 26, 24
mtvsrdd 60, 0, 16
extrdi 18, 10, 24, 0
mtvsrdd 61, 0, 17
mtvsrdd 62, 0, 18

# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
li 9, 5
mtvsrdd 36, 0, 9
vmulouw 0, 27, 4 # v0 = rr0
vmulouw 1, 28, 4 # v1 = rr1
vmulouw 2, 29, 4 # v2 = rr2
vmulouw 3, 30, 4 # v3 = rr3
.endm
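
#
# The extrdi/insrdi sequence above splits the two 64-bit halves of r into
# five 26-bit limbs.  C sketch (illustrative):
#
#   #include <stdint.h>
#
#   static void poly1305_split_26(uint64_t lo, uint64_t hi, uint64_t l[5])
#   {
#       l[0] = lo & 0x3ffffff;
#       l[1] = (lo >> 26) & 0x3ffffff;
#       l[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
#       l[3] = (hi >> 14) & 0x3ffffff;
#       l[4] = hi >> 40;        /* top 24 bits */
#   }
#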

#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = mlen
# r5 = m
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
cmpdi 5, 64
blt Out_no_poly1305

SAVE_REGS

do_poly1305_init

li 21, 0 # offset into the message

poly1305_setup_r

# load previous H state
# break/convert h into 26-bit limbs
ld 9, 0(3)
ld 10, 8(3)
ld 19, 16(3)
sldi 19, 19, 24
mtvsrdd 41, 0, 19
extrdi 14, 9, 26, 38
extrdi 15, 9, 26, 12
extrdi 16, 9, 12, 0
mtvsrdd 36, 0, 14
insrdi 16, 10, 14, 38
mtvsrdd 37, 0, 15
extrdi 17, 10, 26, 24
mtvsrdd 38, 0, 16
extrdi 18, 10, 24, 0
mtvsrdd 39, 0, 17
mtvsrdd 40, 0, 18
vor 8, 8, 9

# input m1 m2
add 20, 4, 21
xxlor 49, 24, 24
xxlor 50, 25, 25
lxvw4x 43, 0, 20
addi 17, 20, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
vand 9, 14, 25 # a0
vsrd 10, 14, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 12, 16, 13
vor 11, 11, 12
vand 11, 11, 25 # a2
vspltisb 13, 14
vsrd 12, 15, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3

vaddudm 20, 4, 9
vaddudm 21, 5, 10
vaddudm 22, 6, 11
vaddudm 23, 7, 12
vaddudm 24, 8, 13

# m3 m4
addi 17, 17, 16
lxvw4x 43, 0, 17
addi 17, 17, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
vand 9, 14, 25 # a0
vsrd 10, 14, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 12, 16, 13
vspltisb 13, 14
vor 11, 11, 12
vand 11, 11, 25 # a2
vsrd 12, 15, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3

# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4, 9, 20
vmrgow 5, 10, 21
vmrgow 6, 11, 22
vmrgow 7, 12, 23
vmrgow 8, 13, 24
vaddudm 8, 8, 19

addi 5, 5, -64 # len -= 64
addi 21, 21, 64 # offset += 64

li 9, 64
divdu 31, 5, 9

cmpdi 31, 0
ble Skip_block_loop

mtctr 31

# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r^4 + m3 r^2, (h0 + m2) r^4 + m4 r^2
# .... Repeat
# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
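# Unrolling the serial recurrence h = (h + m_i) * r once per block makes
# the lane split explicit:
#   h4 = ((((h0 + m1) r + m2) r + m3) r + m4) r
#      = (h0 + m1) r^4 + m2 r^3 + m3 r^2 + m4 r
# so two accumulators can each advance by r^2 per step and be recombined
# at the end with the per-lane powers [r^1, r^3, r^2, r^4] shown above.
#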
loop_4blocks:

# Multiply odd words and even words
mul_odd
mul_even
# carry reduction
vspltisb 9, 2
vsrd 10, 14, 31
vsrd 11, 17, 31
vand 7, 17, 25
vand 4, 14, 25
vaddudm 18, 18, 11
vsrd 12, 18, 31
vaddudm 15, 15, 10

vsrd 11, 15, 31
vand 8, 18, 25
vand 5, 15, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 16, 11

vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13

vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11

# input m1 m2 m3 m4
add 20, 4, 21
xxlor 49, 24, 24
xxlor 50, 25, 25
lxvw4x 43, 0, 20
addi 17, 20, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
addi 17, 17, 16
lxvw4x 43, 0, 17
addi 17, 17, 16
lxvw4x 44, 0, 17
vperm 17, 11, 12, 17
vperm 18, 11, 12, 18

vand 20, 14, 25 # a0
vand 9, 17, 25 # a0
vsrd 21, 14, 31 # >> 26
vsrd 22, 21, 31 # 12 bits left
vsrd 10, 17, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left

vand 21, 21, 25 # a1
vand 10, 10, 25 # a1

vspltisb 13, 12
vand 16, 15, 25
vsld 23, 16, 13
vor 22, 22, 23
vand 22, 22, 25 # a2
vand 16, 18, 25
vsld 12, 16, 13
vor 11, 11, 12
vand 11, 11, 25 # a2
vspltisb 13, 14
vsrd 23, 15, 13 # >> 14
vsrd 24, 23, 31 # >> 26, a4
vand 23, 23, 25 # a3
vsrd 12, 18, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3

vaddudm 4, 4, 20
vaddudm 5, 5, 21
vaddudm 6, 6, 22
vaddudm 7, 7, 23
vaddudm 8, 8, 24

# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4, 9, 4
vmrgow 5, 10, 5
vmrgow 6, 11, 6
vmrgow 7, 12, 7
vmrgow 8, 13, 8
vaddudm 8, 8, 19

addi 5, 5, -64 # len -= 64
addi 21, 21, 64 # offset += 64

bdnz loop_4blocks

Skip_block_loop:
xxlor 58, 0, 0
xxlor 59, 1, 1
xxlor 60, 2, 2
xxlor 61, 3, 3
xxlor 62, 4, 4
xxlor 32, 5, 5
xxlor 33, 6, 6
xxlor 34, 7, 7
xxlor 35, 8, 8

# Multiply odd words and even words
mul_odd
mul_even

# Sum the products.
xxpermdi 41, 31, 46, 0
xxpermdi 42, 31, 47, 0
vaddudm 4, 14, 9
xxpermdi 36, 31, 36, 3
vaddudm 5, 15, 10
xxpermdi 37, 31, 37, 3
xxpermdi 43, 31, 48, 0
vaddudm 6, 16, 11
xxpermdi 38, 31, 38, 3
xxpermdi 44, 31, 49, 0
vaddudm 7, 17, 12
xxpermdi 39, 31, 39, 3
xxpermdi 45, 31, 50, 0
vaddudm 8, 18, 13
xxpermdi 40, 31, 40, 3

# carry reduction
vspltisb 9, 2
vsrd 10, 4, 31
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 8, 8, 11
vsrd 12, 8, 31
vaddudm 5, 5, 10

vsrd 11, 5, 31
vand 8, 8, 25
vand 5, 5, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 6, 11

vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13

vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vsrd 10, 5, 31
vand 5, 5, 25
vaddudm 6, 6, 10
vaddudm 8, 8, 11

b do_final_update

do_final_update:
# combine 26-bit limbs
# v4, v5, v6, v7 and v8 are 26-bit vectors
vsld 5, 5, 31
vor 20, 4, 5
vspltisb 11, 12
vsrd 12, 6, 11
vsld 6, 6, 31
vsld 6, 6, 31
vor 20, 20, 6
vspltisb 11, 14
vsld 7, 7, 11
vor 21, 7, 12
mfvsrld 16, 40 # save h2 (top 2 bits)
vsld 8, 8, 11
vsld 8, 8, 31
vor 21, 21, 8
mfvsrld 17, 52
mfvsrld 19, 53
srdi 16, 16, 24

std 17, 0(3)
std 19, 8(3)
stw 16, 16(3)

Out_loop:
li 3, 0

RESTORE_REGS

blr

Out_no_poly1305:
li 3, 0
blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 with 64 x 64 bit multiplications.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
# mask 0x0FFFFFFC0FFFFFFC
# mask 0x0FFFFFFC0FFFFFFF
addis 10, 2, rmask@toc@ha
addi 10, 10, rmask@toc@l
ld 11, 0(10)
ld 12, 8(10)

# initialize
# load key from r3
ld 9, 24(3)
ld 10, 32(3)
and. 9, 9, 11 # clamp mask r0
and. 10, 10, 12 # clamp mask r1

srdi 21, 10, 2
add 19, 21, 10 # s1: r19 = r1 + (r1 >> 2) = (r1 >> 2) * 5

# setup r and s
li 25, 0
mtvsrdd 32+0, 9, 19 # r0, s1
mtvsrdd 32+1, 10, 9 # r1, r0
mtvsrdd 32+2, 19, 25 # s1
mtvsrdd 32+3, 9, 25 # r0

blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
#
# d0 = h0 * r0 + h1 * s1
vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1

# d1 = h0 * r1 + h1 * r0 + h2 * s1
vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
vmsumudm 10, 8, 2, 11 # d1 += h2 * s1

# d2 = h2 * r0
vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
blr
SYM_FUNC_END(Poly1305_mult)
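
#
# C sketch of the three vmsumudm products (illustrative; u128 is
# unsigned __int128):
#
#   #include <stdint.h>
#   typedef unsigned __int128 u128;
#
#   /* h = h0 + h1*2^64 + h2*2^128, r = r0 + r1*2^64, s1 = (r1 >> 2) * 5.
#    * Terms that land at 2^128 and above are folded down using
#    * 2^130 == 5 (mod p), which is where s1 comes from. */
#   static void poly1305_mult_64(uint64_t h0, uint64_t h1, uint64_t h2,
#                                uint64_t r0, uint64_t r1, uint64_t s1,
#                                u128 d[3])
#   {
#       d[0] = (u128)h0 * r0 + (u128)h1 * s1;
#       d[1] = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;
#       d[2] = (u128)h2 * r0;   /* small: h2 has only a few bits */
#   }
#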

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
mfvsrld 27, 32+7
mfvsrld 28, 32+10
mfvsrld 29, 32+11
mfvsrd 20, 32+7 # h0.h
mfvsrd 21, 32+10 # h1.h

addc 28, 28, 20
adde 29, 29, 21
srdi 22, 29, 0x2
sldi 23, 22, 0x2
add 23, 23, 22 # (h2 >> 2) * 5
addc 27, 27, 23 # h0
addze 28, 28 # h1
andi. 29, 29, 0x3 # h2
blr
SYM_FUNC_END(Carry_reduction)
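
#
# C sketch of this reduction (illustrative; like the assembly, it relies
# on h2 being small so the final carries cannot overflow):
#
#   static void poly1305_reduce_64(const u128 d[3],
#                                  uint64_t *h0, uint64_t *h1, uint64_t *h2)
#   {
#       uint64_t t0 = (uint64_t)d[0];
#       uint64_t t1 = (uint64_t)d[1];
#       uint64_t t2 = (uint64_t)d[2];
#       u128 c;
#
#       c = (u128)t1 + (uint64_t)(d[0] >> 64);              /* addc */
#       t1 = (uint64_t)c;
#       t2 += (uint64_t)(d[1] >> 64) + (uint64_t)(c >> 64); /* adde */
#
#       c = (u128)t0 + (t2 >> 2) * 5;  /* bits >= 2^130 fold back as *5 */
#       *h0 = (uint64_t)c;
#       *h1 = t1 + (uint64_t)(c >> 64);                     /* addze */
#       *h2 = t2 & 3;
#   }
#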

#
# poly1305 multiplication
# h *= r, h %= p
# d0 = h0 * r0 + h1 * s1
# d1 = h0 * r1 + h1 * r0 + h2 * s1
# d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
# - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
cmpdi 5, 0
ble Out_no_poly1305_64

mflr 0
std 0, 16(1)
stdu 1,-400(1)

SAVE_GPR 14, 112, 1
SAVE_GPR 15, 120, 1
SAVE_GPR 16, 128, 1
SAVE_GPR 17, 136, 1
SAVE_GPR 18, 144, 1
SAVE_GPR 19, 152, 1
SAVE_GPR 20, 160, 1
SAVE_GPR 21, 168, 1
SAVE_GPR 22, 176, 1
SAVE_GPR 23, 184, 1
SAVE_GPR 24, 192, 1
SAVE_GPR 25, 200, 1
SAVE_GPR 26, 208, 1
SAVE_GPR 27, 216, 1
SAVE_GPR 28, 224, 1
SAVE_GPR 29, 232, 1
SAVE_GPR 30, 240, 1
SAVE_GPR 31, 248, 1

# Init poly1305
bl Poly1305_init_64

li 25, 0 # offset to inp and outp

add 11, 25, 4

# load h
# h0, h1, h2
ld 27, 0(3)
ld 28, 8(3)
lwz 29, 16(3)

li 30, 16
divdu 31, 5, 30

mtctr 31

mr 24, 6 # highbit

Loop_block_64:
vxor 9, 9, 9

ld 20, 0(11)
ld 21, 8(11)
addi 11, 11, 16

addc 27, 27, 20
adde 28, 28, 21
adde 29, 29, 24

li 22, 0
mtvsrdd 32+6, 27, 28 # h0, h1
mtvsrdd 32+8, 29, 22 # h2

bl Poly1305_mult

bl Carry_reduction

bdnz Loop_block_64

std 27, 0(3)
std 28, 8(3)
stw 29, 16(3)

li 3, 0

RESTORE_GPR 14, 112, 1
RESTORE_GPR 15, 120, 1
RESTORE_GPR 16, 128, 1
RESTORE_GPR 17, 136, 1
RESTORE_GPR 18, 144, 1
RESTORE_GPR 19, 152, 1
RESTORE_GPR 20, 160, 1
RESTORE_GPR 21, 168, 1
RESTORE_GPR 22, 176, 1
RESTORE_GPR 23, 184, 1
RESTORE_GPR 24, 192, 1
RESTORE_GPR 25, 200, 1
RESTORE_GPR 26, 208, 1
RESTORE_GPR 27, 216, 1
RESTORE_GPR 28, 224, 1
RESTORE_GPR 29, 232, 1
RESTORE_GPR 30, 240, 1
RESTORE_GPR 31, 248, 1

addi 1, 1, 400
ld 0, 16(1)
mtlr 0

blr

Out_no_poly1305_64:
li 3, 0
blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
ld 10, 0(3)
ld 11, 8(3)
ld 12, 16(3)

# compare modulus
# h + 5 + (-p)
mr 6, 10
mr 7, 11
mr 8, 12
addic. 6, 6, 5
addze 7, 7
addze 8, 8
srdi 9, 8, 2 # overflow?
cmpdi 9, 0
beq Skip_h64
mr 10, 6
mr 11, 7
mr 12, 8

Skip_h64:
ld 6, 0(4)
ld 7, 8(4)
addc 10, 10, 6
adde 11, 11, 7
addze 12, 12

std 10, 0(5)
std 11, 8(5)
blr
SYM_FUNC_END(poly1305_emit_64)
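
#
# C sketch of the final reduction and tag (illustrative; the conditional
# mirrors the beq above):
#
#   #include <stdint.h>
#   typedef unsigned __int128 u128;
#
#   static void poly1305_emit(const uint64_t h[3], const uint64_t s[2],
#                             uint64_t mac[2])
#   {
#       /* g = h + 5; if that carries past bit 129 then h >= p, and
#        * g - 2^130 = h - p is the reduced value. */
#       u128 c = (u128)h[0] + 5;
#       uint64_t g0 = (uint64_t)c;
#       c = (u128)h[1] + (uint64_t)(c >> 64);
#       uint64_t g1 = (uint64_t)c;
#       uint64_t g2 = h[2] + (uint64_t)(c >> 64);
#       uint64_t m0 = (g2 >> 2) ? g0 : h[0];
#       uint64_t m1 = (g2 >> 2) ? g1 : h[1];
#
#       c = (u128)m0 + s[0];    /* mac = (h + s) mod 2^128 */
#       mac[0] = (uint64_t)c;
#       mac[1] = m1 + s[1] + (uint64_t)(c >> 64);
#   }
#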

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 # 26-bit limb mask (v25)
.long 0x1a, 0x00, 0x1a, 0x00 # 26, the limb shift count (v31)
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 # 1 << 24, per-block 2^128 bit (v19)
.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 # vperm indices for message words (vs24)
.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f # vperm indices for message words (vs25)
SYM_DATA_END(RMASK)