Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/mips/poly1305-mips.pl
26282 views
1
#!/usr/bin/env perl
2
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
3
#
4
# ====================================================================
5
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
6
# project.
7
# ====================================================================
8
9
# Poly1305 hash for MIPS.
10
#
11
# May 2016
12
#
13
# Numbers are cycles per processed byte with poly1305_blocks alone.
14
#
15
# IALU/gcc
16
# R1x000 ~5.5/+130% (big-endian)
17
# Octeon II 2.50/+70% (little-endian)
18
#
19
# March 2019
20
#
21
# Add 32-bit code path.
22
#
23
# October 2019
24
#
25
# Modulo-scheduling the reduction makes it possible to omit the dependency chain at the
26
# end of inner loop and improve performance. Also optimize MIPS32R2
27
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
28
#
29
# IALU/gcc
30
# R1x000 ~9.8/? (big-endian)
31
# Octeon II 3.65/+140% (little-endian)
32
# MT7621/1004K 4.75/? (little-endian)
33
#
34
######################################################################
35
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
36
# widely used. Then there is a new contender: NUBI. It appears that if
37
# one picks the latter, it's possible to arrange code in ABI neutral
38
# manner. Therefore let's stick to NUBI register layout:
39
#
40
# NUBI register layout.  Each symbolic name is bound to the "$N" spelling
# of a register number; the assembly templates interpolate these strings.
# (Stray viewer line numbers that sat between these statements were removed;
# as bare numeric statements they broke parsing.)
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
44
#
45
# The return value is placed in $a0. Following coding rules facilitate
46
# interoperability:
47
#
48
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
49
# excluded from the rule, because it's specified volatile];
50
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
51
# old code];
52
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
53
#
54
# For reference here is register layout for N32/64 MIPS ABIs:
55
#
56
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
57
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
58
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
59
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
60
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
61
#
62
# <[email protected]>
63
#
64
######################################################################
65
66
# The first command-line argument selects the target ABI and defaults to
# "64".  Supported flavours are o32, n32, 64, nubi32 and nubi64.
$flavour = shift || "64";

# Register that carries the return value: $a0 for the NUBI flavours,
# $t0 for all others.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
69
70
if ($flavour =~ /64|n32/i) {{{
71
######################################################################
72
# 64-bit code path
73
#
74
75
# Incoming arguments and the scratch registers shared by the 64-bit routines.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# Preprocessor prelude followed by poly1305_init.  The routine zeroes the
# three 64-bit hash words at ctx+0/8/16.  When the key pointer is non-zero
# it loads 16 key bytes (aligned loads plus shifts on R6, ldl/ldr
# otherwise), byte-swaps them on big-endian builds, masks them with
# 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc, and stores r0, r1 and
# s1 = r1 + (r1 >> 2) at ctx+24/32/40.  It always returns 0.
# The template no longer carries the interleaved viewer line numbers, which
# would otherwise have been emitted into the assembly.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)	dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef __KERNEL__
# define poly1305_init poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arch
# define poly1305_emit poly1305_emit_arch
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7	# $inp % 8
	dsubu	$inp,$inp,$tmp0	# align $inp
	sll	$tmp0,$tmp0,3	# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0	# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2	# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8	# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32	# 0x0000000100000000
	daddiu	$tmp0,-63	# 0x00000000ffffffc1
	dsll	$tmp0,28	# 0x0ffffffc10000000
	daddiu	$tmp0,-1	# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3	# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1	# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0	# return 0
	jr	$ra
.end	poly1305_init
___
223
{
224
# Bit mask of saved registers handed to the .mask directive; the NUBI
# flavours save $s0-$s3 in addition to $s4/$s5 (see the conditional
# template at the end of this run).
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Hash limbs, key limbs and accumulators used by the 64-bit block function.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

# poly1305_blocks: shifts the length right by 4 and returns immediately when
# no complete 16-byte block remains, otherwise branches to
# poly1305_blocks_internal, whose stack frame is set up here.
# The template no longer carries the interleaved viewer line numbers.
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4	# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
267
# Body of poly1305_blocks_internal: loads the hash value (ctx+0/8/16) and
# key (ctx+24/32/40), then per 16-byte block loads the input (byte-swapped
# on big-endian builds), adds it and the pad bit to the hash, multiplies by
# the key, and finally stores the hash back.  The reduction is
# modulo-scheduled: the top bits of h2 are folded in at the start of the
# next iteration.  Ends with the first half of the register restore.
# The template no longer carries the interleaved viewer line numbers.
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr	# align $inp
	sll	$shr,$shr,3	# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)	# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)	# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp	# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)	# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0	# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2	# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8	# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	dsrl	$tmp1,$h2,2	# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0	# accumulate input
	daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1	# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)	# h0*r0
	daddu	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)	# h1*5*r1
	daddu	$d2,$tmp1
	daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)	# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	daddu	$h0,$tmp0
	daddu	$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)	# h1*r0
	daddu	$h1,$tmp0
	daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)	# h2*5*r1
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)	# h2*r0
	daddu	$h1,$tmp0
	daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)	# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)	# epilogue
	ld	$s4,32($sp)
___
430
# Tail of poly1305_blocks_internal: the NUBI flavours additionally restore
# $s0-$s3, then the routine returns and the stack pointer is released in
# the delay slot of the jump.
# The templates no longer carry the interleaved viewer line numbers.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
445
}
446
{
# Arguments of the 64-bit poly1305_emit routine.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

# poly1305_emit: loads the three hash words from ctx+0/8/16, performs the
# final reduction, selects between h and h+5 with a mask derived from the
# carry out of that addition, adds the 128-bit nonce (four 32-bit words
# read with lwu) and writes the 16-byte tag to the mac buffer one byte at a
# time, least-significant byte first.
# The template no longer carries the interleaved viewer line numbers.
$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	li	$in0,-4	# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	daddiu	$in0,$tmp0,5	# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2	# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)	# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0	# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8	# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
541
}}} else {{{
542
######################################################################
543
# 32-bit code path
544
#
545
546
# Incoming arguments and the scratch registers shared by the 32-bit routines.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

# Preprocessor prelude followed by the 32-bit poly1305_init.  The routine
# zeroes the five 32-bit hash words at ctx+0..16.  When the key pointer is
# non-zero it loads 16 key bytes (aligned loads plus shifts on R6, lwl/lwr
# otherwise), byte-swaps them on big-endian builds, masks them with
# 0x0fffffff / 0x0ffffffc, stores r0..r3 at ctx+20..32 and
# s[i] = r[i] + (r[i] >> 2) for i = 1..3 at ctx+36..44.  It always returns 0.
# The template no longer carries the interleaved viewer line numbers.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef __KERNEL__
# define poly1305_init poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arch
# define poly1305_emit poly1305_emit_arch
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3	# $inp % 4
	subu	$inp,$inp,$tmp0	# align $inp
	sll	$tmp0,$tmp0,3	# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0	# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24	# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in1,8
	or	$in0,$tmp1
	andi	$tmp1,$in1,0xFF00
	sll	$in1,$in1,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in1,$tmp0
	srl	$tmp0,$in2,24
	or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in3,8
	or	$in2,$tmp1
	andi	$tmp1,$in3,0xFF00
	sll	$in3,$in3,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in3,$tmp0
	or	$tmp2,$tmp1
	or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff	# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3	# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1	# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
730
{
731
# Bit mask of saved registers handed to the .mask directive; the NUBI
# flavours save $s0-$s3 in addition to $s4-$s11.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

# Hash limbs, key limbs, pre-scaled key limbs and input accumulators used
# by the 32-bit block function.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

# Entry and prologue of the 32-bit poly1305_blocks.  The frame is twelve
# words: .frame now states 12*4 to agree with the "subu $sp,$sp,4*12"
# allocation and with the 4*12 released in the epilogue (it previously
# declared 16*4).  NOTE(review): .frame is assembler metadata only, so no
# executable instruction changes - confirm no consumer relied on the old
# value.
# The templates no longer carry the interleaved viewer line numbers.
$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,12*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
764
# Body of the 32-bit poly1305_blocks: branches to .Labort when no complete
# 16-byte block is left, otherwise loads the hash (ctx+0..16) and key
# material (ctx+20..44) and processes one block per .Loop iteration.
# Two multiply strategies are emitted: multu/maddu accumulation for
# MIPS32R2 (not R6), and explicit carry propagation through the
# multu()/mflo()/mfhi() macros everywhere else.  The reduction is
# modulo-scheduled into the start of the following iteration.  Ends with
# the first half of the register restore at .Labort.
# The template no longer carries the interleaved viewer line numbers.
$code.=<<___;
	.set	reorder

	srl	$len,4	# number of complete blocks
	li	$one,1
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr	# align $inp
	sll	$shr,$shr,3	# byte to bit offset
#endif

	lw	$h0,0($ctx)	# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)	# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp	# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)	# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)	# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0	# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24	# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	srl	$at,$d1,24
	or	$t0,$t1
	srl	$t1,$d1,8
	or	$d0,$t0
	andi	$t0,$d1,0xFF00
	sll	$d1,$d1,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d1,$at
	srl	$at,$d2,24
	or	$t1,$t0
	srl	$t0,$d2,8
	or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	srl	$at,$d3,24
	or	$t0,$t1
	srl	$t1,$d3,8
	or	$d2,$t0
	andi	$t0,$d3,0xFF00
	sll	$d3,$d3,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d3,$at
	or	$t1,$t0
	or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2	# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0	# accumulate input
	addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0	# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	addu	$h0,$h0,$at	# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	addu	$h1,$h1,$h0	# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	addu	$h2,$h2,$h1	# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0	# d0*r0
	sltu	$h2,$d3,$h2
	maddu	$rs3,$d1	# d1*s3
	addu	$h3,$h3,$h2	# carry
	maddu	$rs2,$d2	# d2*s2
	addu	$h4,$h4,$padbit
	maddu	$rs1,$d3	# d3*s1
	addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0	# d0*r1
	maddu	$r0,$d1	# d1*r0
	maddu	$rs3,$d2	# d2*s3
	maddu	$rs2,$d3	# d3*s2
	maddu	$rs1,$h4	# h4*s1
	maddu	$at,$one	# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0	# d0*r2
	maddu	$r1,$d1	# d1*r1
	maddu	$r0,$d2	# d2*r0
	maddu	$rs3,$d3	# d3*s3
	maddu	$rs2,$h4	# h4*s2
	maddu	$at,$one	# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4	# h4*r0

	multu	$r3,$d0	# d0*r3
	maddu	$r2,$d1	# d1*r2
	maddu	$r1,$d2	# d2*r1
	maddu	$r0,$d3	# d3*r0
	maddu	$rs3,$h4	# h4*s3
	maddu	$at,$one	# hi*1
	mfhi	$at
	mflo	$h3

	addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)	# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	sltu	$h2,$d3,$h2
	addu	$h3,$h3,$h2	# carry

	multu	($rs3,$d1)	# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	addu	$h4,$h4,$padbit
	addiu	$inp,$inp,16
	addu	$h4,$h4,$h3

	multu	($rs2,$d2)	# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($rs1,$d3)	# d3*s1
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	addu	$h0,$h0,$a3
	addu	$h1,$h1,$t1
	multu	($r1,$d0)	# d0*r1
	sltu	$a3,$h0,$a3
	addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($r0,$d1)	# d1*r0
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($rs3,$d2)	# d2*s3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($rs2,$d3)	# d3*s2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	addu	$h1,$h1,$a3
	addu	$h2,$h2,$t1
	multu	($rs1,$h4)	# h4*s1
	sltu	$a3,$h1,$a3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($r2,$d0)	# d0*r2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($r1,$d1)	# d1*r1
	addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r0,$d2)	# d2*r0
	addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($rs3,$d3)	# d3*s3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	addu	$h2,$h2,$at
	addu	$h3,$h3,$t0
	multu	($rs2,$h4)	# h4*s2
	sltu	$at,$h2,$at
	addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($r3,$d0)	# d0*r3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r2,$d1)	# d1*r2
	addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	multu	($r0,$d3)	# d3*r0
	addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r1,$d2)	# d2*r1
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	addu	$h3,$h3,$a3
	addu	$t1,$t1,$d3
	multu	($rs3,$h4)	# h4*s3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r0,$h4)	# h4*r0
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1	# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)	# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
1140
# Tail of the 32-bit poly1305_blocks: the NUBI flavours additionally restore
# $s0-$s3, then the routine returns and releases its twelve-word frame in
# the delay slot of the jump.  (The trailing comment below used to say
# "prologue" although this is the epilogue.)
# The templates no longer carry the interleaved viewer line numbers.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
1151
}
1152
{
# Arguments of the 32-bit poly1305_emit routine, plus one extra scratch
# register ($a3) for the fifth hash word.
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

# poly1305_emit: loads the five hash words from ctx+0..16, performs the
# final reduction, selects between h and h+5 with a mask derived from the
# carry out of that addition, adds the four nonce words with carry
# propagation and writes the 16-byte tag to the mac buffer one byte at a
# time, least-significant byte first.  The $ctx register is reused as a
# scratch/carry register once the hash words have been loaded.
# The template no longer carries the interleaved viewer line numbers.
$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4	# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiu	$in0,$tmp0,5	# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addu	$ctx,$tmp4

	srl	$ctx,2	# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)	# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0	# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8	# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
1269
}}}
1270
1271
# The last remaining command-line argument, if any, names the output file;
# without one the generated assembly goes to the inherited STDOUT.
$output = pop;
if ($output) {
	# Three-argument open keeps the file name from being parsed for a mode,
	# and a failure to create the file is now reported instead of silently
	# printing to the original STDOUT.
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
print $code;
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failed close must abort with a non-zero exit status.
close(STDOUT) or die "error closing STDOUT: $!";
1274
1275