#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

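# FIPS 180-4, with the rotation amounts listed above:
#   Sigma0(x) = (x ror 2)  ^ (x ror 13) ^ (x ror 22)
#   Sigma1(x) = (x ror 6)  ^ (x ror 11) ^ (x ror 25)
#   sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
# The third element of @sigma0/@sigma1 is a plain shift (lsr below),
# not a rotation. BODY_00_15 emits one round of the compression loop:
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]; d += T1;
#   h  = T1 + Sigma0(a) + Maj(a,b,c)
# with each round's Maj(a,b,c) folded into the following round
# ("h+=Maj(a,b,c) from the past").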
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

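# BODY_16_XX computes the message-schedule recurrence
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# in the 16-word circular buffer X[] kept on the stack: index ($i+1)%16
# holds W[i-15], ($i+14)%16 holds W[i-2] and ($i+9)%16 holds W[i-7].
# It then falls through into BODY_00_15 for the round itself.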
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
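@ The 64 round constants are the first 32 bits of the fractional parts
@ of the cube roots of the first 64 primes (FIPS 180-4, section 4.2.2).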
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
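@ The zero word above marks the end of K256: the NEON loop exits when
@ the constant it peeked at is zero (teq), while the integer loop
@ compares the low byte of the round constant it just loaded with 0xf2,
@ the low byte of the final constant 0xc67178f2.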
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
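	@ Frame from here on: X[0..15] at sp..sp+63, saved ctx pointer at
	@ sp+64, current input pointer at sp+68 and the end-of-input
	@ pointer at sp+72 (ctx, inp and the precomputed end pointer were
	@ pushed by the stmdb above).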
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
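# Dlo/Dhi map a quad register name to its low/high doubleword half
# (NEON register q<n> aliases the pair d<2n>:d<2n+1>).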
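# AUTOLOAD catches calls to otherwise undefined subs such as &vext_8()
# and appends the corresponding instruction to $code: the first
# underscore of the sub name becomes a dot ("vext.8"), the arguments
# become operands, and a purely numeric final argument is prefixed with
# "#" to form an immediate.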
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

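# Xupdate emits the NEON message-schedule step for four W words at a
# time (the vshr/vsli pairs implement the rotations of sigma0/sigma1)
# and interleaves the scalar round fragments supplied by $body between
# the vector instructions to hide their latency.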
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

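# body_00_15 returns one scalar round as a list of small Perl snippets;
# Xupdate/Xpreload eval a few of them between NEON instructions so the
# integer pipeline works in the vector unit's shadow. $j counts rounds,
# and $t2/$t3 swap roles every round exactly as in the integer-only code.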
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
461
#if __ARM_MAX_ARCH__>=7
462
.arch armv7-a
463
.fpu neon
464
465
.global sha256_block_data_order_neon
466
.type sha256_block_data_order_neon,%function
467
.align 4
468
sha256_block_data_order_neon:
469
.LNEON:
470
stmdb sp!,{r4-r12,lr}
471
472
sub $H,sp,#16*4+16
473
adr $Ktbl,.Lsha256_block_data_order
474
sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
475
bic $H,$H,#15 @ align for 128-bit stores
476
mov $t2,sp
477
mov sp,$H @ alloca
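	@ New frame: 16-word W[] transfer area at sp..sp+63, saved ctx at
	@ sp+64, inp at sp+68, end-of-input at sp+72 and the caller's
	@ original sp at sp+76 (reloaded before returning).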
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

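# This path relies on the ARMv8 Crypto Extensions: sha256h/sha256h2
# each perform four rounds of the compression function, while
# sha256su0/sha256su1 update the message schedule. The mnemonics are
# emitted as raw INST() byte sequences by unsha256() below so that
# assemblers predating ARMv8 still accept the file.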
$code.=<<___;
592
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
593
594
# ifdef __thumb2__
595
# define INST(a,b,c,d) .byte c,d|0xc,a,b
596
# else
597
# define INST(a,b,c,d) .byte a,b,c,d
598
# endif
599
600
.type sha256_block_data_order_armv8,%function
601
.align 5
602
sha256_block_data_order_armv8:
603
.LARMv8:
604
vld1.32 {$ABCD,$EFGH},[$ctx]
605
# ifdef __thumb2__
606
adr $Ktbl,.LARMv8
607
sub $Ktbl,$Ktbl,#.LARMv8-K256
608
# else
609
adrl $Ktbl,K256
610
# endif
611
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
612
613
.Loop_v8:
614
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
615
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
616
vld1.32 {$W0},[$Ktbl]!
617
vrev32.8 @MSG[0],@MSG[0]
618
vrev32.8 @MSG[1],@MSG[1]
619
vrev32.8 @MSG[2],@MSG[2]
620
vrev32.8 @MSG[3],@MSG[3]
621
vmov $ABCD_SAVE,$ABCD @ offload
622
vmov $EFGH_SAVE,$EFGH
623
teq $inp,$len
624
___
625
for($i=0;$i<12;$i++) {
626
$code.=<<___;
627
vld1.32 {$W1},[$Ktbl]!
628
vadd.i32 $W0,$W0,@MSG[0]
629
sha256su0 @MSG[0],@MSG[1]
630
vmov $abcd,$ABCD
631
sha256h $ABCD,$EFGH,$W0
632
sha256h2 $EFGH,$abcd,$W0
633
sha256su1 @MSG[0],@MSG[2],@MSG[3]
634
___
635
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
636
}
637
$code.=<<___;
638
vld1.32 {$W1},[$Ktbl]!
639
vadd.i32 $W0,$W0,@MSG[0]
640
vmov $abcd,$ABCD
641
sha256h $ABCD,$EFGH,$W0
642
sha256h2 $EFGH,$abcd,$W0
643
644
vld1.32 {$W0},[$Ktbl]!
645
vadd.i32 $W1,$W1,@MSG[1]
646
vmov $abcd,$ABCD
647
sha256h $ABCD,$EFGH,$W1
648
sha256h2 $EFGH,$abcd,$W1
649
650
vld1.32 {$W1},[$Ktbl]
651
vadd.i32 $W0,$W0,@MSG[2]
652
sub $Ktbl,$Ktbl,#256-16 @ rewind
653
vmov $abcd,$ABCD
654
sha256h $ABCD,$EFGH,$W0
655
sha256h2 $EFGH,$abcd,$W0
656
657
vadd.i32 $W1,$W1,@MSG[3]
658
vmov $abcd,$ABCD
659
sha256h $ABCD,$EFGH,$W1
660
sha256h2 $EFGH,$abcd,$W1
661
662
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
663
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
664
it ne
665
bne .Loop_v8
666
667
vst1.32 {$ABCD,$EFGH},[$ctx]
668
669
ret @ bx lr
670
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
671
#endif
672
___
673
}}}
674
$code.=<<___;
675
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
676
.align 2
677
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
678
.comm OPENSSL_armcap_P,4,4
679
#endif
680
___
681
682
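# Reproduce this script's comment header (license and history) at the
# top of the generated file, converting Perl "#" comments to assembler
# "@" comments; copying stops at the first line that is neither a
# comment nor blank.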
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
}

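# Final pass over the generated code: evaluate the `...` constructs
# (compile-time arithmetic such as the ror# amounts), replace sha256*
# mnemonics with their INST() byte encodings, and rewrite ret/bx lr so
# the result can still be assembled with -march=armv4.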
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush