#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

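# FIPS 180-4, with the rotation amounts listed above:
#   Sigma0(x) = (x ror 2)  ^ (x ror 13) ^ (x ror 22)
#   Sigma1(x) = (x ror 6)  ^ (x ror 11) ^ (x ror 25)
#   sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#   sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
# The third element of @sigma0/@sigma1 is a plain shift (lsr below),
# not a rotation. BODY_00_15 emits one round of the compression loop:
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]; d += T1;
#   h  = T1 + Sigma0(a) + Maj(a,b,c)
# with each round's Maj(a,b,c) folded into the following round
# ("h+=Maj(a,b,c) from the past").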
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

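# BODY_16_XX computes the message-schedule recurrence
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# in the 16-word circular buffer X[] kept on the stack: index ($i+1)%16
# holds W[i-15], ($i+14)%16 holds W[i-2] and ($i+9)%16 holds W[i-7].
# It then falls through into BODY_00_15 for the round itself.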
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
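@ The 64 round constants are the first 32 bits of the fractional parts
@ of the cube roots of the first 64 primes (FIPS 180-4, section 4.2.2).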
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
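@ The zero word above marks the end of K256: the NEON loop exits when
@ the constant it peeked at is zero (teq), while the integer loop
@ compares the low byte of the round constant it just loaded with 0xf2,
@ the low byte of the final constant 0xc67178f2.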
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
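	@ Frame from here on: X[0..15] at sp..sp+63, saved ctx pointer at
	@ sp+64, current input pointer at sp+68 and the end-of-input
	@ pointer at sp+72 (ctx, inp and the precomputed end pointer were
	@ pushed by the stmdb above).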
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
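# Dlo/Dhi map a quad register name to its low/high doubleword half
# (NEON register q<n> aliases the pair d<2n>:d<2n+1>).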
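# AUTOLOAD catches calls to otherwise undefined subs such as &vext_8()
# and appends the corresponding instruction to $code: the first
# underscore of the sub name becomes a dot ("vext.8"), the arguments
# become operands, and a purely numeric final argument is prefixed with
# "#" to form an immediate.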
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

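# Xupdate emits the NEON message-schedule step for four W words at a
# time (the vshr/vsli pairs implement the rotations of sigma0/sigma1)
# and interleaves the scalar round fragments supplied by $body between
# the vector instructions to hide their latency.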
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

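# body_00_15 returns one scalar round as a list of small Perl snippets;
# Xupdate/Xpreload eval a few of them between NEON instructions so the
# integer pipeline works in the vector unit's shadow. $j counts rounds,
# and $t2/$t3 swap roles every round exactly as in the integer-only code.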
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
461
#if __ARM_MAX_ARCH__>=7
462
.arch armv7-a
463
.fpu neon
464
465
.global sha256_block_data_order_neon
466
.type sha256_block_data_order_neon,%function
467
.align 4
468
sha256_block_data_order_neon:
469
.LNEON:
470
stmdb sp!,{r4-r12,lr}
471
472
sub $H,sp,#16*4+16
473
adr $Ktbl,.Lsha256_block_data_order
474
sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
475
bic $H,$H,#15 @ align for 128-bit stores
476
mov $t2,sp
477
mov sp,$H @ alloca
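	@ New frame: 16-word W[] transfer area at sp..sp+63, saved ctx at
	@ sp+64, inp at sp+68, end-of-input at sp+72 and the caller's
	@ original sp at sp+76 (reloaded before returning).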
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

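# This path relies on the ARMv8 Crypto Extensions: sha256h/sha256h2
# each perform four rounds of the compression function, while
# sha256su0/sha256su1 update the message schedule. The mnemonics are
# emitted as raw INST() byte sequences by unsha256() below so that
# assemblers predating ARMv8 still accept the file.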
$code.=<<___;
592
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
593
594
# ifdef __thumb2__
595
# define INST(a,b,c,d) .byte c,d|0xc,a,b
596
# else
597
# define INST(a,b,c,d) .byte a,b,c,d
598
# endif
599
600
.type sha256_block_data_order_armv8,%function
601
.align 5
602
sha256_block_data_order_armv8:
603
.LARMv8:
604
vld1.32 {$ABCD,$EFGH},[$ctx]
605
# ifdef __thumb2__
606
adr $Ktbl,.LARMv8
607
sub $Ktbl,$Ktbl,#.LARMv8-K256
608
# else
609
adrl $Ktbl,K256
610
# endif
611
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
612
613
.Loop_v8:
614
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
615
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
616
vld1.32 {$W0},[$Ktbl]!
617
vrev32.8 @MSG[0],@MSG[0]
618
vrev32.8 @MSG[1],@MSG[1]
619
vrev32.8 @MSG[2],@MSG[2]
620
vrev32.8 @MSG[3],@MSG[3]
621
vmov $ABCD_SAVE,$ABCD @ offload
622
vmov $EFGH_SAVE,$EFGH
623
teq $inp,$len
624
___
625
for($i=0;$i<12;$i++) {
626
$code.=<<___;
627
vld1.32 {$W1},[$Ktbl]!
628
vadd.i32 $W0,$W0,@MSG[0]
629
sha256su0 @MSG[0],@MSG[1]
630
vmov $abcd,$ABCD
631
sha256h $ABCD,$EFGH,$W0
632
sha256h2 $EFGH,$abcd,$W0
633
sha256su1 @MSG[0],@MSG[2],@MSG[3]
634
___
635
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
636
}
637
$code.=<<___;
638
vld1.32 {$W1},[$Ktbl]!
639
vadd.i32 $W0,$W0,@MSG[0]
640
vmov $abcd,$ABCD
641
sha256h $ABCD,$EFGH,$W0
642
sha256h2 $EFGH,$abcd,$W0
643
644
vld1.32 {$W0},[$Ktbl]!
645
vadd.i32 $W1,$W1,@MSG[1]
646
vmov $abcd,$ABCD
647
sha256h $ABCD,$EFGH,$W1
648
sha256h2 $EFGH,$abcd,$W1
649
650
vld1.32 {$W1},[$Ktbl]
651
vadd.i32 $W0,$W0,@MSG[2]
652
sub $Ktbl,$Ktbl,#256-16 @ rewind
653
vmov $abcd,$ABCD
654
sha256h $ABCD,$EFGH,$W0
655
sha256h2 $EFGH,$abcd,$W0
656
657
vadd.i32 $W1,$W1,@MSG[3]
658
vmov $abcd,$ABCD
659
sha256h $ABCD,$EFGH,$W1
660
sha256h2 $EFGH,$abcd,$W1
661
662
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
663
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
664
it ne
665
bne .Loop_v8
666
667
vst1.32 {$ABCD,$EFGH},[$ctx]
668
669
ret @ bx lr
670
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
671
#endif
672
___
673
}}}
674
$code.=<<___;
675
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
676
.align 2
677
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
678
.comm OPENSSL_armcap_P,4,4
679
#endif
680
___
681
682
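# Reproduce this script's comment header (license and history) at the
# top of the generated file, converting Perl "#" comments to assembler
# "@" comments; copying stops at the first line that is neither a
# comment nor blank.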
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
}

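# Final pass over the generated code: evaluate the `...` constructs
# (compile-time arithmetic such as the ror# amounts), replace sha256*
# mnemonics with their INST() byte encodings, and rewrite ret/bx lr so
# the result can still be assembled with -march=armv4.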
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush