#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the
# caller is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
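# $hi/$lo expand to the HI/LO assembler symbols defined in the emitted
# preamble below (LO is 0 and HI is 4 on little-endian targets, and the
# other way round on big-endian), so an operand like "[$ctx,#$Eoff+$lo]"
# always addresses the low word of a 64-bit value.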

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
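
# The first argument that looks like a file name is taken as the output
# file; everything the script prints below ends up there, e.g.
# (hypothetical invocation):
#
#   perl sha512-armv4.pl sha512-core.S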

$ctx="r0"; # parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
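
# Stack frame: one 8-byte slot per working variable a..h at $Aoff..$Hoff.
# Each round stores the current message-schedule word at sp+#$Xoff and then
# drops sp by 8 ("sub sp,sp,#8" in BODY_00_15), so older schedule entries
# sit at higher offsets; the 640 bytes accumulated over 80 rounds are
# released again with "add sp,sp,#640" after each block.
#
# BODY_00_15 below emits one SHA-512 round on 32-bit register pairs:
# T = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i], then d += T and the new
# a = T + Sigma0(a) + Maj(a,b,c). $magic is compared against the low byte
# of K[i] (0x94 for K[15], 0x17 for K[79]) to set bit 0 of $Ktbl as an
# end-of-phase flag, which the callers test with "tst $Ktbl,#1".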

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi

eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi

adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic

ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH__>=7
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)

ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code 32
# endif
#endif

.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
#else
.skip 32
#endif

.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
#else
adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
bne .LNEON
#endif
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8

ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25

@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26

ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1

ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1

ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]

ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]

ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]

ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]

add sp,sp,#640
sub $Ktbl,$Ktbl,#640

teq $inp,$len
bne .Loop

add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
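
# NEON register layout (a sketch of what the code below relies on): the
# eight hash words a..h stay in d16-d23 for the whole block, the 16-entry
# message schedule circulates through d0-d15 (viewed as q0-q7 by
# NEON_16_79), and d24-d31 serve as scratch for Sigma/Ch/Maj and K[i].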

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps

$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
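
# Note: the final "h += Maj(a,b,c)" of each round is deliberately left
# pending (the commented-out vadd above); it is folded into the following
# round as "h+=Maj from the past", and applied once more after the last
# round ("vadd.i64 $A,d30" before the context is saved), presumably to
# keep it off the critical path of the current round.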

sub NEON_16_79() {
my $i=shift;

if ($i&1) { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
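
# NEON_16_79 performs two consecutive rounds' schedule updates at once by
# treating pairs of X[] d-registers as q-registers, and it interleaves the
# opening Sigma1 shifts of the following NEON_00_15 round (the lines
# marked "from NEON_00_15" above); odd rounds fall straight through to
# NEON_00_15.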

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
VFP_ABI_PUSH
adr $Ktbl,.Lsha512_block_data_order
sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon

vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon

VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
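
# Post-processing of the generated text: back-quoted expressions are
# evaluated in Perl, "bx lr" is replaced with its raw instruction word so
# the result still assembles with -march=armv4, and "ret" (used only in
# the NEON path) is then turned back into "bx lr".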

open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
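# The loop above copies this script's own leading comment block (license
# and history) into the output, turning '#' comments into '@' so they are
# valid comments in the generated assembler file.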

print $code;
close STDOUT; # enforce flush