#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the
# caller is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
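# $hi/$lo expand to the HI/LO assembler symbols defined in the emitted
# preamble below (LO is 0 and HI is 4 on little-endian targets, and the
# other way round on big-endian), so an operand like "[$ctx,#$Eoff+$lo]"
# always addresses the low word of a 64-bit value.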

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
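
# The first argument that looks like a file name is taken as the output
# file; everything the script prints below ends up there, e.g.
# (hypothetical invocation):
#
#   perl sha512-armv4.pl sha512-core.S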

$ctx="r0"; # parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
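
# Stack frame: one 8-byte slot per working variable a..h at $Aoff..$Hoff.
# Each round stores the current message-schedule word at sp+#$Xoff and then
# drops sp by 8 ("sub sp,sp,#8" in BODY_00_15), so older schedule entries
# sit at higher offsets; the 640 bytes accumulated over 80 rounds are
# released again with "add sp,sp,#640" after each block.
#
# BODY_00_15 below emits one SHA-512 round on 32-bit register pairs:
# T = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i], then d += T and the new
# a = T + Sigma0(a) + Maj(a,b,c). $magic is compared against the low byte
# of K[i] (0x94 for K[15], 0x17 for K[79]) to set bit 0 of $Ktbl as an
# end-of-phase flag, which the callers test with "tst $Ktbl,#1".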

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi

eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi

adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic

ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH__>=7
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)

ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code 32
# endif
#endif

.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
#else
.skip 32
#endif

.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
#else
adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
bne .LNEON
#endif
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8

ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25

@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26

ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1

ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1

ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]

ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]

ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]

ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]

add sp,sp,#640
sub $Ktbl,$Ktbl,#640

teq $inp,$len
bne .Loop

add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
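
# NEON register layout (a sketch of what the code below relies on): the
# eight hash words a..h stay in d16-d23 for the whole block, the 16-entry
# message schedule circulates through d0-d15 (viewed as q0-q7 by
# NEON_16_79), and d24-d31 serve as scratch for Sigma/Ch/Maj and K[i].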

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps

$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
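
# Note: the final "h += Maj(a,b,c)" of each round is deliberately left
# pending (the commented-out vadd above); it is folded into the following
# round as "h+=Maj from the past", and applied once more after the last
# round ("vadd.i64 $A,d30" before the context is saved), presumably to
# keep it off the critical path of the current round.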

sub NEON_16_79() {
my $i=shift;

if ($i&1) { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
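
# NEON_16_79 performs two consecutive rounds' schedule updates at once by
# treating pairs of X[] d-registers as q-registers, and it interleaves the
# opening Sigma1 shifts of the following NEON_00_15 round (the lines
# marked "from NEON_00_15" above); odd rounds fall straight through to
# NEON_00_15.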

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
VFP_ABI_PUSH
adr $Ktbl,.Lsha512_block_data_order
sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon

vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon

VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
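
# Post-processing of the generated text: back-quoted expressions are
# evaluated in Perl, "bx lr" is replaced with its raw instruction word so
# the result still assembles with -march=armv4, and "ret" (used only in
# the NEON path) is then turned back into "bx lr".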

open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
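# The loop above copies this script's own leading comment block (license
# and history) into the output, turning '#' comments into '@' so they are
# valid comments in the generated assembler file.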

print $code;
close STDOUT; # enforce flush