GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/arm/ghash-armv4.S
/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif

.text
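@ rem_4bit holds the sixteen reduction constants used by the "4-bit
@ tables" GHASH method: entry [rem] is XORed (shifted into the top
@ 16 bits) whenever four bits are shifted out of Xi, which reduces the
@ intermediate value modulo the GCM polynomial.  See ghash-armv4.pl.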
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit

.type rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr r2,rem_4bit
#else
	sub r2,pc,#8+32 @ &rem_4bit
#endif
	b .Lrem_4bit_got
	nop
	nop
.size rem_4bit_get,.-rem_4bit_get
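@ gcm_ghash_4bit: judging from the register usage below (and OpenSSL's
@ convention for this file), r0 = Xi, r1 = Htable (16 precomputed
@ multiples of H), r2 = input, r3 = length in bytes (a multiple of 16).
@ The rem_4bit table is copied onto the stack so the inner loop can
@ index it relative to sp.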
.globl gcm_ghash_4bit
.type gcm_ghash_4bit,%function
.align 4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr r12,rem_4bit
#else
	sub r12,pc,#8+48 @ &rem_4bit
#endif
	add r3,r2,r3 @ r3 to point at the end
	stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too

	ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ...
	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... to stack

	ldrb r12,[r2,#15]
	ldrb r14,[r0,#15]
.Louter:
	eor r12,r12,r14
	and r14,r12,#0xf0
	and r12,r12,#0x0f
	mov r3,#14

	add r7,r1,r12,lsl#4
	ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
	add r11,r1,r14
	ldrb r12,[r2,#14]

	and r14,r4,#0xf @ rem
	ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
	add r14,r14,r14
	eor r4,r8,r4,lsr#4
	ldrh r8,[sp,r14] @ rem_4bit[rem]
	eor r4,r4,r5,lsl#28
	ldrb r14,[r0,#14]
	eor r5,r9,r5,lsr#4
	eor r5,r5,r6,lsl#28
	eor r6,r10,r6,lsr#4
	eor r6,r6,r7,lsl#28
	eor r7,r11,r7,lsr#4
	eor r12,r12,r14
	and r14,r12,#0xf0
	and r12,r12,#0x0f
	eor r7,r7,r8,lsl#16

.Linner:
	add r11,r1,r12,lsl#4
	and r12,r4,#0xf @ rem
	subs r3,r3,#1
	add r12,r12,r12
	ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
	eor r4,r8,r4,lsr#4
	eor r4,r4,r5,lsl#28
	eor r5,r9,r5,lsr#4
	eor r5,r5,r6,lsl#28
	ldrh r8,[sp,r12] @ rem_4bit[rem]
	eor r6,r10,r6,lsr#4
#ifdef __thumb2__
	it pl
#endif
	ldrplb r12,[r2,r3]
	eor r6,r6,r7,lsl#28
	eor r7,r11,r7,lsr#4

	add r11,r1,r14
	and r14,r4,#0xf @ rem
	eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
	add r14,r14,r14
	ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
	eor r4,r8,r4,lsr#4
#ifdef __thumb2__
	it pl
#endif
	ldrplb r8,[r0,r3]
	eor r4,r4,r5,lsl#28
	eor r5,r9,r5,lsr#4
	ldrh r9,[sp,r14]
	eor r5,r5,r6,lsl#28
	eor r6,r10,r6,lsr#4
	eor r6,r6,r7,lsl#28
#ifdef __thumb2__
	it pl
#endif
	eorpl r12,r12,r8
	eor r7,r11,r7,lsr#4
#ifdef __thumb2__
	itt pl
#endif
	andpl r14,r12,#0xf0
	andpl r12,r12,#0x0f
	eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
	bpl .Linner

	ldr r3,[sp,#32] @ re-load r3/end
	add r2,r2,#16
	mov r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r4,r4
	str r4,[r0,#12]
#elif defined(__ARMEB__)
	str r4,[r0,#12]
#else
	mov r9,r4,lsr#8
	strb r4,[r0,#12+3]
	mov r10,r4,lsr#16
	strb r9,[r0,#12+2]
	mov r11,r4,lsr#24
	strb r10,[r0,#12+1]
	strb r11,[r0,#12]
#endif
	cmp r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r5,r5
	str r5,[r0,#8]
#elif defined(__ARMEB__)
	str r5,[r0,#8]
#else
	mov r9,r5,lsr#8
	strb r5,[r0,#8+3]
	mov r10,r5,lsr#16
	strb r9,[r0,#8+2]
	mov r11,r5,lsr#24
	strb r10,[r0,#8+1]
	strb r11,[r0,#8]
#endif

#ifdef __thumb2__
	it ne
#endif
	ldrneb r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r6,r6
	str r6,[r0,#4]
#elif defined(__ARMEB__)
	str r6,[r0,#4]
#else
	mov r9,r6,lsr#8
	strb r6,[r0,#4+3]
	mov r10,r6,lsr#16
	strb r9,[r0,#4+2]
	mov r11,r6,lsr#24
	strb r10,[r0,#4+1]
	strb r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r7,r7
	str r7,[r0,#0]
#elif defined(__ARMEB__)
	str r7,[r0,#0]
#else
	mov r9,r7,lsr#8
	strb r7,[r0,#0+3]
	mov r10,r7,lsr#16
	strb r9,[r0,#0+2]
	mov r11,r7,lsr#24
	strb r10,[r0,#0+1]
	strb r11,[r0,#0]
#endif

	bne .Louter

	add sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
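@ gcm_gmult_4bit multiplies Xi (r0) by H in place using the same Htable
@ (r1) layout as gcm_ghash_4bit; here rem_4bit is addressed through r2,
@ set up by rem_4bit_get above, rather than through the stack.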
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb r12,[r0,#15]
	b rem_4bit_get
.Lrem_4bit_got:
	and r14,r12,#0xf0
	and r12,r12,#0x0f
	mov r3,#14

	add r7,r1,r12,lsl#4
	ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
	ldrb r12,[r0,#14]

	add r11,r1,r14
	and r14,r4,#0xf @ rem
	ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
	add r14,r14,r14
	eor r4,r8,r4,lsr#4
	ldrh r8,[r2,r14] @ rem_4bit[rem]
	eor r4,r4,r5,lsl#28
	eor r5,r9,r5,lsr#4
	eor r5,r5,r6,lsl#28
	eor r6,r10,r6,lsr#4
	eor r6,r6,r7,lsl#28
	eor r7,r11,r7,lsr#4
	and r14,r12,#0xf0
	eor r7,r7,r8,lsl#16
	and r12,r12,#0x0f

.Loop:
	add r11,r1,r12,lsl#4
	and r12,r4,#0xf @ rem
	subs r3,r3,#1
	add r12,r12,r12
	ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
	eor r4,r8,r4,lsr#4
	eor r4,r4,r5,lsl#28
	eor r5,r9,r5,lsr#4
	eor r5,r5,r6,lsl#28
	ldrh r8,[r2,r12] @ rem_4bit[rem]
	eor r6,r10,r6,lsr#4
#ifdef __thumb2__
	it pl
#endif
	ldrplb r12,[r0,r3]
	eor r6,r6,r7,lsl#28
	eor r7,r11,r7,lsr#4

	add r11,r1,r14
	and r14,r4,#0xf @ rem
	eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
	add r14,r14,r14
	ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
	eor r4,r8,r4,lsr#4
	eor r4,r4,r5,lsl#28
	eor r5,r9,r5,lsr#4
	ldrh r8,[r2,r14] @ rem_4bit[rem]
	eor r5,r5,r6,lsl#28
	eor r6,r10,r6,lsr#4
	eor r6,r6,r7,lsl#28
	eor r7,r11,r7,lsr#4
#ifdef __thumb2__
	itt pl
#endif
	andpl r14,r12,#0xf0
	andpl r12,r12,#0x0f
	eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
	bpl .Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r4,r4
	str r4,[r0,#12]
#elif defined(__ARMEB__)
	str r4,[r0,#12]
#else
	mov r9,r4,lsr#8
	strb r4,[r0,#12+3]
	mov r10,r4,lsr#16
	strb r9,[r0,#12+2]
	mov r11,r4,lsr#24
	strb r10,[r0,#12+1]
	strb r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r5,r5
	str r5,[r0,#8]
#elif defined(__ARMEB__)
	str r5,[r0,#8]
#else
	mov r9,r5,lsr#8
	strb r5,[r0,#8+3]
	mov r10,r5,lsr#16
	strb r9,[r0,#8+2]
	mov r11,r5,lsr#24
	strb r10,[r0,#8+1]
	strb r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r6,r6
	str r6,[r0,#4]
#elif defined(__ARMEB__)
	str r6,[r0,#4]
#else
	mov r9,r6,lsr#8
	strb r6,[r0,#4+3]
	mov r10,r6,lsr#16
	strb r9,[r0,#4+2]
	mov r11,r6,lsr#24
	strb r10,[r0,#4+1]
	strb r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev r7,r7
	str r7,[r0,#0]
#elif defined(__ARMEB__)
	str r7,[r0,#0]
#else
	mov r9,r7,lsr#8
	strb r7,[r0,#0+3]
	mov r10,r7,lsr#16
	strb r9,[r0,#0+2]
	mov r11,r7,lsr#24
	strb r10,[r0,#0+1]
	strb r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
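@ The NEON code below is assembled only when __ARM_MAX_ARCH__>=7.  Since
@ pre-ARMv8 NEON has no 64x64 polynomial multiply, each 64x64 carry-less
@ product is pieced together from vmull.p8 (8x8-bit polynomial multiplies)
@ and vext/veor fix-ups, with one Karatsuba level on top for the full
@ 128x128 multiply.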
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
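@ gcm_init_neon stores the "twisted" form of H at r0: H shifted left by
@ one bit and XORed with the 0xc2...01 constant when H's top bit is set.
@ This is the representation the multiply/reduction code below expects.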
.globl gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
	vld1.64 d7,[r1]! @ load H
	vmov.i8 q8,#0xe1
	vld1.64 d6,[r1]
	vshl.i64 d17,#57
	vshr.u64 d16,#63 @ t0=0xc2....01
	vdup.8 q9,d7[7]
	vshr.u64 d26,d6,#63
	vshr.s8 q9,#7 @ broadcast carry bit
	vshl.i64 q3,q3,#1
	vand q8,q8,q9
	vorr d7,d26 @ H<<<=1
	veor q3,q3,q8 @ twisted H
	vstmia r0,{q3}

	bx lr @ bx lr
.size gcm_init_neon,.-gcm_init_neon
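@ gcm_gmult_neon: Xi (r0) *= H, with the twisted H loaded from r1.  It
@ sets r3 = 16 and branches into the shared .Lgmult_neon body, so the
@ block loop below runs exactly once.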
.globl gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
	vld1.64 d7,[r0]! @ load Xi
	vld1.64 d6,[r0]!
	vmov.i64 d29,#0x0000ffffffffffff
	vldmia r1,{d26,d27} @ load twisted H
	vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8 q3,q3
#endif
	vmov.i64 d31,#0x000000000000ffff
	veor d28,d26,d27 @ Karatsuba pre-processing
	mov r3,#16
	b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
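@ gcm_ghash_neon: r0 = Xi, r1 = twisted H, r2 = input, r3 = length in
@ bytes.  Each 16-byte block is XORed into Xi and then multiplied by H
@ as three 64x64 vmull.p8-based products (d26*d6, d27*d7, and the
@ Karatsuba middle term using d28 = d26^d27), then reduced.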
.globl gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
	vld1.64 d1,[r0]! @ load Xi
	vld1.64 d0,[r0]!
	vmov.i64 d29,#0x0000ffffffffffff
	vldmia r1,{d26,d27} @ load twisted H
	vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8 q0,q0
#endif
	vmov.i64 d31,#0x000000000000ffff
	veor d28,d26,d27 @ Karatsuba pre-processing

.Loop_neon:
	vld1.64 d7,[r2]! @ load inp
	vld1.64 d6,[r2]!
#ifdef __ARMEL__
	vrev64.8 q3,q3
#endif
	veor q3,q0 @ inp^=Xi
.Lgmult_neon:
	vext.8 d16, d26, d26, #1 @ A1
	vmull.p8 q8, d16, d6 @ F = A1*B
	vext.8 d0, d6, d6, #1 @ B1
	vmull.p8 q0, d26, d0 @ E = A*B1
	vext.8 d18, d26, d26, #2 @ A2
	vmull.p8 q9, d18, d6 @ H = A2*B
	vext.8 d22, d6, d6, #2 @ B2
	vmull.p8 q11, d26, d22 @ G = A*B2
	vext.8 d20, d26, d26, #3 @ A3
	veor q8, q8, q0 @ L = E + F
	vmull.p8 q10, d20, d6 @ J = A3*B
	vext.8 d0, d6, d6, #3 @ B3
	veor q9, q9, q11 @ M = G + H
	vmull.p8 q0, d26, d0 @ I = A*B3
	veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
	vand d17, d17, d29
	vext.8 d22, d6, d6, #4 @ B4
	veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
	vand d19, d19, d30
	vmull.p8 q11, d26, d22 @ K = A*B4
	veor q10, q10, q0 @ N = I + J
	veor d16, d16, d17
	veor d18, d18, d19
	veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
	vand d21, d21, d31
	vext.8 q8, q8, q8, #15
	veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
	vmov.i64 d23, #0
	vext.8 q9, q9, q9, #14
	veor d20, d20, d21
	vmull.p8 q0, d26, d6 @ D = A*B
	vext.8 q11, q11, q11, #12
	vext.8 q10, q10, q10, #13
	veor q8, q8, q9
	veor q10, q10, q11
	veor q0, q0, q8
	veor q0, q0, q10
	veor d6,d6,d7 @ Karatsuba pre-processing
	vext.8 d16, d28, d28, #1 @ A1
	vmull.p8 q8, d16, d6 @ F = A1*B
	vext.8 d2, d6, d6, #1 @ B1
	vmull.p8 q1, d28, d2 @ E = A*B1
	vext.8 d18, d28, d28, #2 @ A2
	vmull.p8 q9, d18, d6 @ H = A2*B
	vext.8 d22, d6, d6, #2 @ B2
	vmull.p8 q11, d28, d22 @ G = A*B2
	vext.8 d20, d28, d28, #3 @ A3
	veor q8, q8, q1 @ L = E + F
	vmull.p8 q10, d20, d6 @ J = A3*B
	vext.8 d2, d6, d6, #3 @ B3
	veor q9, q9, q11 @ M = G + H
	vmull.p8 q1, d28, d2 @ I = A*B3
	veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
	vand d17, d17, d29
	vext.8 d22, d6, d6, #4 @ B4
	veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
	vand d19, d19, d30
	vmull.p8 q11, d28, d22 @ K = A*B4
	veor q10, q10, q1 @ N = I + J
	veor d16, d16, d17
	veor d18, d18, d19
	veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
	vand d21, d21, d31
	vext.8 q8, q8, q8, #15
	veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
	vmov.i64 d23, #0
	vext.8 q9, q9, q9, #14
	veor d20, d20, d21
	vmull.p8 q1, d28, d6 @ D = A*B
	vext.8 q11, q11, q11, #12
	vext.8 q10, q10, q10, #13
	veor q8, q8, q9
	veor q10, q10, q11
	veor q1, q1, q8
	veor q1, q1, q10
	vext.8 d16, d27, d27, #1 @ A1
	vmull.p8 q8, d16, d7 @ F = A1*B
	vext.8 d4, d7, d7, #1 @ B1
	vmull.p8 q2, d27, d4 @ E = A*B1
	vext.8 d18, d27, d27, #2 @ A2
	vmull.p8 q9, d18, d7 @ H = A2*B
	vext.8 d22, d7, d7, #2 @ B2
	vmull.p8 q11, d27, d22 @ G = A*B2
	vext.8 d20, d27, d27, #3 @ A3
	veor q8, q8, q2 @ L = E + F
	vmull.p8 q10, d20, d7 @ J = A3*B
	vext.8 d4, d7, d7, #3 @ B3
	veor q9, q9, q11 @ M = G + H
	vmull.p8 q2, d27, d4 @ I = A*B3
	veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
	vand d17, d17, d29
	vext.8 d22, d7, d7, #4 @ B4
	veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
	vand d19, d19, d30
	vmull.p8 q11, d27, d22 @ K = A*B4
	veor q10, q10, q2 @ N = I + J
	veor d16, d16, d17
	veor d18, d18, d19
	veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
	vand d21, d21, d31
	vext.8 q8, q8, q8, #15
	veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
	vmov.i64 d23, #0
	vext.8 q9, q9, q9, #14
	veor d20, d20, d21
	vmull.p8 q2, d27, d7 @ D = A*B
	vext.8 q11, q11, q11, #12
	vext.8 q10, q10, q10, #13
	veor q8, q8, q9
	veor q10, q10, q11
	veor q2, q2, q8
	veor q2, q2, q10
	veor q1,q1,q0 @ Karatsuba post-processing
	veor q1,q1,q2
	veor d1,d1,d2
	veor d4,d4,d3 @ Xh|Xl - 256-bit result
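	@ The 256-bit Karatsuba result assembled above (Xh|Xl) is now folded
	@ back to 128 bits modulo the GHASH polynomial in the two
	@ shift-and-XOR phases below.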
	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64 q9,q0,#57 @ 1st phase
	vshl.i64 q10,q0,#62
	veor q10,q10,q9 @
	vshl.i64 q9,q0,#63
	veor q10, q10, q9 @
	veor d1,d1,d20 @
	veor d4,d4,d21

	vshr.u64 q10,q0,#1 @ 2nd phase
	veor q2,q2,q0
	veor q0,q0,q10 @
	vshr.u64 q10,q10,#6
	vshr.u64 q0,q0,#1 @
	veor q0,q0,q2 @
	veor q0,q0,q10 @

	subs r3,#16
	bne .Loop_neon

#ifdef __ARMEL__
	vrev64.8 q0,q0
#endif
	sub r0,#16
	vst1.64 d1,[r0]! @ write out Xi
	vst1.64 d0,[r0]

	bx lr @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2