GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/armv8-mont.S
/* Do not modify. This file is auto-generated from armv8-mont.pl. */
#include "arm_arch.h"
#ifndef __KERNEL__

.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
tst x5,#3
b.ne .Lmul_mont
cmp x5,#32
b.le .Lscalar_impl
#ifndef __KERNEL__
#ifndef __AARCH64EB__
adrp x17,OPENSSL_armv8_rsa_neonized
ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
cbnz w17, bn_mul8x_mont_neon
#endif
#endif

.Lscalar_impl:
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
b.eq __bn_mul4x_mont

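// Dispatch: num%4 != 0 branches to the generic .Lmul_mont code below;
// num <= 32 goes directly to the scalar implementations; otherwise,
// outside the kernel on little-endian builds, a non-zero
// OPENSSL_armv8_rsa_neonized selects bn_mul8x_mont_neon. Of the
// scalar paths, num%8 == 0 takes __bn_sqr8x_mont and num%4 == 0
// takes __bn_mul4x_mont.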
.Lmul_mont:
34
stp x29,x30,[sp,#-64]!
35
add x29,sp,#0
36
stp x19,x20,[sp,#16]
37
stp x21,x22,[sp,#32]
38
stp x23,x24,[sp,#48]
39
40
ldr x9,[x2],#8 // bp[0]
41
sub x22,sp,x5,lsl#3
42
ldp x7,x8,[x1],#16 // ap[0..1]
43
lsl x5,x5,#3
44
ldr x4,[x4] // *n0
45
and x22,x22,#-16 // ABI says so
46
ldp x13,x14,[x3],#16 // np[0..1]
47
48
mul x6,x7,x9 // ap[0]*bp[0]
49
sub x21,x5,#16 // j=num-2
50
umulh x7,x7,x9
51
mul x10,x8,x9 // ap[1]*bp[0]
52
umulh x11,x8,x9
53
54
mul x15,x6,x4 // "tp[0]"*n0
55
mov sp,x22 // alloca
56
57
// (*) mul x12,x13,x15 // np[0]*m1
58
umulh x13,x13,x15
59
mul x16,x14,x15 // np[1]*m1
60
// (*) adds x12,x12,x6 // discarded
61
// (*) On the removal of the first multiplication and addition
// instructions: the outcome of the first addition is
// guaranteed to be zero, which leaves only two computationally
// significant outcomes: it either carries or it doesn't.
// So when does it carry, and is there an alternative way to
// deduce that? If you follow the operations, you can observe
// that the condition for a carry is quite simple: x6 being
// non-zero. The carry can therefore be calculated by adding
// -1 to x6, which is what the next instruction does.
subs xzr,x6,#1 // (*)
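// Worked illustration in C-style pseudocode (illustrative only, not
// generated code; names are ad hoc). Since n0 == -n^-1 mod 2^64, the
// low 64 bits of np[0]*m1 + tp[0] are zero by construction, and only
// the carry out of that addition is informative:
//
//	uint64_t m1 = tp0 * n0;		// truncating, i.e. mod 2^64
//	uint64_t lo = np0 * m1 + tp0;	// always 0 mod 2^64
//	int carry = (tp0 != 0);		// what "subs xzr,x6,#1" sets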
71
umulh x17,x14,x15
72
adc x13,x13,xzr
73
cbz x21,.L1st_skip
74
75
.L1st:
76
ldr x8,[x1],#8
77
adds x6,x10,x7
78
sub x21,x21,#8 // j--
79
adc x7,x11,xzr
80
81
ldr x14,[x3],#8
82
adds x12,x16,x13
83
mul x10,x8,x9 // ap[j]*bp[0]
84
adc x13,x17,xzr
85
umulh x11,x8,x9
86
87
adds x12,x12,x6
88
mul x16,x14,x15 // np[j]*m1
89
adc x13,x13,xzr
90
umulh x17,x14,x15
91
str x12,[x22],#8 // tp[j-1]
92
cbnz x21,.L1st
93
94
.L1st_skip:
95
adds x6,x10,x7
96
sub x1,x1,x5 // rewind x1
97
adc x7,x11,xzr
98
99
adds x12,x16,x13
100
sub x3,x3,x5 // rewind x3
101
adc x13,x17,xzr
102
103
adds x12,x12,x6
104
sub x20,x5,#8 // i=num-1
105
adcs x13,x13,x7
106
107
adc x19,xzr,xzr // top-most overflow bit
108
stp x12,x13,[x22]
109
110
.Louter:
111
ldr x9,[x2],#8 // bp[i]
112
ldp x7,x8,[x1],#16
113
ldr x23,[sp] // tp[0]
114
add x22,sp,#8
115
116
mul x6,x7,x9 // ap[0]*bp[i]
117
sub x21,x5,#16 // j=num-2
118
umulh x7,x7,x9
119
ldp x13,x14,[x3],#16
120
mul x10,x8,x9 // ap[1]*bp[i]
121
adds x6,x6,x23
122
umulh x11,x8,x9
123
adc x7,x7,xzr
124
125
mul x15,x6,x4
126
sub x20,x20,#8 // i--
127
128
// (*) mul x12,x13,x15 // np[0]*m1
129
umulh x13,x13,x15
130
mul x16,x14,x15 // np[1]*m1
131
// (*) adds x12,x12,x6
132
subs xzr,x6,#1 // (*)
133
umulh x17,x14,x15
134
cbz x21,.Linner_skip
135
136
.Linner:
137
ldr x8,[x1],#8
138
adc x13,x13,xzr
139
ldr x23,[x22],#8 // tp[j]
140
adds x6,x10,x7
141
sub x21,x21,#8 // j--
142
adc x7,x11,xzr
143
144
adds x12,x16,x13
145
ldr x14,[x3],#8
146
adc x13,x17,xzr
147
148
mul x10,x8,x9 // ap[j]*bp[i]
149
adds x6,x6,x23
150
umulh x11,x8,x9
151
adc x7,x7,xzr
152
153
mul x16,x14,x15 // np[j]*m1
154
adds x12,x12,x6
155
umulh x17,x14,x15
156
stur x12,[x22,#-16] // tp[j-1]
157
cbnz x21,.Linner
158
159
.Linner_skip:
160
ldr x23,[x22],#8 // tp[j]
161
adc x13,x13,xzr
162
adds x6,x10,x7
163
sub x1,x1,x5 // rewind x1
164
adc x7,x11,xzr
165
166
adds x12,x16,x13
167
sub x3,x3,x5 // rewind x3
168
adcs x13,x17,x19
169
adc x19,xzr,xzr
170
171
adds x6,x6,x23
172
adc x7,x7,xzr
173
174
adds x12,x12,x6
175
adcs x13,x13,x7
176
adc x19,x19,xzr // top-most overflow bit
177
stp x12,x13,[x22,#-16]
178
179
cbnz x20,.Louter
180
181
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus, see whether
// that borrowed, and conditionally copy the original value.
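// C-style sketch of this tail (illustrative only; sub_words is a
// hypothetical multi-limb subtract returning the final borrow):
//
//	borrow = sub_words(rp, tp, np, num);	// rp[] = tp[] - np[]
//	for (j = 0; j < num; j++) {
//		rp[j] = borrow ? tp[j] : rp[j];	// the csel on "lo" below
//		tp[j] = 0;			// wipe tp ("stur xzr")
//	}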
185
ldr x23,[sp] // tp[0]
186
add x22,sp,#8
187
ldr x14,[x3],#8 // np[0]
188
subs x21,x5,#8 // j=num-1 and clear borrow
189
mov x1,x0
190
.Lsub:
191
sbcs x8,x23,x14 // tp[j]-np[j]
192
ldr x23,[x22],#8
193
sub x21,x21,#8 // j--
194
ldr x14,[x3],#8
195
str x8,[x1],#8 // rp[j]=tp[j]-np[j]
196
cbnz x21,.Lsub
197
198
sbcs x8,x23,x14
199
sbcs x19,x19,xzr // did it borrow?
200
str x8,[x1],#8 // rp[num-1]
201
202
ldr x23,[sp] // tp[0]
203
add x22,sp,#8
204
ldr x8,[x0],#8 // rp[0]
205
sub x5,x5,#8 // num--
206
nop
207
.Lcond_copy:
208
sub x5,x5,#8 // num--
209
csel x14,x23,x8,lo // did it borrow?
210
ldr x23,[x22],#8
211
ldr x8,[x0],#8
212
stur xzr,[x22,#-16] // wipe tp
213
stur x14,[x0,#-16]
214
cbnz x5,.Lcond_copy
215
216
csel x14,x23,x8,lo
217
stur xzr,[x22,#-8] // wipe tp
218
stur x14,[x0,#-8]
219
220
ldp x19,x20,[x29,#16]
221
mov sp,x29
222
ldp x21,x22,[x29,#32]
223
mov x0,#1
224
ldp x23,x24,[x29,#48]
225
ldr x29,[sp],#64
226
AARCH64_VALIDATE_LINK_REGISTER
227
ret
228
.size bn_mul_mont,.-bn_mul_mont
229
.type bn_mul8x_mont_neon,%function
230
.align 5
231
bn_mul8x_mont_neon:
232
// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
233
// only from bn_mul_mont which has already signed the return address.
234
stp x29,x30,[sp,#-80]!
235
mov x16,sp
236
stp d8,d9,[sp,#16]
237
stp d10,d11,[sp,#32]
238
stp d12,d13,[sp,#48]
239
stp d14,d15,[sp,#64]
240
lsl x5,x5,#1
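// num (x5) is doubled here: the NEON path works on 32-bit words,
// so an n-limb operand is treated as 2*n words below.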
241
eor v14.16b,v14.16b,v14.16b
242
243
.align 4
244
.LNEON_8n:
245
eor v6.16b,v6.16b,v6.16b
246
sub x7,sp,#128
247
eor v7.16b,v7.16b,v7.16b
248
sub x7,x7,x5,lsl#4
249
eor v8.16b,v8.16b,v8.16b
250
and x7,x7,#-64
251
eor v9.16b,v9.16b,v9.16b
252
mov sp,x7 // alloca
253
eor v10.16b,v10.16b,v10.16b
254
add x7,x7,#256
255
eor v11.16b,v11.16b,v11.16b
256
sub x8,x5,#8
257
eor v12.16b,v12.16b,v12.16b
258
eor v13.16b,v13.16b,v13.16b
259
260
.LNEON_8n_init:
261
st1 {v6.2d,v7.2d},[x7],#32
262
subs x8,x8,#8
263
st1 {v8.2d,v9.2d},[x7],#32
264
st1 {v10.2d,v11.2d},[x7],#32
265
st1 {v12.2d,v13.2d},[x7],#32
266
bne .LNEON_8n_init
267
268
add x6,sp,#256
269
ld1 {v0.4s,v1.4s},[x1],#32
270
add x10,sp,#8
271
ldr s30,[x4],#4
272
mov x9,x5
273
b .LNEON_8n_outer
274
275
.align 4
276
.LNEON_8n_outer:
277
ldr s28,[x2],#4 // *b++
278
uxtl v28.4s,v28.4h
279
add x7,sp,#128
280
ld1 {v2.4s,v3.4s},[x3],#32
281
282
umlal v6.2d,v28.2s,v0.s[0]
283
umlal v7.2d,v28.2s,v0.s[1]
284
umlal v8.2d,v28.2s,v0.s[2]
285
shl v29.2d,v6.2d,#16
286
ext v29.16b,v29.16b,v29.16b,#8
287
umlal v9.2d,v28.2s,v0.s[3]
288
add v29.2d,v29.2d,v6.2d
289
umlal v10.2d,v28.2s,v1.s[0]
290
mul v29.2s,v29.2s,v30.2s
291
umlal v11.2d,v28.2s,v1.s[1]
292
st1 {v28.2s},[sp] // put aside smashed b[8*i+0]
293
umlal v12.2d,v28.2s,v1.s[2]
294
uxtl v29.4s,v29.4h
295
umlal v13.2d,v28.2s,v1.s[3]
296
ldr s28,[x2],#4 // *b++
297
umlal v6.2d,v29.2s,v2.s[0]
298
umlal v7.2d,v29.2s,v2.s[1]
299
uxtl v28.4s,v28.4h
300
umlal v8.2d,v29.2s,v2.s[2]
301
ushr v15.2d,v6.2d,#16
302
umlal v9.2d,v29.2s,v2.s[3]
303
umlal v10.2d,v29.2s,v3.s[0]
304
ext v6.16b,v6.16b,v6.16b,#8
305
add v6.2d,v6.2d,v15.2d
306
umlal v11.2d,v29.2s,v3.s[1]
307
ushr v6.2d,v6.2d,#16
308
umlal v12.2d,v29.2s,v3.s[2]
309
umlal v13.2d,v29.2s,v3.s[3]
310
add v16.2d,v7.2d,v6.2d
311
ins v7.d[0],v16.d[0]
312
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0]
313
umlal v7.2d,v28.2s,v0.s[0]
314
ld1 {v6.2d},[x6],#16
315
umlal v8.2d,v28.2s,v0.s[1]
316
umlal v9.2d,v28.2s,v0.s[2]
317
shl v29.2d,v7.2d,#16
318
ext v29.16b,v29.16b,v29.16b,#8
319
umlal v10.2d,v28.2s,v0.s[3]
320
add v29.2d,v29.2d,v7.2d
321
umlal v11.2d,v28.2s,v1.s[0]
322
mul v29.2s,v29.2s,v30.2s
323
umlal v12.2d,v28.2s,v1.s[1]
324
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1]
325
umlal v13.2d,v28.2s,v1.s[2]
326
uxtl v29.4s,v29.4h
327
umlal v6.2d,v28.2s,v1.s[3]
328
ldr s28,[x2],#4 // *b++
329
umlal v7.2d,v29.2s,v2.s[0]
330
umlal v8.2d,v29.2s,v2.s[1]
331
uxtl v28.4s,v28.4h
332
umlal v9.2d,v29.2s,v2.s[2]
333
ushr v15.2d,v7.2d,#16
334
umlal v10.2d,v29.2s,v2.s[3]
335
umlal v11.2d,v29.2s,v3.s[0]
336
ext v7.16b,v7.16b,v7.16b,#8
337
add v7.2d,v7.2d,v15.2d
338
umlal v12.2d,v29.2s,v3.s[1]
339
ushr v7.2d,v7.2d,#16
340
umlal v13.2d,v29.2s,v3.s[2]
341
umlal v6.2d,v29.2s,v3.s[3]
342
add v16.2d,v8.2d,v7.2d
343
ins v8.d[0],v16.d[0]
344
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1]
345
umlal v8.2d,v28.2s,v0.s[0]
346
ld1 {v7.2d},[x6],#16
347
umlal v9.2d,v28.2s,v0.s[1]
348
umlal v10.2d,v28.2s,v0.s[2]
349
shl v29.2d,v8.2d,#16
350
ext v29.16b,v29.16b,v29.16b,#8
351
umlal v11.2d,v28.2s,v0.s[3]
352
add v29.2d,v29.2d,v8.2d
353
umlal v12.2d,v28.2s,v1.s[0]
354
mul v29.2s,v29.2s,v30.2s
355
umlal v13.2d,v28.2s,v1.s[1]
356
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2]
357
umlal v6.2d,v28.2s,v1.s[2]
358
uxtl v29.4s,v29.4h
359
umlal v7.2d,v28.2s,v1.s[3]
360
ldr s28,[x2],#4 // *b++
361
umlal v8.2d,v29.2s,v2.s[0]
362
umlal v9.2d,v29.2s,v2.s[1]
363
uxtl v28.4s,v28.4h
364
umlal v10.2d,v29.2s,v2.s[2]
365
ushr v15.2d,v8.2d,#16
366
umlal v11.2d,v29.2s,v2.s[3]
367
umlal v12.2d,v29.2s,v3.s[0]
368
ext v8.16b,v8.16b,v8.16b,#8
369
add v8.2d,v8.2d,v15.2d
370
umlal v13.2d,v29.2s,v3.s[1]
371
ushr v8.2d,v8.2d,#16
372
umlal v6.2d,v29.2s,v3.s[2]
373
umlal v7.2d,v29.2s,v3.s[3]
374
add v16.2d,v9.2d,v8.2d
375
ins v9.d[0],v16.d[0]
376
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2]
377
umlal v9.2d,v28.2s,v0.s[0]
378
ld1 {v8.2d},[x6],#16
379
umlal v10.2d,v28.2s,v0.s[1]
380
umlal v11.2d,v28.2s,v0.s[2]
381
shl v29.2d,v9.2d,#16
382
ext v29.16b,v29.16b,v29.16b,#8
383
umlal v12.2d,v28.2s,v0.s[3]
384
add v29.2d,v29.2d,v9.2d
385
umlal v13.2d,v28.2s,v1.s[0]
386
mul v29.2s,v29.2s,v30.2s
387
umlal v6.2d,v28.2s,v1.s[1]
388
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3]
389
umlal v7.2d,v28.2s,v1.s[2]
390
uxtl v29.4s,v29.4h
391
umlal v8.2d,v28.2s,v1.s[3]
392
ldr s28,[x2],#4 // *b++
393
umlal v9.2d,v29.2s,v2.s[0]
394
umlal v10.2d,v29.2s,v2.s[1]
395
uxtl v28.4s,v28.4h
396
umlal v11.2d,v29.2s,v2.s[2]
397
ushr v15.2d,v9.2d,#16
398
umlal v12.2d,v29.2s,v2.s[3]
399
umlal v13.2d,v29.2s,v3.s[0]
400
ext v9.16b,v9.16b,v9.16b,#8
401
add v9.2d,v9.2d,v15.2d
402
umlal v6.2d,v29.2s,v3.s[1]
403
ushr v9.2d,v9.2d,#16
404
umlal v7.2d,v29.2s,v3.s[2]
405
umlal v8.2d,v29.2s,v3.s[3]
406
add v16.2d,v10.2d,v9.2d
407
ins v10.d[0],v16.d[0]
408
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3]
409
umlal v10.2d,v28.2s,v0.s[0]
410
ld1 {v9.2d},[x6],#16
411
umlal v11.2d,v28.2s,v0.s[1]
412
umlal v12.2d,v28.2s,v0.s[2]
413
shl v29.2d,v10.2d,#16
414
ext v29.16b,v29.16b,v29.16b,#8
415
umlal v13.2d,v28.2s,v0.s[3]
416
add v29.2d,v29.2d,v10.2d
417
umlal v6.2d,v28.2s,v1.s[0]
418
mul v29.2s,v29.2s,v30.2s
419
umlal v7.2d,v28.2s,v1.s[1]
420
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4]
421
umlal v8.2d,v28.2s,v1.s[2]
422
uxtl v29.4s,v29.4h
423
umlal v9.2d,v28.2s,v1.s[3]
424
ldr s28,[x2],#4 // *b++
425
umlal v10.2d,v29.2s,v2.s[0]
426
umlal v11.2d,v29.2s,v2.s[1]
427
uxtl v28.4s,v28.4h
428
umlal v12.2d,v29.2s,v2.s[2]
429
ushr v15.2d,v10.2d,#16
430
umlal v13.2d,v29.2s,v2.s[3]
431
umlal v6.2d,v29.2s,v3.s[0]
432
ext v10.16b,v10.16b,v10.16b,#8
433
add v10.2d,v10.2d,v15.2d
434
umlal v7.2d,v29.2s,v3.s[1]
435
ushr v10.2d,v10.2d,#16
436
umlal v8.2d,v29.2s,v3.s[2]
437
umlal v9.2d,v29.2s,v3.s[3]
438
add v16.2d,v11.2d,v10.2d
439
ins v11.d[0],v16.d[0]
440
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4]
441
umlal v11.2d,v28.2s,v0.s[0]
442
ld1 {v10.2d},[x6],#16
443
umlal v12.2d,v28.2s,v0.s[1]
444
umlal v13.2d,v28.2s,v0.s[2]
445
shl v29.2d,v11.2d,#16
446
ext v29.16b,v29.16b,v29.16b,#8
447
umlal v6.2d,v28.2s,v0.s[3]
448
add v29.2d,v29.2d,v11.2d
449
umlal v7.2d,v28.2s,v1.s[0]
450
mul v29.2s,v29.2s,v30.2s
451
umlal v8.2d,v28.2s,v1.s[1]
452
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5]
453
umlal v9.2d,v28.2s,v1.s[2]
454
uxtl v29.4s,v29.4h
455
umlal v10.2d,v28.2s,v1.s[3]
456
ldr s28,[x2],#4 // *b++
457
umlal v11.2d,v29.2s,v2.s[0]
458
umlal v12.2d,v29.2s,v2.s[1]
459
uxtl v28.4s,v28.4h
460
umlal v13.2d,v29.2s,v2.s[2]
461
ushr v15.2d,v11.2d,#16
462
umlal v6.2d,v29.2s,v2.s[3]
463
umlal v7.2d,v29.2s,v3.s[0]
464
ext v11.16b,v11.16b,v11.16b,#8
465
add v11.2d,v11.2d,v15.2d
466
umlal v8.2d,v29.2s,v3.s[1]
467
ushr v11.2d,v11.2d,#16
468
umlal v9.2d,v29.2s,v3.s[2]
469
umlal v10.2d,v29.2s,v3.s[3]
470
add v16.2d,v12.2d,v11.2d
471
ins v12.d[0],v16.d[0]
472
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5]
473
umlal v12.2d,v28.2s,v0.s[0]
474
ld1 {v11.2d},[x6],#16
475
umlal v13.2d,v28.2s,v0.s[1]
476
umlal v6.2d,v28.2s,v0.s[2]
477
shl v29.2d,v12.2d,#16
478
ext v29.16b,v29.16b,v29.16b,#8
479
umlal v7.2d,v28.2s,v0.s[3]
480
add v29.2d,v29.2d,v12.2d
481
umlal v8.2d,v28.2s,v1.s[0]
482
mul v29.2s,v29.2s,v30.2s
483
umlal v9.2d,v28.2s,v1.s[1]
484
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6]
485
umlal v10.2d,v28.2s,v1.s[2]
486
uxtl v29.4s,v29.4h
487
umlal v11.2d,v28.2s,v1.s[3]
488
ldr s28,[x2],#4 // *b++
489
umlal v12.2d,v29.2s,v2.s[0]
490
umlal v13.2d,v29.2s,v2.s[1]
491
uxtl v28.4s,v28.4h
492
umlal v6.2d,v29.2s,v2.s[2]
493
ushr v15.2d,v12.2d,#16
494
umlal v7.2d,v29.2s,v2.s[3]
495
umlal v8.2d,v29.2s,v3.s[0]
496
ext v12.16b,v12.16b,v12.16b,#8
497
add v12.2d,v12.2d,v15.2d
498
umlal v9.2d,v29.2s,v3.s[1]
499
ushr v12.2d,v12.2d,#16
500
umlal v10.2d,v29.2s,v3.s[2]
501
umlal v11.2d,v29.2s,v3.s[3]
502
add v16.2d,v13.2d,v12.2d
503
ins v13.d[0],v16.d[0]
504
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6]
505
umlal v13.2d,v28.2s,v0.s[0]
506
ld1 {v12.2d},[x6],#16
507
umlal v6.2d,v28.2s,v0.s[1]
508
umlal v7.2d,v28.2s,v0.s[2]
509
shl v29.2d,v13.2d,#16
510
ext v29.16b,v29.16b,v29.16b,#8
511
umlal v8.2d,v28.2s,v0.s[3]
512
add v29.2d,v29.2d,v13.2d
513
umlal v9.2d,v28.2s,v1.s[0]
514
mul v29.2s,v29.2s,v30.2s
515
umlal v10.2d,v28.2s,v1.s[1]
516
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7]
517
umlal v11.2d,v28.2s,v1.s[2]
518
uxtl v29.4s,v29.4h
519
umlal v12.2d,v28.2s,v1.s[3]
520
ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
521
umlal v13.2d,v29.2s,v2.s[0]
522
ld1 {v0.4s,v1.4s},[x1],#32
523
umlal v6.2d,v29.2s,v2.s[1]
524
umlal v7.2d,v29.2s,v2.s[2]
525
mov v5.16b,v13.16b
526
ushr v5.2d,v5.2d,#16
527
ext v13.16b,v13.16b,v13.16b,#8
528
umlal v8.2d,v29.2s,v2.s[3]
529
umlal v9.2d,v29.2s,v3.s[0]
530
add v13.2d,v13.2d,v5.2d
531
umlal v10.2d,v29.2s,v3.s[1]
532
ushr v13.2d,v13.2d,#16
533
eor v15.16b,v15.16b,v15.16b
534
ins v13.d[1],v15.d[0]
535
umlal v11.2d,v29.2s,v3.s[2]
536
umlal v12.2d,v29.2s,v3.s[3]
537
add v6.2d,v6.2d,v13.2d
538
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7]
539
add x10,sp,#8 // rewind
540
sub x8,x5,#8
541
b .LNEON_8n_inner
542
543
.align 4
544
.LNEON_8n_inner:
545
subs x8,x8,#8
546
umlal v6.2d,v28.2s,v0.s[0]
547
ld1 {v13.2d},[x6]
548
umlal v7.2d,v28.2s,v0.s[1]
549
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0]
550
umlal v8.2d,v28.2s,v0.s[2]
551
ld1 {v2.4s,v3.4s},[x3],#32
552
umlal v9.2d,v28.2s,v0.s[3]
553
b.eq .LInner_jump
554
add x6,x6,#16 // don't advance in last iteration
555
.LInner_jump:
556
umlal v10.2d,v28.2s,v1.s[0]
557
umlal v11.2d,v28.2s,v1.s[1]
558
umlal v12.2d,v28.2s,v1.s[2]
559
umlal v13.2d,v28.2s,v1.s[3]
560
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1]
561
umlal v6.2d,v29.2s,v2.s[0]
562
umlal v7.2d,v29.2s,v2.s[1]
563
umlal v8.2d,v29.2s,v2.s[2]
564
umlal v9.2d,v29.2s,v2.s[3]
565
umlal v10.2d,v29.2s,v3.s[0]
566
umlal v11.2d,v29.2s,v3.s[1]
567
umlal v12.2d,v29.2s,v3.s[2]
568
umlal v13.2d,v29.2s,v3.s[3]
569
st1 {v6.2d},[x7],#16
570
umlal v7.2d,v28.2s,v0.s[0]
571
ld1 {v6.2d},[x6]
572
umlal v8.2d,v28.2s,v0.s[1]
573
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1]
574
umlal v9.2d,v28.2s,v0.s[2]
575
b.eq .LInner_jump1
576
add x6,x6,#16 // don't advance in last iteration
577
.LInner_jump1:
578
umlal v10.2d,v28.2s,v0.s[3]
579
umlal v11.2d,v28.2s,v1.s[0]
580
umlal v12.2d,v28.2s,v1.s[1]
581
umlal v13.2d,v28.2s,v1.s[2]
582
umlal v6.2d,v28.2s,v1.s[3]
583
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2]
584
umlal v7.2d,v29.2s,v2.s[0]
585
umlal v8.2d,v29.2s,v2.s[1]
586
umlal v9.2d,v29.2s,v2.s[2]
587
umlal v10.2d,v29.2s,v2.s[3]
588
umlal v11.2d,v29.2s,v3.s[0]
589
umlal v12.2d,v29.2s,v3.s[1]
590
umlal v13.2d,v29.2s,v3.s[2]
591
umlal v6.2d,v29.2s,v3.s[3]
592
st1 {v7.2d},[x7],#16
593
umlal v8.2d,v28.2s,v0.s[0]
594
ld1 {v7.2d},[x6]
595
umlal v9.2d,v28.2s,v0.s[1]
596
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2]
597
umlal v10.2d,v28.2s,v0.s[2]
598
b.eq .LInner_jump2
599
add x6,x6,#16 // don't advance in last iteration
600
.LInner_jump2:
601
umlal v11.2d,v28.2s,v0.s[3]
602
umlal v12.2d,v28.2s,v1.s[0]
603
umlal v13.2d,v28.2s,v1.s[1]
604
umlal v6.2d,v28.2s,v1.s[2]
605
umlal v7.2d,v28.2s,v1.s[3]
606
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3]
607
umlal v8.2d,v29.2s,v2.s[0]
608
umlal v9.2d,v29.2s,v2.s[1]
609
umlal v10.2d,v29.2s,v2.s[2]
610
umlal v11.2d,v29.2s,v2.s[3]
611
umlal v12.2d,v29.2s,v3.s[0]
612
umlal v13.2d,v29.2s,v3.s[1]
613
umlal v6.2d,v29.2s,v3.s[2]
614
umlal v7.2d,v29.2s,v3.s[3]
615
st1 {v8.2d},[x7],#16
616
umlal v9.2d,v28.2s,v0.s[0]
617
ld1 {v8.2d},[x6]
618
umlal v10.2d,v28.2s,v0.s[1]
619
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3]
620
umlal v11.2d,v28.2s,v0.s[2]
621
b.eq .LInner_jump3
622
add x6,x6,#16 // don't advance in last iteration
623
.LInner_jump3:
624
umlal v12.2d,v28.2s,v0.s[3]
625
umlal v13.2d,v28.2s,v1.s[0]
626
umlal v6.2d,v28.2s,v1.s[1]
627
umlal v7.2d,v28.2s,v1.s[2]
628
umlal v8.2d,v28.2s,v1.s[3]
629
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4]
630
umlal v9.2d,v29.2s,v2.s[0]
631
umlal v10.2d,v29.2s,v2.s[1]
632
umlal v11.2d,v29.2s,v2.s[2]
633
umlal v12.2d,v29.2s,v2.s[3]
634
umlal v13.2d,v29.2s,v3.s[0]
635
umlal v6.2d,v29.2s,v3.s[1]
636
umlal v7.2d,v29.2s,v3.s[2]
637
umlal v8.2d,v29.2s,v3.s[3]
638
st1 {v9.2d},[x7],#16
639
umlal v10.2d,v28.2s,v0.s[0]
640
ld1 {v9.2d},[x6]
641
umlal v11.2d,v28.2s,v0.s[1]
642
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4]
643
umlal v12.2d,v28.2s,v0.s[2]
644
b.eq .LInner_jump4
645
add x6,x6,#16 // don't advance in last iteration
646
.LInner_jump4:
647
umlal v13.2d,v28.2s,v0.s[3]
648
umlal v6.2d,v28.2s,v1.s[0]
649
umlal v7.2d,v28.2s,v1.s[1]
650
umlal v8.2d,v28.2s,v1.s[2]
651
umlal v9.2d,v28.2s,v1.s[3]
652
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5]
653
umlal v10.2d,v29.2s,v2.s[0]
654
umlal v11.2d,v29.2s,v2.s[1]
655
umlal v12.2d,v29.2s,v2.s[2]
656
umlal v13.2d,v29.2s,v2.s[3]
657
umlal v6.2d,v29.2s,v3.s[0]
658
umlal v7.2d,v29.2s,v3.s[1]
659
umlal v8.2d,v29.2s,v3.s[2]
660
umlal v9.2d,v29.2s,v3.s[3]
661
st1 {v10.2d},[x7],#16
662
umlal v11.2d,v28.2s,v0.s[0]
663
ld1 {v10.2d},[x6]
664
umlal v12.2d,v28.2s,v0.s[1]
665
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5]
666
umlal v13.2d,v28.2s,v0.s[2]
667
b.eq .LInner_jump5
668
add x6,x6,#16 // don't advance in last iteration
669
.LInner_jump5:
670
umlal v6.2d,v28.2s,v0.s[3]
671
umlal v7.2d,v28.2s,v1.s[0]
672
umlal v8.2d,v28.2s,v1.s[1]
673
umlal v9.2d,v28.2s,v1.s[2]
674
umlal v10.2d,v28.2s,v1.s[3]
675
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6]
676
umlal v11.2d,v29.2s,v2.s[0]
677
umlal v12.2d,v29.2s,v2.s[1]
678
umlal v13.2d,v29.2s,v2.s[2]
679
umlal v6.2d,v29.2s,v2.s[3]
680
umlal v7.2d,v29.2s,v3.s[0]
681
umlal v8.2d,v29.2s,v3.s[1]
682
umlal v9.2d,v29.2s,v3.s[2]
683
umlal v10.2d,v29.2s,v3.s[3]
684
st1 {v11.2d},[x7],#16
685
umlal v12.2d,v28.2s,v0.s[0]
686
ld1 {v11.2d},[x6]
687
umlal v13.2d,v28.2s,v0.s[1]
688
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6]
689
umlal v6.2d,v28.2s,v0.s[2]
690
b.eq .LInner_jump6
691
add x6,x6,#16 // don't advance in last iteration
692
.LInner_jump6:
693
umlal v7.2d,v28.2s,v0.s[3]
694
umlal v8.2d,v28.2s,v1.s[0]
695
umlal v9.2d,v28.2s,v1.s[1]
696
umlal v10.2d,v28.2s,v1.s[2]
697
umlal v11.2d,v28.2s,v1.s[3]
698
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7]
699
umlal v12.2d,v29.2s,v2.s[0]
700
umlal v13.2d,v29.2s,v2.s[1]
701
umlal v6.2d,v29.2s,v2.s[2]
702
umlal v7.2d,v29.2s,v2.s[3]
703
umlal v8.2d,v29.2s,v3.s[0]
704
umlal v9.2d,v29.2s,v3.s[1]
705
umlal v10.2d,v29.2s,v3.s[2]
706
umlal v11.2d,v29.2s,v3.s[3]
707
st1 {v12.2d},[x7],#16
708
umlal v13.2d,v28.2s,v0.s[0]
709
ld1 {v12.2d},[x6]
710
umlal v6.2d,v28.2s,v0.s[1]
711
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7]
712
umlal v7.2d,v28.2s,v0.s[2]
713
b.eq .LInner_jump7
714
add x6,x6,#16 // don't advance in last iteration
715
.LInner_jump7:
716
umlal v8.2d,v28.2s,v0.s[3]
717
umlal v9.2d,v28.2s,v1.s[0]
718
umlal v10.2d,v28.2s,v1.s[1]
719
umlal v11.2d,v28.2s,v1.s[2]
720
umlal v12.2d,v28.2s,v1.s[3]
721
b.ne .LInner_after_rewind8
722
sub x1,x1,x5,lsl#2 // rewind
723
.LInner_after_rewind8:
724
umlal v13.2d,v29.2s,v2.s[0]
725
ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
726
umlal v6.2d,v29.2s,v2.s[1]
727
ld1 {v0.4s,v1.4s},[x1],#32
728
umlal v7.2d,v29.2s,v2.s[2]
729
add x10,sp,#8 // rewind
730
umlal v8.2d,v29.2s,v2.s[3]
731
umlal v9.2d,v29.2s,v3.s[0]
732
umlal v10.2d,v29.2s,v3.s[1]
733
umlal v11.2d,v29.2s,v3.s[2]
734
st1 {v13.2d},[x7],#16
735
umlal v12.2d,v29.2s,v3.s[3]
736
737
bne .LNEON_8n_inner
738
add x6,sp,#128
739
st1 {v6.2d,v7.2d},[x7],#32
740
eor v2.16b,v2.16b,v2.16b // v2
741
st1 {v8.2d,v9.2d},[x7],#32
742
eor v3.16b,v3.16b,v3.16b // v3
743
st1 {v10.2d,v11.2d},[x7],#32
744
st1 {v12.2d},[x7]
745
746
subs x9,x9,#8
747
ld1 {v6.2d,v7.2d},[x6],#32
748
ld1 {v8.2d,v9.2d},[x6],#32
749
ld1 {v10.2d,v11.2d},[x6],#32
750
ld1 {v12.2d,v13.2d},[x6],#32
751
752
b.eq .LInner_8n_jump_2steps
753
sub x3,x3,x5,lsl#2 // rewind
754
b .LNEON_8n_outer
755
756
.LInner_8n_jump_2steps:
757
add x7,sp,#128
758
st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame
759
mov v5.16b,v6.16b
760
ushr v15.2d,v6.2d,#16
761
ext v6.16b,v6.16b,v6.16b,#8
762
st1 {v2.2d,v3.2d}, [sp],#32
763
add v6.2d,v6.2d,v15.2d
764
st1 {v2.2d,v3.2d}, [sp],#32
765
ushr v15.2d,v6.2d,#16
766
st1 {v2.2d,v3.2d}, [sp],#32
767
zip1 v6.4h,v5.4h,v6.4h
768
ins v15.d[1],v14.d[0]
769
770
mov x8,x5
771
b .LNEON_tail_entry
772
773
.align 4
774
.LNEON_tail:
775
add v6.2d,v6.2d,v15.2d
776
mov v5.16b,v6.16b
777
ushr v15.2d,v6.2d,#16
778
ext v6.16b,v6.16b,v6.16b,#8
779
ld1 {v8.2d,v9.2d}, [x6],#32
780
add v6.2d,v6.2d,v15.2d
781
ld1 {v10.2d,v11.2d}, [x6],#32
782
ushr v15.2d,v6.2d,#16
783
ld1 {v12.2d,v13.2d}, [x6],#32
784
zip1 v6.4h,v5.4h,v6.4h
785
ins v15.d[1],v14.d[0]
786
787
.LNEON_tail_entry:
788
add v7.2d,v7.2d,v15.2d
789
st1 {v6.s}[0], [x7],#4
790
ushr v15.2d,v7.2d,#16
791
mov v5.16b,v7.16b
792
ext v7.16b,v7.16b,v7.16b,#8
793
add v7.2d,v7.2d,v15.2d
794
ushr v15.2d,v7.2d,#16
795
zip1 v7.4h,v5.4h,v7.4h
796
ins v15.d[1],v14.d[0]
797
add v8.2d,v8.2d,v15.2d
798
st1 {v7.s}[0], [x7],#4
799
ushr v15.2d,v8.2d,#16
800
mov v5.16b,v8.16b
801
ext v8.16b,v8.16b,v8.16b,#8
802
add v8.2d,v8.2d,v15.2d
803
ushr v15.2d,v8.2d,#16
804
zip1 v8.4h,v5.4h,v8.4h
805
ins v15.d[1],v14.d[0]
806
add v9.2d,v9.2d,v15.2d
807
st1 {v8.s}[0], [x7],#4
808
ushr v15.2d,v9.2d,#16
809
mov v5.16b,v9.16b
810
ext v9.16b,v9.16b,v9.16b,#8
811
add v9.2d,v9.2d,v15.2d
812
ushr v15.2d,v9.2d,#16
813
zip1 v9.4h,v5.4h,v9.4h
814
ins v15.d[1],v14.d[0]
815
add v10.2d,v10.2d,v15.2d
816
st1 {v9.s}[0], [x7],#4
817
ushr v15.2d,v10.2d,#16
818
mov v5.16b,v10.16b
819
ext v10.16b,v10.16b,v10.16b,#8
820
add v10.2d,v10.2d,v15.2d
821
ushr v15.2d,v10.2d,#16
822
zip1 v10.4h,v5.4h,v10.4h
823
ins v15.d[1],v14.d[0]
824
add v11.2d,v11.2d,v15.2d
825
st1 {v10.s}[0], [x7],#4
826
ushr v15.2d,v11.2d,#16
827
mov v5.16b,v11.16b
828
ext v11.16b,v11.16b,v11.16b,#8
829
add v11.2d,v11.2d,v15.2d
830
ushr v15.2d,v11.2d,#16
831
zip1 v11.4h,v5.4h,v11.4h
832
ins v15.d[1],v14.d[0]
833
add v12.2d,v12.2d,v15.2d
834
st1 {v11.s}[0], [x7],#4
835
ushr v15.2d,v12.2d,#16
836
mov v5.16b,v12.16b
837
ext v12.16b,v12.16b,v12.16b,#8
838
add v12.2d,v12.2d,v15.2d
839
ushr v15.2d,v12.2d,#16
840
zip1 v12.4h,v5.4h,v12.4h
841
ins v15.d[1],v14.d[0]
842
add v13.2d,v13.2d,v15.2d
843
st1 {v12.s}[0], [x7],#4
844
ushr v15.2d,v13.2d,#16
845
mov v5.16b,v13.16b
846
ext v13.16b,v13.16b,v13.16b,#8
847
add v13.2d,v13.2d,v15.2d
848
ushr v15.2d,v13.2d,#16
849
zip1 v13.4h,v5.4h,v13.4h
850
ins v15.d[1],v14.d[0]
851
ld1 {v6.2d,v7.2d}, [x6],#32
852
subs x8,x8,#8
853
st1 {v13.s}[0], [x7],#4
854
bne .LNEON_tail
855
856
st1 {v15.s}[0], [x7],#4 // top-most bit
857
sub x3,x3,x5,lsl#2 // rewind x3
858
subs x1,sp,#0 // clear carry flag
859
add x2,sp,x5,lsl#2
860
861
.LNEON_sub:
862
ldp w4,w5,[x1],#8
863
ldp w6,w7,[x1],#8
864
ldp w8,w9,[x3],#8
865
ldp w10,w11,[x3],#8
866
sbcs w8,w4,w8
867
sbcs w9,w5,w9
868
sbcs w10,w6,w10
869
sbcs w11,w7,w11
870
sub x17,x2,x1
871
stp w8,w9,[x0],#8
872
stp w10,w11,[x0],#8
873
cbnz x17,.LNEON_sub
874
875
ldr w10, [x1] // load top-most bit
876
mov x11,sp
877
eor v0.16b,v0.16b,v0.16b
878
sub x11,x2,x11 // this is num*4
879
eor v1.16b,v1.16b,v1.16b
880
mov x1,sp
881
sub x0,x0,x11 // rewind x0
882
mov x3,x2 // second 3/4th of frame
883
sbcs w10,w10,wzr // result is carry flag
884
885
.LNEON_copy_n_zap:
886
ldp w4,w5,[x1],#8
887
ldp w6,w7,[x1],#8
888
ldp w8,w9,[x0],#8
889
ldp w10,w11,[x0]
890
sub x0,x0,#8
891
b.cs .LCopy_1
892
mov w8,w4
893
mov w9,w5
894
mov w10,w6
895
mov w11,w7
896
.LCopy_1:
897
st1 {v0.2d,v1.2d}, [x3],#32 // wipe
898
st1 {v0.2d,v1.2d}, [x3],#32 // wipe
899
ldp w4,w5,[x1],#8
900
ldp w6,w7,[x1],#8
901
stp w8,w9,[x0],#8
902
stp w10,w11,[x0],#8
903
sub x1,x1,#32
904
ldp w8,w9,[x0],#8
905
ldp w10,w11,[x0]
906
sub x0,x0,#8
907
b.cs .LCopy_2
908
mov w8, w4
909
mov w9, w5
910
mov w10, w6
911
mov w11, w7
912
.LCopy_2:
913
st1 {v0.2d,v1.2d}, [x1],#32 // wipe
914
st1 {v0.2d,v1.2d}, [x3],#32 // wipe
915
sub x17,x2,x1 // preserves carry
916
stp w8,w9,[x0],#8
917
stp w10,w11,[x0],#8
918
cbnz x17,.LNEON_copy_n_zap
919
920
mov sp,x16
921
ldp d14,d15,[sp,#64]
922
ldp d12,d13,[sp,#48]
923
ldp d10,d11,[sp,#32]
924
ldp d8,d9,[sp,#16]
925
ldr x29,[sp],#80
926
AARCH64_VALIDATE_LINK_REGISTER
927
ret // bx lr
928
929
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
930
.type __bn_sqr8x_mont,%function
931
.align 5
932
__bn_sqr8x_mont:
933
cmp x1,x2
934
b.ne __bn_mul4x_mont
935
.Lsqr8x_mont:
936
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
937
// only from bn_mul_mont which has already signed the return address.
938
stp x29,x30,[sp,#-128]!
939
add x29,sp,#0
940
stp x19,x20,[sp,#16]
941
stp x21,x22,[sp,#32]
942
stp x23,x24,[sp,#48]
943
stp x25,x26,[sp,#64]
944
stp x27,x28,[sp,#80]
945
stp x0,x3,[sp,#96] // offload rp and np
946
947
ldp x6,x7,[x1,#8*0]
948
ldp x8,x9,[x1,#8*2]
949
ldp x10,x11,[x1,#8*4]
950
ldp x12,x13,[x1,#8*6]
951
952
sub x2,sp,x5,lsl#4
953
lsl x5,x5,#3
954
ldr x4,[x4] // *n0
955
mov sp,x2 // alloca
956
sub x27,x5,#8*8
957
b .Lsqr8x_zero_start
958
959
.Lsqr8x_zero:
960
sub x27,x27,#8*8
961
stp xzr,xzr,[x2,#8*0]
962
stp xzr,xzr,[x2,#8*2]
963
stp xzr,xzr,[x2,#8*4]
964
stp xzr,xzr,[x2,#8*6]
965
.Lsqr8x_zero_start:
966
stp xzr,xzr,[x2,#8*8]
967
stp xzr,xzr,[x2,#8*10]
968
stp xzr,xzr,[x2,#8*12]
969
stp xzr,xzr,[x2,#8*14]
970
add x2,x2,#8*16
971
cbnz x27,.Lsqr8x_zero
972
973
add x3,x1,x5
974
add x1,x1,#8*8
975
mov x19,xzr
976
mov x20,xzr
977
mov x21,xzr
978
mov x22,xzr
979
mov x23,xzr
980
mov x24,xzr
981
mov x25,xzr
982
mov x26,xzr
983
mov x2,sp
984
str x4,[x29,#112] // offload n0
985
986
// Multiply everything but a[i]*a[i]
987
.align 4
988
.Lsqr8x_outer_loop:
989
// a[1]a[0] (i)
990
// a[2]a[0]
991
// a[3]a[0]
992
// a[4]a[0]
993
// a[5]a[0]
994
// a[6]a[0]
995
// a[7]a[0]
996
// a[2]a[1] (ii)
997
// a[3]a[1]
998
// a[4]a[1]
999
// a[5]a[1]
1000
// a[6]a[1]
1001
// a[7]a[1]
1002
// a[3]a[2] (iii)
1003
// a[4]a[2]
1004
// a[5]a[2]
1005
// a[6]a[2]
1006
// a[7]a[2]
1007
// a[4]a[3] (iv)
1008
// a[5]a[3]
1009
// a[6]a[3]
1010
// a[7]a[3]
1011
// a[5]a[4] (v)
1012
// a[6]a[4]
1013
// a[7]a[4]
1014
// a[6]a[5] (vi)
1015
// a[7]a[5]
1016
// a[7]a[6] (vii)
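// That is, the off-diagonal products a[i]*a[j], i > j. Their sum is
// doubled and the diagonal a[i]*a[i] terms are added after
// .Lsqr8x_outer_break below.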
1017
1018
mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
1019
mul x15,x8,x6
1020
mul x16,x9,x6
1021
mul x17,x10,x6
1022
adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
1023
mul x14,x11,x6
1024
adcs x21,x21,x15
1025
mul x15,x12,x6
1026
adcs x22,x22,x16
1027
mul x16,x13,x6
1028
adcs x23,x23,x17
1029
umulh x17,x7,x6 // hi(a[1..7]*a[0])
1030
adcs x24,x24,x14
1031
umulh x14,x8,x6
1032
adcs x25,x25,x15
1033
umulh x15,x9,x6
1034
adcs x26,x26,x16
1035
umulh x16,x10,x6
1036
stp x19,x20,[x2],#8*2 // t[0..1]
1037
adc x19,xzr,xzr // t[8]
1038
adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
1039
umulh x17,x11,x6
1040
adcs x22,x22,x14
1041
umulh x14,x12,x6
1042
adcs x23,x23,x15
1043
umulh x15,x13,x6
1044
adcs x24,x24,x16
1045
mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
1046
adcs x25,x25,x17
1047
mul x17,x9,x7
1048
adcs x26,x26,x14
1049
mul x14,x10,x7
1050
adc x19,x19,x15
1051
1052
mul x15,x11,x7
1053
adds x22,x22,x16
1054
mul x16,x12,x7
1055
adcs x23,x23,x17
1056
mul x17,x13,x7
1057
adcs x24,x24,x14
1058
umulh x14,x8,x7 // hi(a[2..7]*a[1])
1059
adcs x25,x25,x15
1060
umulh x15,x9,x7
1061
adcs x26,x26,x16
1062
umulh x16,x10,x7
1063
adcs x19,x19,x17
1064
umulh x17,x11,x7
1065
stp x21,x22,[x2],#8*2 // t[2..3]
1066
adc x20,xzr,xzr // t[9]
1067
adds x23,x23,x14
1068
umulh x14,x12,x7
1069
adcs x24,x24,x15
1070
umulh x15,x13,x7
1071
adcs x25,x25,x16
1072
mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
1073
adcs x26,x26,x17
1074
mul x17,x10,x8
1075
adcs x19,x19,x14
1076
mul x14,x11,x8
1077
adc x20,x20,x15
1078
1079
mul x15,x12,x8
1080
adds x24,x24,x16
1081
mul x16,x13,x8
1082
adcs x25,x25,x17
1083
umulh x17,x9,x8 // hi(a[3..7]*a[2])
1084
adcs x26,x26,x14
1085
umulh x14,x10,x8
1086
adcs x19,x19,x15
1087
umulh x15,x11,x8
1088
adcs x20,x20,x16
1089
umulh x16,x12,x8
1090
stp x23,x24,[x2],#8*2 // t[4..5]
1091
adc x21,xzr,xzr // t[10]
1092
adds x25,x25,x17
1093
umulh x17,x13,x8
1094
adcs x26,x26,x14
1095
mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
1096
adcs x19,x19,x15
1097
mul x15,x11,x9
1098
adcs x20,x20,x16
1099
mul x16,x12,x9
1100
adc x21,x21,x17
1101
1102
mul x17,x13,x9
1103
adds x26,x26,x14
1104
umulh x14,x10,x9 // hi(a[4..7]*a[3])
1105
adcs x19,x19,x15
1106
umulh x15,x11,x9
1107
adcs x20,x20,x16
1108
umulh x16,x12,x9
1109
adcs x21,x21,x17
1110
umulh x17,x13,x9
1111
stp x25,x26,[x2],#8*2 // t[6..7]
1112
adc x22,xzr,xzr // t[11]
1113
adds x19,x19,x14
1114
mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
1115
adcs x20,x20,x15
1116
mul x15,x12,x10
1117
adcs x21,x21,x16
1118
mul x16,x13,x10
1119
adc x22,x22,x17
1120
1121
umulh x17,x11,x10 // hi(a[5..7]*a[4])
1122
adds x20,x20,x14
1123
umulh x14,x12,x10
1124
adcs x21,x21,x15
1125
umulh x15,x13,x10
1126
adcs x22,x22,x16
1127
mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
1128
adc x23,xzr,xzr // t[12]
1129
adds x21,x21,x17
1130
mul x17,x13,x11
1131
adcs x22,x22,x14
1132
umulh x14,x12,x11 // hi(a[6..7]*a[5])
1133
adc x23,x23,x15
1134
1135
umulh x15,x13,x11
1136
adds x22,x22,x16
1137
mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
1138
adcs x23,x23,x17
1139
umulh x17,x13,x12 // hi(a[7]*a[6])
1140
adc x24,xzr,xzr // t[13]
1141
adds x23,x23,x14
1142
sub x27,x3,x1 // done yet?
1143
adc x24,x24,x15
1144
1145
adds x24,x24,x16
1146
sub x14,x3,x5 // rewound ap
1147
adc x25,xzr,xzr // t[14]
1148
add x25,x25,x17
1149
1150
cbz x27,.Lsqr8x_outer_break
1151
1152
mov x4,x6
1153
ldp x6,x7,[x2,#8*0]
1154
ldp x8,x9,[x2,#8*2]
1155
ldp x10,x11,[x2,#8*4]
1156
ldp x12,x13,[x2,#8*6]
1157
adds x19,x19,x6
1158
adcs x20,x20,x7
1159
ldp x6,x7,[x1,#8*0]
1160
adcs x21,x21,x8
1161
adcs x22,x22,x9
1162
ldp x8,x9,[x1,#8*2]
1163
adcs x23,x23,x10
1164
adcs x24,x24,x11
1165
ldp x10,x11,[x1,#8*4]
1166
adcs x25,x25,x12
1167
mov x0,x1
1168
adcs x26,xzr,x13
1169
ldp x12,x13,[x1,#8*6]
1170
add x1,x1,#8*8
1171
//adc x28,xzr,xzr // moved below
1172
mov x27,#-8*8
1173
1174
// a[8]a[0]
1175
// a[9]a[0]
1176
// a[a]a[0]
1177
// a[b]a[0]
1178
// a[c]a[0]
1179
// a[d]a[0]
1180
// a[e]a[0]
1181
// a[f]a[0]
1182
// a[8]a[1]
1183
// a[f]a[1]........................
1184
// a[8]a[2]
1185
// a[f]a[2]........................
1186
// a[8]a[3]
1187
// a[f]a[3]........................
1188
// a[8]a[4]
1189
// a[f]a[4]........................
1190
// a[8]a[5]
1191
// a[f]a[5]........................
1192
// a[8]a[6]
1193
// a[f]a[6]........................
1194
// a[8]a[7]
1195
// a[f]a[7]........................
1196
.Lsqr8x_mul:
1197
mul x14,x6,x4
1198
adc x28,xzr,xzr // carry bit, modulo-scheduled
1199
mul x15,x7,x4
1200
add x27,x27,#8
1201
mul x16,x8,x4
1202
mul x17,x9,x4
1203
adds x19,x19,x14
1204
mul x14,x10,x4
1205
adcs x20,x20,x15
1206
mul x15,x11,x4
1207
adcs x21,x21,x16
1208
mul x16,x12,x4
1209
adcs x22,x22,x17
1210
mul x17,x13,x4
1211
adcs x23,x23,x14
1212
umulh x14,x6,x4
1213
adcs x24,x24,x15
1214
umulh x15,x7,x4
1215
adcs x25,x25,x16
1216
umulh x16,x8,x4
1217
adcs x26,x26,x17
1218
umulh x17,x9,x4
1219
adc x28,x28,xzr
1220
str x19,[x2],#8
1221
adds x19,x20,x14
1222
umulh x14,x10,x4
1223
adcs x20,x21,x15
1224
umulh x15,x11,x4
1225
adcs x21,x22,x16
1226
umulh x16,x12,x4
1227
adcs x22,x23,x17
1228
umulh x17,x13,x4
1229
ldr x4,[x0,x27]
1230
adcs x23,x24,x14
1231
adcs x24,x25,x15
1232
adcs x25,x26,x16
1233
adcs x26,x28,x17
1234
//adc x28,xzr,xzr // moved above
1235
cbnz x27,.Lsqr8x_mul
1236
// note that carry flag is guaranteed
1237
// to be zero at this point
1238
cmp x1,x3 // done yet?
1239
b.eq .Lsqr8x_break
1240
1241
ldp x6,x7,[x2,#8*0]
1242
ldp x8,x9,[x2,#8*2]
1243
ldp x10,x11,[x2,#8*4]
1244
ldp x12,x13,[x2,#8*6]
1245
adds x19,x19,x6
1246
ldur x4,[x0,#-8*8]
1247
adcs x20,x20,x7
1248
ldp x6,x7,[x1,#8*0]
1249
adcs x21,x21,x8
1250
adcs x22,x22,x9
1251
ldp x8,x9,[x1,#8*2]
1252
adcs x23,x23,x10
1253
adcs x24,x24,x11
1254
ldp x10,x11,[x1,#8*4]
1255
adcs x25,x25,x12
1256
mov x27,#-8*8
1257
adcs x26,x26,x13
1258
ldp x12,x13,[x1,#8*6]
1259
add x1,x1,#8*8
1260
//adc x28,xzr,xzr // moved above
1261
b .Lsqr8x_mul
1262
1263
.align 4
1264
.Lsqr8x_break:
1265
ldp x6,x7,[x0,#8*0]
1266
add x1,x0,#8*8
1267
ldp x8,x9,[x0,#8*2]
1268
sub x14,x3,x1 // is it last iteration?
1269
ldp x10,x11,[x0,#8*4]
1270
sub x15,x2,x14
1271
ldp x12,x13,[x0,#8*6]
1272
cbz x14,.Lsqr8x_outer_loop
1273
1274
stp x19,x20,[x2,#8*0]
1275
ldp x19,x20,[x15,#8*0]
1276
stp x21,x22,[x2,#8*2]
1277
ldp x21,x22,[x15,#8*2]
1278
stp x23,x24,[x2,#8*4]
1279
ldp x23,x24,[x15,#8*4]
1280
stp x25,x26,[x2,#8*6]
1281
mov x2,x15
1282
ldp x25,x26,[x15,#8*6]
1283
b .Lsqr8x_outer_loop
1284
1285
.align 4
1286
.Lsqr8x_outer_break:
1287
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
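// (Since a^2 = 2*sum_{i>j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i),
// the accumulated cross products are shifted left by one bit as they
// are folded in with the squares below.)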
1288
ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
1289
ldp x15,x16,[sp,#8*1]
1290
ldp x11,x13,[x14,#8*2]
1291
add x1,x14,#8*4
1292
ldp x17,x14,[sp,#8*3]
1293
1294
stp x19,x20,[x2,#8*0]
1295
mul x19,x7,x7
1296
stp x21,x22,[x2,#8*2]
1297
umulh x7,x7,x7
1298
stp x23,x24,[x2,#8*4]
1299
mul x8,x9,x9
1300
stp x25,x26,[x2,#8*6]
1301
mov x2,sp
1302
umulh x9,x9,x9
1303
adds x20,x7,x15,lsl#1
1304
extr x15,x16,x15,#63
1305
sub x27,x5,#8*4
1306
1307
.Lsqr4x_shift_n_add:
1308
adcs x21,x8,x15
1309
extr x16,x17,x16,#63
1310
sub x27,x27,#8*4
1311
adcs x22,x9,x16
1312
ldp x15,x16,[x2,#8*5]
1313
mul x10,x11,x11
1314
ldp x7,x9,[x1],#8*2
1315
umulh x11,x11,x11
1316
mul x12,x13,x13
1317
umulh x13,x13,x13
1318
extr x17,x14,x17,#63
1319
stp x19,x20,[x2,#8*0]
1320
adcs x23,x10,x17
1321
extr x14,x15,x14,#63
1322
stp x21,x22,[x2,#8*2]
1323
adcs x24,x11,x14
1324
ldp x17,x14,[x2,#8*7]
1325
extr x15,x16,x15,#63
1326
adcs x25,x12,x15
1327
extr x16,x17,x16,#63
1328
adcs x26,x13,x16
1329
ldp x15,x16,[x2,#8*9]
1330
mul x6,x7,x7
1331
ldp x11,x13,[x1],#8*2
1332
umulh x7,x7,x7
1333
mul x8,x9,x9
1334
umulh x9,x9,x9
1335
stp x23,x24,[x2,#8*4]
1336
extr x17,x14,x17,#63
1337
stp x25,x26,[x2,#8*6]
1338
add x2,x2,#8*8
1339
adcs x19,x6,x17
1340
extr x14,x15,x14,#63
1341
adcs x20,x7,x14
1342
ldp x17,x14,[x2,#8*3]
1343
extr x15,x16,x15,#63
1344
cbnz x27,.Lsqr4x_shift_n_add
1345
ldp x1,x4,[x29,#104] // pull np and n0
1346
1347
adcs x21,x8,x15
1348
extr x16,x17,x16,#63
1349
adcs x22,x9,x16
1350
ldp x15,x16,[x2,#8*5]
1351
mul x10,x11,x11
1352
umulh x11,x11,x11
1353
stp x19,x20,[x2,#8*0]
1354
mul x12,x13,x13
1355
umulh x13,x13,x13
1356
stp x21,x22,[x2,#8*2]
1357
extr x17,x14,x17,#63
1358
adcs x23,x10,x17
1359
extr x14,x15,x14,#63
1360
ldp x19,x20,[sp,#8*0]
1361
adcs x24,x11,x14
1362
extr x15,x16,x15,#63
1363
ldp x6,x7,[x1,#8*0]
1364
adcs x25,x12,x15
1365
extr x16,xzr,x16,#63
1366
ldp x8,x9,[x1,#8*2]
1367
adc x26,x13,x16
1368
ldp x10,x11,[x1,#8*4]
1369
1370
// Reduce by 512 bits per iteration
1371
mul x28,x4,x19 // t[0]*n0
1372
ldp x12,x13,[x1,#8*6]
1373
add x3,x1,x5
1374
ldp x21,x22,[sp,#8*2]
1375
stp x23,x24,[x2,#8*4]
1376
ldp x23,x24,[sp,#8*4]
1377
stp x25,x26,[x2,#8*6]
1378
ldp x25,x26,[sp,#8*6]
1379
add x1,x1,#8*8
1380
mov x30,xzr // initial top-most carry
1381
mov x2,sp
1382
mov x27,#8
1383
1384
.Lsqr8x_reduction:
1385
// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
1386
mul x15,x7,x28
1387
sub x27,x27,#1
1388
mul x16,x8,x28
1389
str x28,[x2],#8 // put aside t[0]*n0 for tail processing
1390
mul x17,x9,x28
1391
// (*) adds xzr,x19,x14
1392
subs xzr,x19,#1 // (*)
1393
mul x14,x10,x28
1394
adcs x19,x20,x15
1395
mul x15,x11,x28
1396
adcs x20,x21,x16
1397
mul x16,x12,x28
1398
adcs x21,x22,x17
1399
mul x17,x13,x28
1400
adcs x22,x23,x14
1401
umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
1402
adcs x23,x24,x15
1403
umulh x15,x7,x28
1404
adcs x24,x25,x16
1405
umulh x16,x8,x28
1406
adcs x25,x26,x17
1407
umulh x17,x9,x28
1408
adc x26,xzr,xzr
1409
adds x19,x19,x14
1410
umulh x14,x10,x28
1411
adcs x20,x20,x15
1412
umulh x15,x11,x28
1413
adcs x21,x21,x16
1414
umulh x16,x12,x28
1415
adcs x22,x22,x17
1416
umulh x17,x13,x28
1417
mul x28,x4,x19 // next t[0]*n0
1418
adcs x23,x23,x14
1419
adcs x24,x24,x15
1420
adcs x25,x25,x16
1421
adc x26,x26,x17
1422
cbnz x27,.Lsqr8x_reduction
1423
1424
ldp x14,x15,[x2,#8*0]
1425
ldp x16,x17,[x2,#8*2]
1426
mov x0,x2
1427
sub x27,x3,x1 // done yet?
1428
adds x19,x19,x14
1429
adcs x20,x20,x15
1430
ldp x14,x15,[x2,#8*4]
1431
adcs x21,x21,x16
1432
adcs x22,x22,x17
1433
ldp x16,x17,[x2,#8*6]
1434
adcs x23,x23,x14
1435
adcs x24,x24,x15
1436
adcs x25,x25,x16
1437
adcs x26,x26,x17
1438
//adc x28,xzr,xzr // moved below
1439
cbz x27,.Lsqr8x8_post_condition
1440
1441
ldur x4,[x2,#-8*8]
1442
ldp x6,x7,[x1,#8*0]
1443
ldp x8,x9,[x1,#8*2]
1444
ldp x10,x11,[x1,#8*4]
1445
mov x27,#-8*8
1446
ldp x12,x13,[x1,#8*6]
1447
add x1,x1,#8*8
1448
1449
.Lsqr8x_tail:
1450
mul x14,x6,x4
1451
adc x28,xzr,xzr // carry bit, modulo-scheduled
1452
mul x15,x7,x4
1453
add x27,x27,#8
1454
mul x16,x8,x4
1455
mul x17,x9,x4
1456
adds x19,x19,x14
1457
mul x14,x10,x4
1458
adcs x20,x20,x15
1459
mul x15,x11,x4
1460
adcs x21,x21,x16
1461
mul x16,x12,x4
1462
adcs x22,x22,x17
1463
mul x17,x13,x4
1464
adcs x23,x23,x14
1465
umulh x14,x6,x4
1466
adcs x24,x24,x15
1467
umulh x15,x7,x4
1468
adcs x25,x25,x16
1469
umulh x16,x8,x4
1470
adcs x26,x26,x17
1471
umulh x17,x9,x4
1472
adc x28,x28,xzr
1473
str x19,[x2],#8
1474
adds x19,x20,x14
1475
umulh x14,x10,x4
1476
adcs x20,x21,x15
1477
umulh x15,x11,x4
1478
adcs x21,x22,x16
1479
umulh x16,x12,x4
1480
adcs x22,x23,x17
1481
umulh x17,x13,x4
1482
ldr x4,[x0,x27]
1483
adcs x23,x24,x14
1484
adcs x24,x25,x15
1485
adcs x25,x26,x16
1486
adcs x26,x28,x17
1487
//adc x28,xzr,xzr // moved above
1488
cbnz x27,.Lsqr8x_tail
1489
// note that carry flag is guaranteed
1490
// to be zero at this point
1491
ldp x6,x7,[x2,#8*0]
1492
sub x27,x3,x1 // done yet?
1493
sub x16,x3,x5 // rewound np
1494
ldp x8,x9,[x2,#8*2]
1495
ldp x10,x11,[x2,#8*4]
1496
ldp x12,x13,[x2,#8*6]
1497
cbz x27,.Lsqr8x_tail_break
1498
1499
ldur x4,[x0,#-8*8]
1500
adds x19,x19,x6
1501
adcs x20,x20,x7
1502
ldp x6,x7,[x1,#8*0]
1503
adcs x21,x21,x8
1504
adcs x22,x22,x9
1505
ldp x8,x9,[x1,#8*2]
1506
adcs x23,x23,x10
1507
adcs x24,x24,x11
1508
ldp x10,x11,[x1,#8*4]
1509
adcs x25,x25,x12
1510
mov x27,#-8*8
1511
adcs x26,x26,x13
1512
ldp x12,x13,[x1,#8*6]
1513
add x1,x1,#8*8
1514
//adc x28,xzr,xzr // moved above
1515
b .Lsqr8x_tail
1516
1517
.align 4
1518
.Lsqr8x_tail_break:
1519
ldr x4,[x29,#112] // pull n0
1520
add x27,x2,#8*8 // end of current t[num] window
1521
1522
subs xzr,x30,#1 // "move" top-most carry to carry bit
1523
adcs x14,x19,x6
1524
adcs x15,x20,x7
1525
ldp x19,x20,[x0,#8*0]
1526
adcs x21,x21,x8
1527
ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
1528
adcs x22,x22,x9
1529
ldp x8,x9,[x16,#8*2]
1530
adcs x23,x23,x10
1531
adcs x24,x24,x11
1532
ldp x10,x11,[x16,#8*4]
1533
adcs x25,x25,x12
1534
adcs x26,x26,x13
1535
ldp x12,x13,[x16,#8*6]
1536
add x1,x16,#8*8
1537
adc x30,xzr,xzr // top-most carry
1538
mul x28,x4,x19
1539
stp x14,x15,[x2,#8*0]
1540
stp x21,x22,[x2,#8*2]
1541
ldp x21,x22,[x0,#8*2]
1542
stp x23,x24,[x2,#8*4]
1543
ldp x23,x24,[x0,#8*4]
1544
cmp x27,x29 // did we hit the bottom?
1545
stp x25,x26,[x2,#8*6]
1546
mov x2,x0 // slide the window
1547
ldp x25,x26,[x0,#8*6]
1548
mov x27,#8
1549
b.ne .Lsqr8x_reduction
1550
1551
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus, see whether
// that borrowed, and conditionally copy the original value.
1555
ldr x0,[x29,#96] // pull rp
1556
add x2,x2,#8*8
1557
subs x14,x19,x6
1558
sbcs x15,x20,x7
1559
sub x27,x5,#8*8
1560
mov x3,x0 // x0 copy
1561
1562
.Lsqr8x_sub:
1563
sbcs x16,x21,x8
1564
ldp x6,x7,[x1,#8*0]
1565
sbcs x17,x22,x9
1566
stp x14,x15,[x0,#8*0]
1567
sbcs x14,x23,x10
1568
ldp x8,x9,[x1,#8*2]
1569
sbcs x15,x24,x11
1570
stp x16,x17,[x0,#8*2]
1571
sbcs x16,x25,x12
1572
ldp x10,x11,[x1,#8*4]
1573
sbcs x17,x26,x13
1574
ldp x12,x13,[x1,#8*6]
1575
add x1,x1,#8*8
1576
ldp x19,x20,[x2,#8*0]
1577
sub x27,x27,#8*8
1578
ldp x21,x22,[x2,#8*2]
1579
ldp x23,x24,[x2,#8*4]
1580
ldp x25,x26,[x2,#8*6]
1581
add x2,x2,#8*8
1582
stp x14,x15,[x0,#8*4]
1583
sbcs x14,x19,x6
1584
stp x16,x17,[x0,#8*6]
1585
add x0,x0,#8*8
1586
sbcs x15,x20,x7
1587
cbnz x27,.Lsqr8x_sub
1588
1589
sbcs x16,x21,x8
1590
mov x2,sp
1591
add x1,sp,x5
1592
ldp x6,x7,[x3,#8*0]
1593
sbcs x17,x22,x9
1594
stp x14,x15,[x0,#8*0]
1595
sbcs x14,x23,x10
1596
ldp x8,x9,[x3,#8*2]
1597
sbcs x15,x24,x11
1598
stp x16,x17,[x0,#8*2]
1599
sbcs x16,x25,x12
1600
ldp x19,x20,[x1,#8*0]
1601
sbcs x17,x26,x13
1602
ldp x21,x22,[x1,#8*2]
1603
sbcs xzr,x30,xzr // did it borrow?
1604
ldr x30,[x29,#8] // pull return address
1605
stp x14,x15,[x0,#8*4]
1606
stp x16,x17,[x0,#8*6]
1607
1608
sub x27,x5,#8*4
1609
.Lsqr4x_cond_copy:
1610
sub x27,x27,#8*4
1611
csel x14,x19,x6,lo
1612
stp xzr,xzr,[x2,#8*0]
1613
csel x15,x20,x7,lo
1614
ldp x6,x7,[x3,#8*4]
1615
ldp x19,x20,[x1,#8*4]
1616
csel x16,x21,x8,lo
1617
stp xzr,xzr,[x2,#8*2]
1618
add x2,x2,#8*4
1619
csel x17,x22,x9,lo
1620
ldp x8,x9,[x3,#8*6]
1621
ldp x21,x22,[x1,#8*6]
1622
add x1,x1,#8*4
1623
stp x14,x15,[x3,#8*0]
1624
stp x16,x17,[x3,#8*2]
1625
add x3,x3,#8*4
1626
stp xzr,xzr,[x1,#8*0]
1627
stp xzr,xzr,[x1,#8*2]
1628
cbnz x27,.Lsqr4x_cond_copy
1629
1630
csel x14,x19,x6,lo
1631
stp xzr,xzr,[x2,#8*0]
1632
csel x15,x20,x7,lo
1633
stp xzr,xzr,[x2,#8*2]
1634
csel x16,x21,x8,lo
1635
csel x17,x22,x9,lo
1636
stp x14,x15,[x3,#8*0]
1637
stp x16,x17,[x3,#8*2]
1638
1639
b .Lsqr8x_done
1640
1641
.align 4
1642
.Lsqr8x8_post_condition:
1643
adc x28,xzr,xzr
1644
ldr x30,[x29,#8] // pull return address
1645
// x19-x26,x28 hold result, x6-x13 hold modulus
1646
subs x6,x19,x6
1647
ldr x1,[x29,#96] // pull rp
1648
sbcs x7,x20,x7
1649
stp xzr,xzr,[sp,#8*0]
1650
sbcs x8,x21,x8
1651
stp xzr,xzr,[sp,#8*2]
1652
sbcs x9,x22,x9
1653
stp xzr,xzr,[sp,#8*4]
1654
sbcs x10,x23,x10
1655
stp xzr,xzr,[sp,#8*6]
1656
sbcs x11,x24,x11
1657
stp xzr,xzr,[sp,#8*8]
1658
sbcs x12,x25,x12
1659
stp xzr,xzr,[sp,#8*10]
1660
sbcs x13,x26,x13
1661
stp xzr,xzr,[sp,#8*12]
1662
sbcs x28,x28,xzr // did it borrow?
1663
stp xzr,xzr,[sp,#8*14]
1664
1665
// x6-x13 hold result-modulus
1666
csel x6,x19,x6,lo
1667
csel x7,x20,x7,lo
1668
csel x8,x21,x8,lo
1669
csel x9,x22,x9,lo
1670
stp x6,x7,[x1,#8*0]
1671
csel x10,x23,x10,lo
1672
csel x11,x24,x11,lo
1673
stp x8,x9,[x1,#8*2]
1674
csel x12,x25,x12,lo
1675
csel x13,x26,x13,lo
1676
stp x10,x11,[x1,#8*4]
1677
stp x12,x13,[x1,#8*6]
1678
1679
.Lsqr8x_done:
1680
ldp x19,x20,[x29,#16]
1681
mov sp,x29
1682
ldp x21,x22,[x29,#32]
1683
mov x0,#1
1684
ldp x23,x24,[x29,#48]
1685
ldp x25,x26,[x29,#64]
1686
ldp x27,x28,[x29,#80]
1687
ldr x29,[sp],#128
1688
// x30 is loaded earlier
1689
AARCH64_VALIDATE_LINK_REGISTER
1690
ret
1691
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1692
.type __bn_mul4x_mont,%function
1693
.align 5
1694
__bn_mul4x_mont:
1695
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1696
// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
1697
stp x29,x30,[sp,#-128]!
1698
add x29,sp,#0
1699
stp x19,x20,[sp,#16]
1700
stp x21,x22,[sp,#32]
1701
stp x23,x24,[sp,#48]
1702
stp x25,x26,[sp,#64]
1703
stp x27,x28,[sp,#80]
1704
1705
sub x26,sp,x5,lsl#3
1706
lsl x5,x5,#3
1707
ldr x4,[x4] // *n0
1708
sub sp,x26,#8*4 // alloca
1709
1710
add x10,x2,x5
1711
add x27,x1,x5
1712
stp x0,x10,[x29,#96] // offload rp and &b[num]
1713
1714
ldr x24,[x2,#8*0] // b[0]
1715
ldp x6,x7,[x1,#8*0] // a[0..3]
1716
ldp x8,x9,[x1,#8*2]
1717
add x1,x1,#8*4
1718
mov x19,xzr
1719
mov x20,xzr
1720
mov x21,xzr
1721
mov x22,xzr
1722
ldp x14,x15,[x3,#8*0] // n[0..3]
1723
ldp x16,x17,[x3,#8*2]
1724
adds x3,x3,#8*4 // clear carry bit
1725
mov x0,xzr
1726
mov x28,#0
1727
mov x26,sp
1728
1729
.Loop_mul4x_1st_reduction:
1730
mul x10,x6,x24 // lo(a[0..3]*b[0])
1731
adc x0,x0,xzr // modulo-scheduled
1732
mul x11,x7,x24
1733
add x28,x28,#8
1734
mul x12,x8,x24
1735
and x28,x28,#31
1736
mul x13,x9,x24
1737
adds x19,x19,x10
1738
umulh x10,x6,x24 // hi(a[0..3]*b[0])
1739
adcs x20,x20,x11
1740
mul x25,x19,x4 // t[0]*n0
1741
adcs x21,x21,x12
1742
umulh x11,x7,x24
1743
adcs x22,x22,x13
1744
umulh x12,x8,x24
1745
adc x23,xzr,xzr
1746
umulh x13,x9,x24
1747
ldr x24,[x2,x28] // next b[i] (or b[0])
1748
adds x20,x20,x10
1749
// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
1750
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1751
adcs x21,x21,x11
1752
mul x11,x15,x25
1753
adcs x22,x22,x12
1754
mul x12,x16,x25
1755
adc x23,x23,x13 // can't overflow
1756
mul x13,x17,x25
1757
// (*) adds xzr,x19,x10
1758
subs xzr,x19,#1 // (*)
1759
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1760
adcs x19,x20,x11
1761
umulh x11,x15,x25
1762
adcs x20,x21,x12
1763
umulh x12,x16,x25
1764
adcs x21,x22,x13
1765
umulh x13,x17,x25
1766
adcs x22,x23,x0
1767
adc x0,xzr,xzr
1768
adds x19,x19,x10
1769
sub x10,x27,x1
1770
adcs x20,x20,x11
1771
adcs x21,x21,x12
1772
adcs x22,x22,x13
1773
//adc x0,x0,xzr
1774
cbnz x28,.Loop_mul4x_1st_reduction
1775
1776
cbz x10,.Lmul4x4_post_condition
1777
1778
ldp x6,x7,[x1,#8*0] // a[4..7]
1779
ldp x8,x9,[x1,#8*2]
1780
add x1,x1,#8*4
1781
ldr x25,[sp] // a[0]*n0
1782
ldp x14,x15,[x3,#8*0] // n[4..7]
1783
ldp x16,x17,[x3,#8*2]
1784
add x3,x3,#8*4
1785
1786
.Loop_mul4x_1st_tail:
1787
mul x10,x6,x24 // lo(a[4..7]*b[i])
1788
adc x0,x0,xzr // modulo-scheduled
1789
mul x11,x7,x24
1790
add x28,x28,#8
1791
mul x12,x8,x24
1792
and x28,x28,#31
1793
mul x13,x9,x24
1794
adds x19,x19,x10
1795
umulh x10,x6,x24 // hi(a[4..7]*b[i])
1796
adcs x20,x20,x11
1797
umulh x11,x7,x24
1798
adcs x21,x21,x12
1799
umulh x12,x8,x24
1800
adcs x22,x22,x13
1801
umulh x13,x9,x24
1802
adc x23,xzr,xzr
1803
ldr x24,[x2,x28] // next b[i] (or b[0])
1804
adds x20,x20,x10
1805
mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
1806
adcs x21,x21,x11
1807
mul x11,x15,x25
1808
adcs x22,x22,x12
1809
mul x12,x16,x25
1810
adc x23,x23,x13 // can't overflow
1811
mul x13,x17,x25
1812
adds x19,x19,x10
1813
umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
1814
adcs x20,x20,x11
1815
umulh x11,x15,x25
1816
adcs x21,x21,x12
1817
umulh x12,x16,x25
1818
adcs x22,x22,x13
1819
adcs x23,x23,x0
1820
umulh x13,x17,x25
1821
adc x0,xzr,xzr
1822
ldr x25,[sp,x28] // next t[0]*n0
1823
str x19,[x26],#8 // result!!!
1824
adds x19,x20,x10
1825
sub x10,x27,x1 // done yet?
1826
adcs x20,x21,x11
1827
adcs x21,x22,x12
1828
adcs x22,x23,x13
1829
//adc x0,x0,xzr
1830
cbnz x28,.Loop_mul4x_1st_tail
1831
1832
sub x11,x27,x5 // rewound x1
1833
cbz x10,.Lmul4x_proceed
1834
1835
ldp x6,x7,[x1,#8*0]
1836
ldp x8,x9,[x1,#8*2]
1837
add x1,x1,#8*4
1838
ldp x14,x15,[x3,#8*0]
1839
ldp x16,x17,[x3,#8*2]
1840
add x3,x3,#8*4
1841
b .Loop_mul4x_1st_tail
1842
1843
.align 5
1844
.Lmul4x_proceed:
1845
ldr x24,[x2,#8*4]! // *++b
1846
adc x30,x0,xzr
1847
ldp x6,x7,[x11,#8*0] // a[0..3]
1848
sub x3,x3,x5 // rewind np
1849
ldp x8,x9,[x11,#8*2]
1850
add x1,x11,#8*4
1851
1852
stp x19,x20,[x26,#8*0] // result!!!
1853
ldp x19,x20,[sp,#8*4] // t[0..3]
1854
stp x21,x22,[x26,#8*2] // result!!!
1855
ldp x21,x22,[sp,#8*6]
1856
1857
ldp x14,x15,[x3,#8*0] // n[0..3]
1858
mov x26,sp
1859
ldp x16,x17,[x3,#8*2]
1860
adds x3,x3,#8*4 // clear carry bit
1861
mov x0,xzr
1862
1863
.align 4
1864
.Loop_mul4x_reduction:
1865
mul x10,x6,x24 // lo(a[0..3]*b[4])
1866
adc x0,x0,xzr // modulo-scheduled
1867
mul x11,x7,x24
1868
add x28,x28,#8
1869
mul x12,x8,x24
1870
and x28,x28,#31
1871
mul x13,x9,x24
1872
adds x19,x19,x10
1873
umulh x10,x6,x24 // hi(a[0..3]*b[4])
1874
adcs x20,x20,x11
1875
mul x25,x19,x4 // t[0]*n0
1876
adcs x21,x21,x12
1877
umulh x11,x7,x24
1878
adcs x22,x22,x13
1879
umulh x12,x8,x24
1880
adc x23,xzr,xzr
1881
umulh x13,x9,x24
1882
ldr x24,[x2,x28] // next b[i]
1883
adds x20,x20,x10
1884
// (*) mul x10,x14,x25
1885
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1886
adcs x21,x21,x11
1887
mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
1888
adcs x22,x22,x12
1889
mul x12,x16,x25
1890
adc x23,x23,x13 // can't overflow
1891
mul x13,x17,x25
1892
// (*) adds xzr,x19,x10
1893
subs xzr,x19,#1 // (*)
1894
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1895
adcs x19,x20,x11
1896
umulh x11,x15,x25
1897
adcs x20,x21,x12
1898
umulh x12,x16,x25
1899
adcs x21,x22,x13
1900
umulh x13,x17,x25
1901
adcs x22,x23,x0
1902
adc x0,xzr,xzr
1903
adds x19,x19,x10
1904
adcs x20,x20,x11
1905
adcs x21,x21,x12
1906
adcs x22,x22,x13
1907
//adc x0,x0,xzr
1908
cbnz x28,.Loop_mul4x_reduction
1909
1910
adc x0,x0,xzr
1911
ldp x10,x11,[x26,#8*4] // t[4..7]
1912
ldp x12,x13,[x26,#8*6]
1913
ldp x6,x7,[x1,#8*0] // a[4..7]
1914
ldp x8,x9,[x1,#8*2]
1915
add x1,x1,#8*4
1916
adds x19,x19,x10
1917
adcs x20,x20,x11
1918
adcs x21,x21,x12
1919
adcs x22,x22,x13
1920
//adc x0,x0,xzr
1921
1922
ldr x25,[sp] // t[0]*n0
1923
ldp x14,x15,[x3,#8*0] // n[4..7]
1924
ldp x16,x17,[x3,#8*2]
1925
add x3,x3,#8*4
1926
1927
.align 4
1928
.Loop_mul4x_tail:
1929
mul x10,x6,x24 // lo(a[4..7]*b[4])
1930
adc x0,x0,xzr // modulo-scheduled
1931
mul x11,x7,x24
1932
add x28,x28,#8
1933
mul x12,x8,x24
1934
and x28,x28,#31
1935
mul x13,x9,x24
1936
adds x19,x19,x10
1937
umulh x10,x6,x24 // hi(a[4..7]*b[4])
1938
adcs x20,x20,x11
1939
umulh x11,x7,x24
1940
adcs x21,x21,x12
1941
umulh x12,x8,x24
1942
adcs x22,x22,x13
1943
umulh x13,x9,x24
1944
adc x23,xzr,xzr
1945
ldr x24,[x2,x28] // next b[i]
1946
adds x20,x20,x10
1947
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1948
adcs x21,x21,x11
1949
mul x11,x15,x25
1950
adcs x22,x22,x12
1951
mul x12,x16,x25
1952
adc x23,x23,x13 // can't overflow
1953
mul x13,x17,x25
1954
adds x19,x19,x10
1955
umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1956
adcs x20,x20,x11
1957
umulh x11,x15,x25
1958
adcs x21,x21,x12
1959
umulh x12,x16,x25
1960
adcs x22,x22,x13
1961
umulh x13,x17,x25
1962
adcs x23,x23,x0
1963
ldr x25,[sp,x28] // next a[0]*n0
1964
adc x0,xzr,xzr
1965
str x19,[x26],#8 // result!!!
1966
adds x19,x20,x10
1967
sub x10,x27,x1 // done yet?
1968
adcs x20,x21,x11
1969
adcs x21,x22,x12
1970
adcs x22,x23,x13
1971
//adc x0,x0,xzr
1972
cbnz x28,.Loop_mul4x_tail
1973
1974
sub x11,x3,x5 // rewound np?
1975
adc x0,x0,xzr
1976
cbz x10,.Loop_mul4x_break
1977
1978
ldp x10,x11,[x26,#8*4]
1979
ldp x12,x13,[x26,#8*6]
1980
ldp x6,x7,[x1,#8*0]
1981
ldp x8,x9,[x1,#8*2]
1982
add x1,x1,#8*4
1983
adds x19,x19,x10
1984
adcs x20,x20,x11
1985
adcs x21,x21,x12
1986
adcs x22,x22,x13
1987
//adc x0,x0,xzr
1988
ldp x14,x15,[x3,#8*0]
1989
ldp x16,x17,[x3,#8*2]
1990
add x3,x3,#8*4
1991
b .Loop_mul4x_tail
1992
1993
.align 4
1994
.Loop_mul4x_break:
1995
ldp x12,x13,[x29,#96] // pull rp and &b[num]
1996
adds x19,x19,x30
1997
add x2,x2,#8*4 // bp++
1998
adcs x20,x20,xzr
1999
sub x1,x1,x5 // rewind ap
2000
adcs x21,x21,xzr
2001
stp x19,x20,[x26,#8*0] // result!!!
2002
adcs x22,x22,xzr
2003
ldp x19,x20,[sp,#8*4] // t[0..3]
2004
adc x30,x0,xzr
2005
stp x21,x22,[x26,#8*2] // result!!!
2006
cmp x2,x13 // done yet?
2007
ldp x21,x22,[sp,#8*6]
2008
ldp x14,x15,[x11,#8*0] // n[0..3]
2009
ldp x16,x17,[x11,#8*2]
2010
add x3,x11,#8*4
2011
b.eq .Lmul4x_post
2012
2013
ldr x24,[x2]
2014
ldp x6,x7,[x1,#8*0] // a[0..3]
2015
ldp x8,x9,[x1,#8*2]
2016
adds x1,x1,#8*4 // clear carry bit
2017
mov x0,xzr
2018
mov x26,sp
2019
b .Loop_mul4x_reduction
2020
2021
.align 4
2022
.Lmul4x_post:
2023
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus, see whether
// that borrowed, and conditionally copy the original value.
2027
mov x0,x12
2028
mov x27,x12 // x0 copy
2029
subs x10,x19,x14
2030
add x26,sp,#8*8
2031
sbcs x11,x20,x15
2032
sub x28,x5,#8*4
2033
2034
.Lmul4x_sub:
2035
sbcs x12,x21,x16
2036
ldp x14,x15,[x3,#8*0]
2037
sub x28,x28,#8*4
2038
ldp x19,x20,[x26,#8*0]
2039
sbcs x13,x22,x17
2040
ldp x16,x17,[x3,#8*2]
2041
add x3,x3,#8*4
2042
ldp x21,x22,[x26,#8*2]
2043
add x26,x26,#8*4
2044
stp x10,x11,[x0,#8*0]
2045
sbcs x10,x19,x14
2046
stp x12,x13,[x0,#8*2]
2047
add x0,x0,#8*4
2048
sbcs x11,x20,x15
2049
cbnz x28,.Lmul4x_sub
2050
2051
sbcs x12,x21,x16
2052
mov x26,sp
2053
add x1,sp,#8*4
2054
ldp x6,x7,[x27,#8*0]
2055
sbcs x13,x22,x17
2056
stp x10,x11,[x0,#8*0]
2057
ldp x8,x9,[x27,#8*2]
2058
stp x12,x13,[x0,#8*2]
2059
ldp x19,x20,[x1,#8*0]
2060
ldp x21,x22,[x1,#8*2]
2061
sbcs xzr,x30,xzr // did it borrow?
2062
ldr x30,[x29,#8] // pull return address
2063
2064
sub x28,x5,#8*4
2065
.Lmul4x_cond_copy:
2066
sub x28,x28,#8*4
2067
csel x10,x19,x6,lo
2068
stp xzr,xzr,[x26,#8*0]
2069
csel x11,x20,x7,lo
2070
ldp x6,x7,[x27,#8*4]
2071
ldp x19,x20,[x1,#8*4]
2072
csel x12,x21,x8,lo
2073
stp xzr,xzr,[x26,#8*2]
2074
add x26,x26,#8*4
2075
csel x13,x22,x9,lo
2076
ldp x8,x9,[x27,#8*6]
2077
ldp x21,x22,[x1,#8*6]
2078
add x1,x1,#8*4
2079
stp x10,x11,[x27,#8*0]
2080
stp x12,x13,[x27,#8*2]
2081
add x27,x27,#8*4
2082
cbnz x28,.Lmul4x_cond_copy
2083
2084
csel x10,x19,x6,lo
2085
stp xzr,xzr,[x26,#8*0]
2086
csel x11,x20,x7,lo
2087
stp xzr,xzr,[x26,#8*2]
2088
csel x12,x21,x8,lo
2089
stp xzr,xzr,[x26,#8*3]
2090
csel x13,x22,x9,lo
2091
stp xzr,xzr,[x26,#8*4]
2092
stp x10,x11,[x27,#8*0]
2093
stp x12,x13,[x27,#8*2]
2094
2095
b .Lmul4x_done
2096
2097
.align 4
2098
.Lmul4x4_post_condition:
2099
adc x0,x0,xzr
2100
ldr x1,[x29,#96] // pull rp
2101
// x19-x22,x0 hold result, x14-x17 hold modulus
2102
subs x6,x19,x14
2103
ldr x30,[x29,#8] // pull return address
2104
sbcs x7,x20,x15
2105
stp xzr,xzr,[sp,#8*0]
2106
sbcs x8,x21,x16
2107
stp xzr,xzr,[sp,#8*2]
2108
sbcs x9,x22,x17
2109
stp xzr,xzr,[sp,#8*4]
2110
sbcs xzr,x0,xzr // did it borrow?
2111
stp xzr,xzr,[sp,#8*6]
2112
2113
// x6-x9 hold result-modulus
2114
csel x6,x19,x6,lo
2115
csel x7,x20,x7,lo
2116
csel x8,x21,x8,lo
2117
csel x9,x22,x9,lo
2118
stp x6,x7,[x1,#8*0]
2119
stp x8,x9,[x1,#8*2]
2120
2121
.Lmul4x_done:
2122
ldp x19,x20,[x29,#16]
2123
mov sp,x29
2124
ldp x21,x22,[x29,#32]
2125
mov x0,#1
2126
ldp x23,x24,[x29,#48]
2127
ldp x25,x26,[x29,#64]
2128
ldp x27,x28,[x29,#80]
2129
ldr x29,[sp],#128
2130
// x30 loaded earlier
2131
AARCH64_VALIDATE_LINK_REGISTER
2132
ret
2133
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.section .rodata
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4