GitHub Repository: freebsd/freebsd-src
Path: sys/crypto/openssl/amd64/x86_64-mont5.S
1
/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
2
.text
.globl bn_mul_mont_gather5
7
.type bn_mul_mont_gather5,@function
8
.align 64
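
/*
 * bn_mul_mont_gather5: Montgomery multiplication in which the b operand
 * is not passed directly but gathered in constant time from a 32-entry
 * table at a secret index (the "5" is the 5-bit window width). The
 * seventh argument, that index, is fetched from the stack below with
 * movd 8(%rsp),%xmm5. Word counts divisible by 8 are routed to the
 * unrolled 4x and MULX/ADX paths.
 */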
9
bn_mul_mont_gather5:
10
.cfi_startproc
11
movl %r9d,%r9d
12
movq %rsp,%rax
13
.cfi_def_cfa_register %rax
14
testl $7,%r9d
15
jnz .Lmul_enter
16
movl OPENSSL_ia32cap_P+8(%rip),%r11d
17
jmp .Lmul4x_enter
18
19
.align 16
20
.Lmul_enter:
21
movd 8(%rsp),%xmm5
22
pushq %rbx
23
.cfi_offset %rbx,-16
24
pushq %rbp
25
.cfi_offset %rbp,-24
26
pushq %r12
27
.cfi_offset %r12,-32
28
pushq %r13
29
.cfi_offset %r13,-40
30
pushq %r14
31
.cfi_offset %r14,-48
32
pushq %r15
33
.cfi_offset %r15,-56
34
35
negq %r9
36
movq %rsp,%r11
37
leaq -280(%rsp,%r9,8),%r10
38
negq %r9
39
andq $-1024,%r10
subq %r10,%r11
50
andq $-4096,%r11
51
leaq (%r10,%r11,1),%rsp
52
movq (%rsp),%r11
53
cmpq %r10,%rsp
54
ja .Lmul_page_walk
55
jmp .Lmul_page_walk_done
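
/*
 * Stack "page walk": the frame reserved above may span several pages,
 * so %rsp is lowered one 4096-byte page at a time with a load from each
 * page, making sure the OS guard page is always touched in order.
 */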
56
57
.Lmul_page_walk:
58
leaq -4096(%rsp),%rsp
59
movq (%rsp),%r11
60
cmpq %r10,%rsp
61
ja .Lmul_page_walk
62
.Lmul_page_walk_done:
63
64
leaq .Linc(%rip),%r10
65
movq %rax,8(%rsp,%r9,8)
66
.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
67
.Lmul_body:
68
69
leaq 128(%rdx),%r12
70
movdqa 0(%r10),%xmm0
71
movdqa 16(%r10),%xmm1
72
leaq 24-112(%rsp,%r9,8),%r10
73
andq $-16,%r10
74
75
pshufd $0,%xmm5,%xmm5
76
movdqa %xmm1,%xmm4
77
movdqa %xmm1,%xmm2
78
paddd %xmm0,%xmm1
79
pcmpeqd %xmm5,%xmm0
80
.byte 0x67
81
movdqa %xmm4,%xmm3
82
paddd %xmm1,%xmm2
83
pcmpeqd %xmm5,%xmm1
84
movdqa %xmm0,112(%r10)
85
movdqa %xmm4,%xmm0
86
87
paddd %xmm2,%xmm3
88
pcmpeqd %xmm5,%xmm2
89
movdqa %xmm1,128(%r10)
90
movdqa %xmm4,%xmm1
91
92
paddd %xmm3,%xmm0
93
pcmpeqd %xmm5,%xmm3
94
movdqa %xmm2,144(%r10)
95
movdqa %xmm4,%xmm2
96
97
paddd %xmm0,%xmm1
98
pcmpeqd %xmm5,%xmm0
99
movdqa %xmm3,160(%r10)
100
movdqa %xmm4,%xmm3
101
paddd %xmm1,%xmm2
102
pcmpeqd %xmm5,%xmm1
103
movdqa %xmm0,176(%r10)
104
movdqa %xmm4,%xmm0
105
106
paddd %xmm2,%xmm3
107
pcmpeqd %xmm5,%xmm2
108
movdqa %xmm1,192(%r10)
109
movdqa %xmm4,%xmm1
110
111
paddd %xmm3,%xmm0
112
pcmpeqd %xmm5,%xmm3
113
movdqa %xmm2,208(%r10)
114
movdqa %xmm4,%xmm2
115
116
paddd %xmm0,%xmm1
117
pcmpeqd %xmm5,%xmm0
118
movdqa %xmm3,224(%r10)
119
movdqa %xmm4,%xmm3
120
paddd %xmm1,%xmm2
121
pcmpeqd %xmm5,%xmm1
122
movdqa %xmm0,240(%r10)
123
movdqa %xmm4,%xmm0
124
125
paddd %xmm2,%xmm3
126
pcmpeqd %xmm5,%xmm2
127
movdqa %xmm1,256(%r10)
128
movdqa %xmm4,%xmm1
129
130
paddd %xmm3,%xmm0
131
pcmpeqd %xmm5,%xmm3
132
movdqa %xmm2,272(%r10)
133
movdqa %xmm4,%xmm2
134
135
paddd %xmm0,%xmm1
136
pcmpeqd %xmm5,%xmm0
137
movdqa %xmm3,288(%r10)
138
movdqa %xmm4,%xmm3
139
paddd %xmm1,%xmm2
140
pcmpeqd %xmm5,%xmm1
141
movdqa %xmm0,304(%r10)
142
143
paddd %xmm2,%xmm3
144
.byte 0x67
145
pcmpeqd %xmm5,%xmm2
146
movdqa %xmm1,320(%r10)
147
148
pcmpeqd %xmm5,%xmm3
149
movdqa %xmm2,336(%r10)
150
pand 64(%r12),%xmm0
151
152
pand 80(%r12),%xmm1
153
pand 96(%r12),%xmm2
154
movdqa %xmm3,352(%r10)
155
pand 112(%r12),%xmm3
156
por %xmm2,%xmm0
157
por %xmm3,%xmm1
158
movdqa -128(%r12),%xmm4
159
movdqa -112(%r12),%xmm5
160
movdqa -96(%r12),%xmm2
161
pand 112(%r10),%xmm4
162
movdqa -80(%r12),%xmm3
163
pand 128(%r10),%xmm5
164
por %xmm4,%xmm0
165
pand 144(%r10),%xmm2
166
por %xmm5,%xmm1
167
pand 160(%r10),%xmm3
168
por %xmm2,%xmm0
169
por %xmm3,%xmm1
170
movdqa -64(%r12),%xmm4
171
movdqa -48(%r12),%xmm5
172
movdqa -32(%r12),%xmm2
173
pand 176(%r10),%xmm4
174
movdqa -16(%r12),%xmm3
175
pand 192(%r10),%xmm5
176
por %xmm4,%xmm0
177
pand 208(%r10),%xmm2
178
por %xmm5,%xmm1
179
pand 224(%r10),%xmm3
180
por %xmm2,%xmm0
181
por %xmm3,%xmm1
182
movdqa 0(%r12),%xmm4
183
movdqa 16(%r12),%xmm5
184
movdqa 32(%r12),%xmm2
185
pand 240(%r10),%xmm4
186
movdqa 48(%r12),%xmm3
187
pand 256(%r10),%xmm5
188
por %xmm4,%xmm0
189
pand 272(%r10),%xmm2
190
por %xmm5,%xmm1
191
pand 288(%r10),%xmm3
192
por %xmm2,%xmm0
193
por %xmm3,%xmm1
194
por %xmm1,%xmm0
195
pshufd $0x4e,%xmm0,%xmm1
196
por %xmm1,%xmm0
197
leaq 256(%r12),%r12
198
.byte 102,72,15,126,195
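
The block above is the constant-time gather: .Linc and paddd generate the
indices 0..31, pcmpeqd compares each against the secret index broadcast in
%xmm5 to build all-ones/all-zeros masks, and pand/por fold the one matching
table entry into %xmm0 (the .byte 102,72,15,126,195 encodes movq %xmm0,%rbx,
moving the gathered word to %rbx). Every entry is read regardless of the
index, so the memory access pattern leaks nothing. A minimal scalar C sketch
of the same idea; the names and the 32x8 table shape are illustrative, not
OpenSSL's layout:

#include <stdint.h>
#include <string.h>

/* Read table[secret_idx] without an index-dependent access pattern. */
static void gather_sketch(uint64_t out[8], const uint64_t table[32][8],
                          uint32_t secret_idx)
{
    uint64_t acc[8] = {0};
    for (uint32_t i = 0; i < 32; i++) {
        /* all-ones when i == secret_idx, else all-zeros (pcmpeqd) */
        uint64_t mask = (uint64_t)0 - (uint64_t)(i == secret_idx);
        for (int j = 0; j < 8; j++)
            acc[j] |= table[i][j] & mask;       /* pand + por */
    }
    memcpy(out, acc, sizeof(acc));
}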
199
200
movq (%r8),%r8
201
movq (%rsi),%rax
202
203
xorq %r14,%r14
204
xorq %r15,%r15
205
206
movq %r8,%rbp
207
mulq %rbx
208
movq %rax,%r10
209
movq (%rcx),%rax
210
211
imulq %r10,%rbp
212
movq %rdx,%r11
213
214
mulq %rbp
215
addq %rax,%r10
216
movq 8(%rsi),%rax
217
adcq $0,%rdx
218
movq %rdx,%r13
219
220
leaq 1(%r15),%r15
221
jmp .L1st_enter
222
223
.align 16
224
.L1st:
225
addq %rax,%r13
226
movq (%rsi,%r15,8),%rax
227
adcq $0,%rdx
228
addq %r11,%r13
229
movq %r10,%r11
230
adcq $0,%rdx
231
movq %r13,-16(%rsp,%r15,8)
232
movq %rdx,%r13
233
234
.L1st_enter:
235
mulq %rbx
236
addq %rax,%r11
237
movq (%rcx,%r15,8),%rax
238
adcq $0,%rdx
239
leaq 1(%r15),%r15
240
movq %rdx,%r10
241
242
mulq %rbp
243
cmpq %r9,%r15
244
jne .L1st
addq %rax,%r13
248
adcq $0,%rdx
249
addq %r11,%r13
250
adcq $0,%rdx
251
movq %r13,-16(%rsp,%r9,8)
252
movq %rdx,%r13
253
movq %r10,%r11
254
255
xorq %rdx,%rdx
256
addq %r11,%r13
257
adcq $0,%rdx
258
movq %r13,-8(%rsp,%r9,8)
259
movq %rdx,(%rsp,%r9,8)
260
261
leaq 1(%r14),%r14
262
jmp .Louter
263
.align 16
264
.Louter:
265
leaq 24+128(%rsp,%r9,8),%rdx
266
andq $-16,%rdx
267
pxor %xmm4,%xmm4
268
pxor %xmm5,%xmm5
269
movdqa -128(%r12),%xmm0
270
movdqa -112(%r12),%xmm1
271
movdqa -96(%r12),%xmm2
272
movdqa -80(%r12),%xmm3
273
pand -128(%rdx),%xmm0
274
pand -112(%rdx),%xmm1
275
por %xmm0,%xmm4
276
pand -96(%rdx),%xmm2
277
por %xmm1,%xmm5
278
pand -80(%rdx),%xmm3
279
por %xmm2,%xmm4
280
por %xmm3,%xmm5
281
movdqa -64(%r12),%xmm0
282
movdqa -48(%r12),%xmm1
283
movdqa -32(%r12),%xmm2
284
movdqa -16(%r12),%xmm3
285
pand -64(%rdx),%xmm0
286
pand -48(%rdx),%xmm1
287
por %xmm0,%xmm4
288
pand -32(%rdx),%xmm2
289
por %xmm1,%xmm5
290
pand -16(%rdx),%xmm3
291
por %xmm2,%xmm4
292
por %xmm3,%xmm5
293
movdqa 0(%r12),%xmm0
294
movdqa 16(%r12),%xmm1
295
movdqa 32(%r12),%xmm2
296
movdqa 48(%r12),%xmm3
297
pand 0(%rdx),%xmm0
298
pand 16(%rdx),%xmm1
299
por %xmm0,%xmm4
300
pand 32(%rdx),%xmm2
301
por %xmm1,%xmm5
302
pand 48(%rdx),%xmm3
303
por %xmm2,%xmm4
304
por %xmm3,%xmm5
305
movdqa 64(%r12),%xmm0
306
movdqa 80(%r12),%xmm1
307
movdqa 96(%r12),%xmm2
308
movdqa 112(%r12),%xmm3
309
pand 64(%rdx),%xmm0
310
pand 80(%rdx),%xmm1
311
por %xmm0,%xmm4
312
pand 96(%rdx),%xmm2
313
por %xmm1,%xmm5
314
pand 112(%rdx),%xmm3
315
por %xmm2,%xmm4
316
por %xmm3,%xmm5
317
por %xmm5,%xmm4
318
pshufd $0x4e,%xmm4,%xmm0
319
por %xmm4,%xmm0
320
leaq 256(%r12),%r12
321
322
movq (%rsi),%rax
323
.byte 102,72,15,126,195
324
325
xorq %r15,%r15
326
movq %r8,%rbp
327
movq (%rsp),%r10
328
329
mulq %rbx
330
addq %rax,%r10
331
movq (%rcx),%rax
332
adcq $0,%rdx
333
334
imulq %r10,%rbp
335
movq %rdx,%r11
336
337
mulq %rbp
338
addq %rax,%r10
339
movq 8(%rsi),%rax
340
adcq $0,%rdx
341
movq 8(%rsp),%r10
342
movq %rdx,%r13
343
344
leaq 1(%r15),%r15
345
jmp .Linner_enter
346
347
.align 16
348
.Linner:
349
addq %rax,%r13
350
movq (%rsi,%r15,8),%rax
351
adcq $0,%rdx
352
addq %r10,%r13
353
movq (%rsp,%r15,8),%r10
354
adcq $0,%rdx
355
movq %r13,-16(%rsp,%r15,8)
356
movq %rdx,%r13
357
358
.Linner_enter:
359
mulq %rbx
360
addq %rax,%r11
361
movq (%rcx,%r15,8),%rax
362
adcq $0,%rdx
363
addq %r11,%r10
364
movq %rdx,%r11
365
adcq $0,%r11
366
leaq 1(%r15),%r15
367
368
mulq %rbp
369
cmpq %r9,%r15
370
jne .Linner
371
372
addq %rax,%r13
373
adcq $0,%rdx
374
addq %r10,%r13
375
movq (%rsp,%r9,8),%r10
376
adcq $0,%rdx
377
movq %r13,-16(%rsp,%r9,8)
378
movq %rdx,%r13
379
380
xorq %rdx,%rdx
381
addq %r11,%r13
382
adcq $0,%rdx
383
addq %r10,%r13
384
adcq $0,%rdx
385
movq %r13,-8(%rsp,%r9,8)
386
movq %rdx,(%rsp,%r9,8)
387
388
leaq 1(%r14),%r14
389
cmpq %r9,%r14
390
jb .Louter
391
392
xorq %r14,%r14
393
movq (%rsp),%rax
394
leaq (%rsp),%rsi
395
movq %r9,%r15
396
jmp .Lsub
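
/*
 * Branch-free final reduction: .Lsub computes t - n with borrow
 * propagation, then .Lcopy turns the final borrow into an all-ones or
 * all-zeros mask and selects either t or t - n word by word (wiping the
 * stack temporary as it goes), so no secret-dependent branch is taken.
 */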
397
.align 16
398
.Lsub: sbbq (%rcx,%r14,8),%rax
399
movq %rax,(%rdi,%r14,8)
400
movq 8(%rsi,%r14,8),%rax
401
leaq 1(%r14),%r14
402
decq %r15
403
jnz .Lsub
404
405
sbbq $0,%rax
406
movq $-1,%rbx
407
xorq %rax,%rbx
408
xorq %r14,%r14
409
movq %r9,%r15
410
411
.Lcopy:
412
movq (%rdi,%r14,8),%rcx
413
movq (%rsp,%r14,8),%rdx
414
andq %rbx,%rcx
415
andq %rax,%rdx
416
movq %r14,(%rsp,%r14,8)
417
orq %rcx,%rdx
418
movq %rdx,(%rdi,%r14,8)
419
leaq 1(%r14),%r14
420
subq $1,%r15
421
jnz .Lcopy
422
423
movq 8(%rsp,%r9,8),%rsi
424
.cfi_def_cfa %rsi,8
425
movq $1,%rax
426
427
movq -48(%rsi),%r15
428
.cfi_restore %r15
429
movq -40(%rsi),%r14
430
.cfi_restore %r14
431
movq -32(%rsi),%r13
432
.cfi_restore %r13
433
movq -24(%rsi),%r12
434
.cfi_restore %r12
435
movq -16(%rsi),%rbp
436
.cfi_restore %rbp
437
movq -8(%rsi),%rbx
438
.cfi_restore %rbx
439
leaq (%rsi),%rsp
440
.cfi_def_cfa_register %rsp
441
.Lmul_epilogue:
442
.byte 0xf3,0xc3
443
.cfi_endproc
444
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
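
For reference, the scalar path above is a word-serial ("CIOS") Montgomery
multiplication: .L1st forms ap * bp[0] interleaved with the first reduction
step, .Louter folds in each further gathered word of b, and .Lsub/.Lcopy do
the branch-free final subtraction. A compact C sketch of the same algorithm
under the usual assumptions (R = 2^(64*num), n0 = -n[0]^-1 mod 2^64); the
names are illustrative and it favors clarity over the asm's scheduling:

#include <stdint.h>
#include <stddef.h>

/* rp = ap * bp * R^-1 mod np (all num little-endian 64-bit words). */
static void mont_mul_sketch(uint64_t *rp, const uint64_t *ap,
                            const uint64_t *bp, const uint64_t *np,
                            uint64_t n0, size_t num)
{
    uint64_t t[num + 2];                 /* C99 VLA; the asm uses the stack */
    for (size_t i = 0; i < num + 2; i++) t[i] = 0;

    for (size_t i = 0; i < num; i++) {   /* .Louter: one word of b per pass */
        unsigned __int128 c = 0;
        for (size_t j = 0; j < num; j++) {       /* t += ap * bp[i] */
            c += (unsigned __int128)ap[j] * bp[i] + t[j];
            t[j] = (uint64_t)c;
            c >>= 64;
        }
        c += t[num];
        t[num] = (uint64_t)c;
        t[num + 1] = (uint64_t)(c >> 64);

        uint64_t m = t[0] * n0;          /* imulq %r10,%rbp */
        c = (unsigned __int128)np[0] * m + t[0];
        c >>= 64;                        /* t is now divisible by 2^64 ... */
        for (size_t j = 1; j < num; j++) {       /* ... so shift down a word */
            c += (unsigned __int128)np[j] * m + t[j];
            t[j - 1] = (uint64_t)c;
            c >>= 64;
        }
        c += t[num];
        t[num - 1] = (uint64_t)c;
        t[num] = t[num + 1] + (uint64_t)(c >> 64);
    }

    /* branch-free conditional subtraction, as in .Lsub/.Lcopy */
    uint64_t bw = 0;
    for (size_t j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)t[j] - np[j] - bw;
        rp[j] = (uint64_t)d;
        bw = (uint64_t)(d >> 64) & 1;    /* 1 on borrow */
    }
    uint64_t keep_t = (uint64_t)0 - (uint64_t)(bw > t[num]); /* t < n */
    for (size_t j = 0; j < num; j++)
        rp[j] = (rp[j] & ~keep_t) | (t[j] & keep_t);
}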
445
.type bn_mul4x_mont_gather5,@function
446
.align 32
447
bn_mul4x_mont_gather5:
448
.cfi_startproc
449
.byte 0x67
450
movq %rsp,%rax
451
.cfi_def_cfa_register %rax
452
.Lmul4x_enter:
453
andl $0x80108,%r11d
454
cmpl $0x80108,%r11d
455
je .Lmulx4x_enter
456
pushq %rbx
457
.cfi_offset %rbx,-16
458
pushq %rbp
459
.cfi_offset %rbp,-24
460
pushq %r12
461
.cfi_offset %r12,-32
462
pushq %r13
463
.cfi_offset %r13,-40
464
pushq %r14
465
.cfi_offset %r14,-48
466
pushq %r15
467
.cfi_offset %r15,-56
468
.Lmul4x_prologue:
469
470
.byte 0x67
471
shll $3,%r9d
472
leaq (%r9,%r9,2),%r10
473
negq %r9
leaq -320(%rsp,%r9,2),%r11
485
movq %rsp,%rbp
486
subq %rdi,%r11
487
andq $4095,%r11
488
cmpq %r11,%r10
489
jb .Lmul4xsp_alt
490
subq %r11,%rbp
491
leaq -320(%rbp,%r9,2),%rbp
492
jmp .Lmul4xsp_done
493
494
.align 32
495
.Lmul4xsp_alt:
496
leaq 4096-320(,%r9,2),%r10
497
leaq -320(%rbp,%r9,2),%rbp
498
subq %r10,%r11
499
movq $0,%r10
500
cmovcq %r10,%r11
501
subq %r11,%rbp
502
.Lmul4xsp_done:
503
andq $-64,%rbp
504
movq %rsp,%r11
505
subq %rbp,%r11
506
andq $-4096,%r11
507
leaq (%r11,%rbp,1),%rsp
508
movq (%rsp),%r10
509
cmpq %rbp,%rsp
510
ja .Lmul4x_page_walk
511
jmp .Lmul4x_page_walk_done
512
513
.Lmul4x_page_walk:
514
leaq -4096(%rsp),%rsp
515
movq (%rsp),%r10
516
cmpq %rbp,%rsp
517
ja .Lmul4x_page_walk
518
.Lmul4x_page_walk_done:
519
520
negq %r9
521
522
movq %rax,40(%rsp)
523
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
524
.Lmul4x_body:
525
526
call mul4x_internal
527
528
movq 40(%rsp),%rsi
529
.cfi_def_cfa %rsi,8
530
movq $1,%rax
531
532
movq -48(%rsi),%r15
533
.cfi_restore %r15
534
movq -40(%rsi),%r14
535
.cfi_restore %r14
536
movq -32(%rsi),%r13
537
.cfi_restore %r13
538
movq -24(%rsi),%r12
539
.cfi_restore %r12
540
movq -16(%rsi),%rbp
541
.cfi_restore %rbp
542
movq -8(%rsi),%rbx
543
.cfi_restore %rbx
544
leaq (%rsi),%rsp
545
.cfi_def_cfa_register %rsp
546
.Lmul4x_epilogue:
547
.byte 0xf3,0xc3
548
.cfi_endproc
549
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
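
bn_mul_mont_gather5 and bn_mul4x_mont_gather5 together dispatch between
three kernels: the scalar loop when num is not a multiple of 8, this
4x-unrolled version otherwise, and the MULX/ADX version
(bn_mulx4x_mont_gather5 below) when OPENSSL_ia32cap_P advertises the BMI1,
BMI2 and ADX feature bits, which is what the $0x80108 mask tests. A sketch
of that selection in C; the function type and names are illustrative:

#include <stdint.h>

typedef void mont_kernel(uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                         const uint64_t *np, const uint64_t *n0, int num);

static mont_kernel *pick_kernel(uint32_t cap, int num, mont_kernel *scalar,
                                mont_kernel *mul4x, mont_kernel *mulx4x)
{
    if (num & 7)                         /* testl $7,%r9d; jnz .Lmul_enter */
        return scalar;
    if ((cap & 0x80108) == 0x80108)      /* andl/cmpl $0x80108 -> mulx path */
        return mulx4x;
    return mul4x;
}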
550
551
.type mul4x_internal,@function
552
.align 32
553
mul4x_internal:
554
.cfi_startproc
555
shlq $5,%r9
556
movd 8(%rax),%xmm5
557
leaq .Linc(%rip),%rax
558
leaq 128(%rdx,%r9,1),%r13
559
shrq $5,%r9
560
movdqa 0(%rax),%xmm0
561
movdqa 16(%rax),%xmm1
562
leaq 88-112(%rsp,%r9,1),%r10
563
leaq 128(%rdx),%r12
564
565
pshufd $0,%xmm5,%xmm5
566
movdqa %xmm1,%xmm4
567
.byte 0x67,0x67
568
movdqa %xmm1,%xmm2
569
paddd %xmm0,%xmm1
570
pcmpeqd %xmm5,%xmm0
571
.byte 0x67
572
movdqa %xmm4,%xmm3
573
paddd %xmm1,%xmm2
574
pcmpeqd %xmm5,%xmm1
575
movdqa %xmm0,112(%r10)
576
movdqa %xmm4,%xmm0
577
578
paddd %xmm2,%xmm3
579
pcmpeqd %xmm5,%xmm2
580
movdqa %xmm1,128(%r10)
581
movdqa %xmm4,%xmm1
582
583
paddd %xmm3,%xmm0
584
pcmpeqd %xmm5,%xmm3
585
movdqa %xmm2,144(%r10)
586
movdqa %xmm4,%xmm2
587
588
paddd %xmm0,%xmm1
589
pcmpeqd %xmm5,%xmm0
590
movdqa %xmm3,160(%r10)
591
movdqa %xmm4,%xmm3
592
paddd %xmm1,%xmm2
593
pcmpeqd %xmm5,%xmm1
594
movdqa %xmm0,176(%r10)
595
movdqa %xmm4,%xmm0
596
597
paddd %xmm2,%xmm3
598
pcmpeqd %xmm5,%xmm2
599
movdqa %xmm1,192(%r10)
600
movdqa %xmm4,%xmm1
601
602
paddd %xmm3,%xmm0
603
pcmpeqd %xmm5,%xmm3
604
movdqa %xmm2,208(%r10)
605
movdqa %xmm4,%xmm2
606
607
paddd %xmm0,%xmm1
608
pcmpeqd %xmm5,%xmm0
609
movdqa %xmm3,224(%r10)
610
movdqa %xmm4,%xmm3
611
paddd %xmm1,%xmm2
612
pcmpeqd %xmm5,%xmm1
613
movdqa %xmm0,240(%r10)
614
movdqa %xmm4,%xmm0
615
616
paddd %xmm2,%xmm3
617
pcmpeqd %xmm5,%xmm2
618
movdqa %xmm1,256(%r10)
619
movdqa %xmm4,%xmm1
620
621
paddd %xmm3,%xmm0
622
pcmpeqd %xmm5,%xmm3
623
movdqa %xmm2,272(%r10)
624
movdqa %xmm4,%xmm2
625
626
paddd %xmm0,%xmm1
627
pcmpeqd %xmm5,%xmm0
628
movdqa %xmm3,288(%r10)
629
movdqa %xmm4,%xmm3
630
paddd %xmm1,%xmm2
631
pcmpeqd %xmm5,%xmm1
632
movdqa %xmm0,304(%r10)
633
634
paddd %xmm2,%xmm3
635
.byte 0x67
636
pcmpeqd %xmm5,%xmm2
637
movdqa %xmm1,320(%r10)
638
639
pcmpeqd %xmm5,%xmm3
640
movdqa %xmm2,336(%r10)
641
pand 64(%r12),%xmm0
642
643
pand 80(%r12),%xmm1
644
pand 96(%r12),%xmm2
645
movdqa %xmm3,352(%r10)
646
pand 112(%r12),%xmm3
647
por %xmm2,%xmm0
648
por %xmm3,%xmm1
649
movdqa -128(%r12),%xmm4
650
movdqa -112(%r12),%xmm5
651
movdqa -96(%r12),%xmm2
652
pand 112(%r10),%xmm4
653
movdqa -80(%r12),%xmm3
654
pand 128(%r10),%xmm5
655
por %xmm4,%xmm0
656
pand 144(%r10),%xmm2
657
por %xmm5,%xmm1
658
pand 160(%r10),%xmm3
659
por %xmm2,%xmm0
660
por %xmm3,%xmm1
661
movdqa -64(%r12),%xmm4
662
movdqa -48(%r12),%xmm5
663
movdqa -32(%r12),%xmm2
664
pand 176(%r10),%xmm4
665
movdqa -16(%r12),%xmm3
666
pand 192(%r10),%xmm5
667
por %xmm4,%xmm0
668
pand 208(%r10),%xmm2
669
por %xmm5,%xmm1
670
pand 224(%r10),%xmm3
671
por %xmm2,%xmm0
672
por %xmm3,%xmm1
673
movdqa 0(%r12),%xmm4
674
movdqa 16(%r12),%xmm5
675
movdqa 32(%r12),%xmm2
676
pand 240(%r10),%xmm4
677
movdqa 48(%r12),%xmm3
678
pand 256(%r10),%xmm5
679
por %xmm4,%xmm0
680
pand 272(%r10),%xmm2
681
por %xmm5,%xmm1
682
pand 288(%r10),%xmm3
683
por %xmm2,%xmm0
684
por %xmm3,%xmm1
685
por %xmm1,%xmm0
686
pshufd $0x4e,%xmm0,%xmm1
687
por %xmm1,%xmm0
688
leaq 256(%r12),%r12
689
.byte 102,72,15,126,195
690
691
movq %r13,16+8(%rsp)
692
movq %rdi,56+8(%rsp)
693
694
movq (%r8),%r8
695
movq (%rsi),%rax
696
leaq (%rsi,%r9,1),%rsi
697
negq %r9
698
699
movq %r8,%rbp
700
mulq %rbx
701
movq %rax,%r10
702
movq (%rcx),%rax
703
704
imulq %r10,%rbp
705
leaq 64+8(%rsp),%r14
706
movq %rdx,%r11
707
708
mulq %rbp
709
addq %rax,%r10
710
movq 8(%rsi,%r9,1),%rax
711
adcq $0,%rdx
712
movq %rdx,%rdi
713
714
mulq %rbx
715
addq %rax,%r11
716
movq 8(%rcx),%rax
717
adcq $0,%rdx
718
movq %rdx,%r10
719
720
mulq %rbp
721
addq %rax,%rdi
722
movq 16(%rsi,%r9,1),%rax
723
adcq $0,%rdx
724
addq %r11,%rdi
725
leaq 32(%r9),%r15
726
leaq 32(%rcx),%rcx
727
adcq $0,%rdx
728
movq %rdi,(%r14)
729
movq %rdx,%r13
730
jmp .L1st4x
731
732
.align 32
733
.L1st4x:
734
mulq %rbx
735
addq %rax,%r10
736
movq -16(%rcx),%rax
737
leaq 32(%r14),%r14
738
adcq $0,%rdx
739
movq %rdx,%r11
740
741
mulq %rbp
742
addq %rax,%r13
743
movq -8(%rsi,%r15,1),%rax
744
adcq $0,%rdx
745
addq %r10,%r13
746
adcq $0,%rdx
747
movq %r13,-24(%r14)
748
movq %rdx,%rdi
749
750
mulq %rbx
751
addq %rax,%r11
752
movq -8(%rcx),%rax
753
adcq $0,%rdx
754
movq %rdx,%r10
755
756
mulq %rbp
757
addq %rax,%rdi
758
movq (%rsi,%r15,1),%rax
759
adcq $0,%rdx
760
addq %r11,%rdi
761
adcq $0,%rdx
762
movq %rdi,-16(%r14)
763
movq %rdx,%r13
764
765
mulq %rbx
766
addq %rax,%r10
767
movq 0(%rcx),%rax
768
adcq $0,%rdx
769
movq %rdx,%r11
770
771
mulq %rbp
772
addq %rax,%r13
773
movq 8(%rsi,%r15,1),%rax
774
adcq $0,%rdx
775
addq %r10,%r13
776
adcq $0,%rdx
777
movq %r13,-8(%r14)
778
movq %rdx,%rdi
779
780
mulq %rbx
781
addq %rax,%r11
782
movq 8(%rcx),%rax
783
adcq $0,%rdx
784
movq %rdx,%r10
785
786
mulq %rbp
787
addq %rax,%rdi
788
movq 16(%rsi,%r15,1),%rax
789
adcq $0,%rdx
790
addq %r11,%rdi
791
leaq 32(%rcx),%rcx
792
adcq $0,%rdx
793
movq %rdi,(%r14)
794
movq %rdx,%r13
795
796
addq $32,%r15
797
jnz .L1st4x
798
799
mulq %rbx
800
addq %rax,%r10
801
movq -16(%rcx),%rax
802
leaq 32(%r14),%r14
803
adcq $0,%rdx
804
movq %rdx,%r11
805
806
mulq %rbp
807
addq %rax,%r13
808
movq -8(%rsi),%rax
809
adcq $0,%rdx
810
addq %r10,%r13
811
adcq $0,%rdx
812
movq %r13,-24(%r14)
813
movq %rdx,%rdi
814
815
mulq %rbx
816
addq %rax,%r11
817
movq -8(%rcx),%rax
818
adcq $0,%rdx
819
movq %rdx,%r10
820
821
mulq %rbp
822
addq %rax,%rdi
823
movq (%rsi,%r9,1),%rax
824
adcq $0,%rdx
825
addq %r11,%rdi
826
adcq $0,%rdx
827
movq %rdi,-16(%r14)
828
movq %rdx,%r13
829
830
leaq (%rcx,%r9,1),%rcx
831
832
xorq %rdi,%rdi
833
addq %r10,%r13
834
adcq $0,%rdi
835
movq %r13,-8(%r14)
836
837
jmp .Louter4x
838
839
.align 32
840
.Louter4x:
841
leaq 16+128(%r14),%rdx
842
pxor %xmm4,%xmm4
843
pxor %xmm5,%xmm5
844
movdqa -128(%r12),%xmm0
845
movdqa -112(%r12),%xmm1
846
movdqa -96(%r12),%xmm2
847
movdqa -80(%r12),%xmm3
848
pand -128(%rdx),%xmm0
849
pand -112(%rdx),%xmm1
850
por %xmm0,%xmm4
851
pand -96(%rdx),%xmm2
852
por %xmm1,%xmm5
853
pand -80(%rdx),%xmm3
854
por %xmm2,%xmm4
855
por %xmm3,%xmm5
856
movdqa -64(%r12),%xmm0
857
movdqa -48(%r12),%xmm1
858
movdqa -32(%r12),%xmm2
859
movdqa -16(%r12),%xmm3
860
pand -64(%rdx),%xmm0
861
pand -48(%rdx),%xmm1
862
por %xmm0,%xmm4
863
pand -32(%rdx),%xmm2
864
por %xmm1,%xmm5
865
pand -16(%rdx),%xmm3
866
por %xmm2,%xmm4
867
por %xmm3,%xmm5
868
movdqa 0(%r12),%xmm0
869
movdqa 16(%r12),%xmm1
870
movdqa 32(%r12),%xmm2
871
movdqa 48(%r12),%xmm3
872
pand 0(%rdx),%xmm0
873
pand 16(%rdx),%xmm1
874
por %xmm0,%xmm4
875
pand 32(%rdx),%xmm2
876
por %xmm1,%xmm5
877
pand 48(%rdx),%xmm3
878
por %xmm2,%xmm4
879
por %xmm3,%xmm5
880
movdqa 64(%r12),%xmm0
881
movdqa 80(%r12),%xmm1
882
movdqa 96(%r12),%xmm2
883
movdqa 112(%r12),%xmm3
884
pand 64(%rdx),%xmm0
885
pand 80(%rdx),%xmm1
886
por %xmm0,%xmm4
887
pand 96(%rdx),%xmm2
888
por %xmm1,%xmm5
889
pand 112(%rdx),%xmm3
890
por %xmm2,%xmm4
891
por %xmm3,%xmm5
892
por %xmm5,%xmm4
893
pshufd $0x4e,%xmm4,%xmm0
894
por %xmm4,%xmm0
895
leaq 256(%r12),%r12
896
.byte 102,72,15,126,195
897
898
movq (%r14,%r9,1),%r10
899
movq %r8,%rbp
900
mulq %rbx
901
addq %rax,%r10
902
movq (%rcx),%rax
903
adcq $0,%rdx
904
905
imulq %r10,%rbp
906
movq %rdx,%r11
907
movq %rdi,(%r14)
908
909
leaq (%r14,%r9,1),%r14
910
911
mulq %rbp
912
addq %rax,%r10
913
movq 8(%rsi,%r9,1),%rax
914
adcq $0,%rdx
915
movq %rdx,%rdi
916
917
mulq %rbx
918
addq %rax,%r11
919
movq 8(%rcx),%rax
920
adcq $0,%rdx
921
addq 8(%r14),%r11
922
adcq $0,%rdx
923
movq %rdx,%r10
924
925
mulq %rbp
926
addq %rax,%rdi
927
movq 16(%rsi,%r9,1),%rax
928
adcq $0,%rdx
929
addq %r11,%rdi
930
leaq 32(%r9),%r15
931
leaq 32(%rcx),%rcx
932
adcq $0,%rdx
933
movq %rdx,%r13
934
jmp .Linner4x
935
936
.align 32
937
.Linner4x:
938
mulq %rbx
939
addq %rax,%r10
940
movq -16(%rcx),%rax
941
adcq $0,%rdx
942
addq 16(%r14),%r10
943
leaq 32(%r14),%r14
944
adcq $0,%rdx
945
movq %rdx,%r11
946
947
mulq %rbp
948
addq %rax,%r13
949
movq -8(%rsi,%r15,1),%rax
950
adcq $0,%rdx
951
addq %r10,%r13
952
adcq $0,%rdx
953
movq %rdi,-32(%r14)
954
movq %rdx,%rdi
955
956
mulq %rbx
957
addq %rax,%r11
958
movq -8(%rcx),%rax
959
adcq $0,%rdx
960
addq -8(%r14),%r11
961
adcq $0,%rdx
962
movq %rdx,%r10
963
964
mulq %rbp
965
addq %rax,%rdi
966
movq (%rsi,%r15,1),%rax
967
adcq $0,%rdx
968
addq %r11,%rdi
969
adcq $0,%rdx
970
movq %r13,-24(%r14)
971
movq %rdx,%r13
972
973
mulq %rbx
974
addq %rax,%r10
975
movq 0(%rcx),%rax
976
adcq $0,%rdx
977
addq (%r14),%r10
978
adcq $0,%rdx
979
movq %rdx,%r11
980
981
mulq %rbp
982
addq %rax,%r13
983
movq 8(%rsi,%r15,1),%rax
984
adcq $0,%rdx
985
addq %r10,%r13
986
adcq $0,%rdx
987
movq %rdi,-16(%r14)
988
movq %rdx,%rdi
989
990
mulq %rbx
991
addq %rax,%r11
992
movq 8(%rcx),%rax
993
adcq $0,%rdx
994
addq 8(%r14),%r11
995
adcq $0,%rdx
996
movq %rdx,%r10
997
998
mulq %rbp
999
addq %rax,%rdi
1000
movq 16(%rsi,%r15,1),%rax
1001
adcq $0,%rdx
1002
addq %r11,%rdi
1003
leaq 32(%rcx),%rcx
1004
adcq $0,%rdx
1005
movq %r13,-8(%r14)
1006
movq %rdx,%r13
1007
1008
addq $32,%r15
1009
jnz .Linner4x
1010
1011
mulq %rbx
1012
addq %rax,%r10
1013
movq -16(%rcx),%rax
1014
adcq $0,%rdx
1015
addq 16(%r14),%r10
1016
leaq 32(%r14),%r14
1017
adcq $0,%rdx
1018
movq %rdx,%r11
1019
1020
mulq %rbp
1021
addq %rax,%r13
1022
movq -8(%rsi),%rax
1023
adcq $0,%rdx
1024
addq %r10,%r13
1025
adcq $0,%rdx
1026
movq %rdi,-32(%r14)
1027
movq %rdx,%rdi
1028
1029
mulq %rbx
1030
addq %rax,%r11
1031
movq %rbp,%rax
1032
movq -8(%rcx),%rbp
1033
adcq $0,%rdx
1034
addq -8(%r14),%r11
1035
adcq $0,%rdx
1036
movq %rdx,%r10
1037
1038
mulq %rbp
1039
addq %rax,%rdi
1040
movq (%rsi,%r9,1),%rax
1041
adcq $0,%rdx
1042
addq %r11,%rdi
1043
adcq $0,%rdx
1044
movq %r13,-24(%r14)
1045
movq %rdx,%r13
1046
1047
movq %rdi,-16(%r14)
1048
leaq (%rcx,%r9,1),%rcx
1049
1050
xorq %rdi,%rdi
1051
addq %r10,%r13
1052
adcq $0,%rdi
1053
addq (%r14),%r13
1054
adcq $0,%rdi
1055
movq %r13,-8(%r14)
1056
1057
cmpq 16+8(%rsp),%r12
1058
jb .Louter4x
1059
xorq %rax,%rax
1060
subq %r13,%rbp
1061
adcq %r15,%r15
1062
orq %r15,%rdi
1063
subq %rdi,%rax
1064
leaq (%r14,%r9,1),%rbx
1065
movq (%rcx),%r12
1066
leaq (%rcx),%rbp
1067
movq %r9,%rcx
1068
sarq $3+2,%rcx
1069
movq 56+8(%rsp),%rdi
1070
decq %r12
1071
xorq %r10,%r10
1072
movq 8(%rbp),%r13
1073
movq 16(%rbp),%r14
1074
movq 24(%rbp),%r15
1075
jmp .Lsqr4x_sub_entry
1076
.cfi_endproc
1077
.size mul4x_internal,.-mul4x_internal
1078
.globl bn_power5
1079
.type bn_power5,@function
1080
.align 32
1081
bn_power5:
1082
.cfi_startproc
1083
movq %rsp,%rax
1084
.cfi_def_cfa_register %rax
1085
movl OPENSSL_ia32cap_P+8(%rip),%r11d
1086
andl $0x80108,%r11d
1087
cmpl $0x80108,%r11d
1088
je .Lpowerx5_enter
1089
pushq %rbx
1090
.cfi_offset %rbx,-16
1091
pushq %rbp
1092
.cfi_offset %rbp,-24
1093
pushq %r12
1094
.cfi_offset %r12,-32
1095
pushq %r13
1096
.cfi_offset %r13,-40
1097
pushq %r14
1098
.cfi_offset %r14,-48
1099
pushq %r15
1100
.cfi_offset %r15,-56
1101
.Lpower5_prologue:
1102
1103
shll $3,%r9d
1104
leal (%r9,%r9,2),%r10d
1105
negq %r9
1106
movq (%r8),%r8
leaq -320(%rsp,%r9,2),%r11
1116
movq %rsp,%rbp
1117
subq %rdi,%r11
1118
andq $4095,%r11
1119
cmpq %r11,%r10
1120
jb .Lpwr_sp_alt
1121
subq %r11,%rbp
1122
leaq -320(%rbp,%r9,2),%rbp
1123
jmp .Lpwr_sp_done
1124
1125
.align 32
1126
.Lpwr_sp_alt:
1127
leaq 4096-320(,%r9,2),%r10
1128
leaq -320(%rbp,%r9,2),%rbp
1129
subq %r10,%r11
1130
movq $0,%r10
1131
cmovcq %r10,%r11
1132
subq %r11,%rbp
1133
.Lpwr_sp_done:
1134
andq $-64,%rbp
1135
movq %rsp,%r11
1136
subq %rbp,%r11
1137
andq $-4096,%r11
1138
leaq (%r11,%rbp,1),%rsp
1139
movq (%rsp),%r10
1140
cmpq %rbp,%rsp
1141
ja .Lpwr_page_walk
1142
jmp .Lpwr_page_walk_done
1143
1144
.Lpwr_page_walk:
1145
leaq -4096(%rsp),%rsp
1146
movq (%rsp),%r10
1147
cmpq %rbp,%rsp
1148
ja .Lpwr_page_walk
1149
.Lpwr_page_walk_done:
1150
1151
movq %r9,%r10
1152
negq %r9
movq %r8,32(%rsp)
1164
movq %rax,40(%rsp)
1165
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1166
.Lpower5_body:
1167
.byte 102,72,15,110,207
1168
.byte 102,72,15,110,209
1169
.byte 102,73,15,110,218
1170
.byte 102,72,15,110,226
1171
1172
call __bn_sqr8x_internal
1173
call __bn_post4x_internal
1174
call __bn_sqr8x_internal
1175
call __bn_post4x_internal
1176
call __bn_sqr8x_internal
1177
call __bn_post4x_internal
1178
call __bn_sqr8x_internal
1179
call __bn_post4x_internal
1180
call __bn_sqr8x_internal
1181
call __bn_post4x_internal
1182
1183
.byte 102,72,15,126,209
1184
.byte 102,72,15,126,226
1185
movq %rsi,%rdi
1186
movq 40(%rsp),%rax
1187
leaq 32(%rsp),%r8
1188
1189
call mul4x_internal
1190
1191
movq 40(%rsp),%rsi
1192
.cfi_def_cfa %rsi,8
1193
movq $1,%rax
1194
movq -48(%rsi),%r15
1195
.cfi_restore %r15
1196
movq -40(%rsi),%r14
1197
.cfi_restore %r14
1198
movq -32(%rsi),%r13
1199
.cfi_restore %r13
1200
movq -24(%rsi),%r12
1201
.cfi_restore %r12
1202
movq -16(%rsi),%rbp
1203
.cfi_restore %rbp
1204
movq -8(%rsi),%rbx
1205
.cfi_restore %rbx
1206
leaq (%rsi),%rsp
1207
.cfi_def_cfa_register %rsp
1208
.Lpower5_epilogue:
1209
.byte 0xf3,0xc3
1210
.cfi_endproc
1211
.size bn_power5,.-bn_power5
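
bn_power5 strings together five Montgomery squarings (__bn_sqr8x_internal
followed by the __bn_post4x_internal reduction) and one multiplication by a
gathered table entry: exactly one step of fixed 5-bit-window modular
exponentiation (it too diverts to a MULX/ADX variant when the capability
bits allow). A C sketch of that step, reusing the illustrative helpers from
the sketches above (mont_mul_sketch, gather_sketch); the fixed num = 8
matches the table shape there, not a requirement of the asm:

/* One window step: acc = acc^(2^5) * table[window], in Montgomery form. */
static void power5_step_sketch(uint64_t acc[8], const uint64_t table[32][8],
                               uint32_t window, const uint64_t np[8],
                               uint64_t n0)
{
    uint64_t b[8];
    for (int i = 0; i < 5; i++)          /* five squarings; the asm uses a */
        mont_mul_sketch(acc, acc, acc, np, n0, 8); /* dedicated sqr kernel */
    gather_sketch(b, table, window);     /* constant-time table read */
    mont_mul_sketch(acc, acc, b, np, n0, 8);       /* one multiply */
}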
1212
1213
.globl bn_sqr8x_internal
1214
.hidden bn_sqr8x_internal
1215
.type bn_sqr8x_internal,@function
1216
.align 32
1217
bn_sqr8x_internal:
1218
__bn_sqr8x_internal:
1219
.cfi_startproc
leaq 32(%r10),%rbp
1294
leaq (%rsi,%r9,1),%rsi
1295
1296
movq %r9,%rcx
1297
1298
1299
movq -32(%rsi,%rbp,1),%r14
1300
leaq 48+8(%rsp,%r9,2),%rdi
1301
movq -24(%rsi,%rbp,1),%rax
1302
leaq -32(%rdi,%rbp,1),%rdi
1303
movq -16(%rsi,%rbp,1),%rbx
1304
movq %rax,%r15
1305
1306
mulq %r14
1307
movq %rax,%r10
1308
movq %rbx,%rax
1309
movq %rdx,%r11
1310
movq %r10,-24(%rdi,%rbp,1)
1311
1312
mulq %r14
1313
addq %rax,%r11
1314
movq %rbx,%rax
1315
adcq $0,%rdx
1316
movq %r11,-16(%rdi,%rbp,1)
1317
movq %rdx,%r10
1318
1319
1320
movq -8(%rsi,%rbp,1),%rbx
1321
mulq %r15
1322
movq %rax,%r12
1323
movq %rbx,%rax
1324
movq %rdx,%r13
1325
1326
leaq (%rbp),%rcx
1327
mulq %r14
1328
addq %rax,%r10
1329
movq %rbx,%rax
1330
movq %rdx,%r11
1331
adcq $0,%r11
1332
addq %r12,%r10
1333
adcq $0,%r11
1334
movq %r10,-8(%rdi,%rcx,1)
1335
jmp .Lsqr4x_1st
1336
1337
.align 32
1338
.Lsqr4x_1st:
1339
movq (%rsi,%rcx,1),%rbx
1340
mulq %r15
1341
addq %rax,%r13
1342
movq %rbx,%rax
1343
movq %rdx,%r12
1344
adcq $0,%r12
1345
1346
mulq %r14
1347
addq %rax,%r11
1348
movq %rbx,%rax
1349
movq 8(%rsi,%rcx,1),%rbx
1350
movq %rdx,%r10
1351
adcq $0,%r10
1352
addq %r13,%r11
1353
adcq $0,%r10
1354
1355
1356
mulq %r15
1357
addq %rax,%r12
1358
movq %rbx,%rax
1359
movq %r11,(%rdi,%rcx,1)
1360
movq %rdx,%r13
1361
adcq $0,%r13
1362
1363
mulq %r14
1364
addq %rax,%r10
1365
movq %rbx,%rax
1366
movq 16(%rsi,%rcx,1),%rbx
1367
movq %rdx,%r11
1368
adcq $0,%r11
1369
addq %r12,%r10
1370
adcq $0,%r11
1371
1372
mulq %r15
1373
addq %rax,%r13
1374
movq %rbx,%rax
1375
movq %r10,8(%rdi,%rcx,1)
1376
movq %rdx,%r12
1377
adcq $0,%r12
1378
1379
mulq %r14
1380
addq %rax,%r11
1381
movq %rbx,%rax
1382
movq 24(%rsi,%rcx,1),%rbx
1383
movq %rdx,%r10
1384
adcq $0,%r10
1385
addq %r13,%r11
1386
adcq $0,%r10
1387
1388
1389
mulq %r15
1390
addq %rax,%r12
1391
movq %rbx,%rax
1392
movq %r11,16(%rdi,%rcx,1)
1393
movq %rdx,%r13
1394
adcq $0,%r13
1395
leaq 32(%rcx),%rcx
1396
1397
mulq %r14
1398
addq %rax,%r10
1399
movq %rbx,%rax
1400
movq %rdx,%r11
1401
adcq $0,%r11
1402
addq %r12,%r10
1403
adcq $0,%r11
1404
movq %r10,-8(%rdi,%rcx,1)
1405
1406
cmpq $0,%rcx
1407
jne .Lsqr4x_1st
1408
1409
mulq %r15
1410
addq %rax,%r13
1411
leaq 16(%rbp),%rbp
1412
adcq $0,%rdx
1413
addq %r11,%r13
1414
adcq $0,%rdx
1415
1416
movq %r13,(%rdi)
1417
movq %rdx,%r12
1418
movq %rdx,8(%rdi)
1419
jmp .Lsqr4x_outer
1420
1421
.align 32
1422
.Lsqr4x_outer:
1423
movq -32(%rsi,%rbp,1),%r14
1424
leaq 48+8(%rsp,%r9,2),%rdi
1425
movq -24(%rsi,%rbp,1),%rax
1426
leaq -32(%rdi,%rbp,1),%rdi
1427
movq -16(%rsi,%rbp,1),%rbx
1428
movq %rax,%r15
1429
1430
mulq %r14
1431
movq -24(%rdi,%rbp,1),%r10
1432
addq %rax,%r10
1433
movq %rbx,%rax
1434
adcq $0,%rdx
1435
movq %r10,-24(%rdi,%rbp,1)
1436
movq %rdx,%r11
1437
1438
mulq %r14
1439
addq %rax,%r11
1440
movq %rbx,%rax
1441
adcq $0,%rdx
1442
addq -16(%rdi,%rbp,1),%r11
1443
movq %rdx,%r10
1444
adcq $0,%r10
1445
movq %r11,-16(%rdi,%rbp,1)
1446
1447
xorq %r12,%r12
1448
1449
movq -8(%rsi,%rbp,1),%rbx
1450
mulq %r15
1451
addq %rax,%r12
1452
movq %rbx,%rax
1453
adcq $0,%rdx
1454
addq -8(%rdi,%rbp,1),%r12
1455
movq %rdx,%r13
1456
adcq $0,%r13
1457
1458
mulq %r14
1459
addq %rax,%r10
1460
movq %rbx,%rax
1461
adcq $0,%rdx
1462
addq %r12,%r10
1463
movq %rdx,%r11
1464
adcq $0,%r11
1465
movq %r10,-8(%rdi,%rbp,1)
1466
1467
leaq (%rbp),%rcx
1468
jmp .Lsqr4x_inner
1469
1470
.align 32
1471
.Lsqr4x_inner:
1472
movq (%rsi,%rcx,1),%rbx
1473
mulq %r15
1474
addq %rax,%r13
1475
movq %rbx,%rax
1476
movq %rdx,%r12
1477
adcq $0,%r12
1478
addq (%rdi,%rcx,1),%r13
1479
adcq $0,%r12
1480
1481
.byte 0x67
1482
mulq %r14
1483
addq %rax,%r11
1484
movq %rbx,%rax
1485
movq 8(%rsi,%rcx,1),%rbx
1486
movq %rdx,%r10
1487
adcq $0,%r10
1488
addq %r13,%r11
1489
adcq $0,%r10
1490
1491
mulq %r15
1492
addq %rax,%r12
1493
movq %r11,(%rdi,%rcx,1)
1494
movq %rbx,%rax
1495
movq %rdx,%r13
1496
adcq $0,%r13
1497
addq 8(%rdi,%rcx,1),%r12
1498
leaq 16(%rcx),%rcx
1499
adcq $0,%r13
1500
1501
mulq %r14
1502
addq %rax,%r10
1503
movq %rbx,%rax
1504
adcq $0,%rdx
1505
addq %r12,%r10
1506
movq %rdx,%r11
1507
adcq $0,%r11
1508
movq %r10,-8(%rdi,%rcx,1)
1509
1510
cmpq $0,%rcx
1511
jne .Lsqr4x_inner
1512
1513
.byte 0x67
1514
mulq %r15
1515
addq %rax,%r13
1516
adcq $0,%rdx
1517
addq %r11,%r13
1518
adcq $0,%rdx
1519
1520
movq %r13,(%rdi)
1521
movq %rdx,%r12
1522
movq %rdx,8(%rdi)
1523
1524
addq $16,%rbp
1525
jnz .Lsqr4x_outer
movq -32(%rsi),%r14
1529
leaq 48+8(%rsp,%r9,2),%rdi
1530
movq -24(%rsi),%rax
1531
leaq -32(%rdi,%rbp,1),%rdi
1532
movq -16(%rsi),%rbx
1533
movq %rax,%r15
1534
1535
mulq %r14
1536
addq %rax,%r10
1537
movq %rbx,%rax
1538
movq %rdx,%r11
1539
adcq $0,%r11
1540
1541
mulq %r14
1542
addq %rax,%r11
1543
movq %rbx,%rax
1544
movq %r10,-24(%rdi)
1545
movq %rdx,%r10
1546
adcq $0,%r10
1547
addq %r13,%r11
1548
movq -8(%rsi),%rbx
1549
adcq $0,%r10
1550
1551
mulq %r15
1552
addq %rax,%r12
1553
movq %rbx,%rax
1554
movq %r11,-16(%rdi)
1555
movq %rdx,%r13
1556
adcq $0,%r13
1557
1558
mulq %r14
1559
addq %rax,%r10
1560
movq %rbx,%rax
1561
movq %rdx,%r11
1562
adcq $0,%r11
1563
addq %r12,%r10
1564
adcq $0,%r11
1565
movq %r10,-8(%rdi)
1566
1567
mulq %r15
1568
addq %rax,%r13
1569
movq -16(%rsi),%rax
1570
adcq $0,%rdx
1571
addq %r11,%r13
1572
adcq $0,%rdx
1573
1574
movq %r13,(%rdi)
1575
movq %rdx,%r12
1576
movq %rdx,8(%rdi)
1577
1578
mulq %rbx
1579
addq $16,%rbp
1580
xorq %r14,%r14
1581
subq %r9,%rbp
1582
xorq %r15,%r15
1583
1584
addq %r12,%rax
1585
adcq $0,%rdx
1586
movq %rax,8(%rdi)
1587
movq %rdx,16(%rdi)
1588
movq %r15,24(%rdi)
1589
1590
movq -16(%rsi,%rbp,1),%rax
1591
leaq 48+8(%rsp),%rdi
1592
xorq %r10,%r10
1593
movq 8(%rdi),%r11
1594
1595
leaq (%r14,%r10,2),%r12
1596
shrq $63,%r10
1597
leaq (%rcx,%r11,2),%r13
1598
shrq $63,%r11
1599
orq %r10,%r13
1600
movq 16(%rdi),%r10
1601
movq %r11,%r14
1602
mulq %rax
1603
negq %r15
1604
movq 24(%rdi),%r11
1605
adcq %rax,%r12
1606
movq -8(%rsi,%rbp,1),%rax
1607
movq %r12,(%rdi)
1608
adcq %rdx,%r13
1609
1610
leaq (%r14,%r10,2),%rbx
1611
movq %r13,8(%rdi)
1612
sbbq %r15,%r15
1613
shrq $63,%r10
1614
leaq (%rcx,%r11,2),%r8
1615
shrq $63,%r11
1616
orq %r10,%r8
1617
movq 32(%rdi),%r10
1618
movq %r11,%r14
1619
mulq %rax
1620
negq %r15
1621
movq 40(%rdi),%r11
1622
adcq %rax,%rbx
1623
movq 0(%rsi,%rbp,1),%rax
1624
movq %rbx,16(%rdi)
1625
adcq %rdx,%r8
1626
leaq 16(%rbp),%rbp
1627
movq %r8,24(%rdi)
1628
sbbq %r15,%r15
1629
leaq 64(%rdi),%rdi
1630
jmp .Lsqr4x_shift_n_add
1631
1632
.align 32
1633
.Lsqr4x_shift_n_add:
1634
leaq (%r14,%r10,2),%r12
1635
shrq $63,%r10
1636
leaq (%rcx,%r11,2),%r13
1637
shrq $63,%r11
1638
orq %r10,%r13
1639
movq -16(%rdi),%r10
1640
movq %r11,%r14
1641
mulq %rax
1642
negq %r15
1643
movq -8(%rdi),%r11
1644
adcq %rax,%r12
1645
movq -8(%rsi,%rbp,1),%rax
1646
movq %r12,-32(%rdi)
1647
adcq %rdx,%r13
1648
1649
leaq (%r14,%r10,2),%rbx
1650
movq %r13,-24(%rdi)
1651
sbbq %r15,%r15
1652
shrq $63,%r10
1653
leaq (%rcx,%r11,2),%r8
1654
shrq $63,%r11
1655
orq %r10,%r8
1656
movq 0(%rdi),%r10
1657
movq %r11,%r14
1658
mulq %rax
1659
negq %r15
1660
movq 8(%rdi),%r11
1661
adcq %rax,%rbx
1662
movq 0(%rsi,%rbp,1),%rax
1663
movq %rbx,-16(%rdi)
1664
adcq %rdx,%r8
1665
1666
leaq (%r14,%r10,2),%r12
1667
movq %r8,-8(%rdi)
1668
sbbq %r15,%r15
1669
shrq $63,%r10
1670
leaq (%rcx,%r11,2),%r13
1671
shrq $63,%r11
1672
orq %r10,%r13
1673
movq 16(%rdi),%r10
1674
movq %r11,%r14
1675
mulq %rax
1676
negq %r15
1677
movq 24(%rdi),%r11
1678
adcq %rax,%r12
1679
movq 8(%rsi,%rbp,1),%rax
1680
movq %r12,0(%rdi)
1681
adcq %rdx,%r13
1682
1683
leaq (%r14,%r10,2),%rbx
1684
movq %r13,8(%rdi)
1685
sbbq %r15,%r15
1686
shrq $63,%r10
1687
leaq (%rcx,%r11,2),%r8
1688
shrq $63,%r11
1689
orq %r10,%r8
1690
movq 32(%rdi),%r10
1691
movq %r11,%r14
1692
mulq %rax
1693
negq %r15
1694
movq 40(%rdi),%r11
1695
adcq %rax,%rbx
1696
movq 16(%rsi,%rbp,1),%rax
1697
movq %rbx,16(%rdi)
1698
adcq %rdx,%r8
1699
movq %r8,24(%rdi)
1700
sbbq %r15,%r15
1701
leaq 64(%rdi),%rdi
1702
addq $32,%rbp
1703
jnz .Lsqr4x_shift_n_add
1704
1705
leaq (%r14,%r10,2),%r12
1706
.byte 0x67
1707
shrq $63,%r10
1708
leaq (%rcx,%r11,2),%r13
1709
shrq $63,%r11
1710
orq %r10,%r13
1711
movq -16(%rdi),%r10
1712
movq %r11,%r14
1713
mulq %rax
1714
negq %r15
1715
movq -8(%rdi),%r11
1716
adcq %rax,%r12
1717
movq -8(%rsi),%rax
1718
movq %r12,-32(%rdi)
1719
adcq %rdx,%r13
1720
1721
leaq (%r14,%r10,2),%rbx
1722
movq %r13,-24(%rdi)
1723
sbbq %r15,%r15
1724
shrq $63,%r10
1725
leaq (%rcx,%r11,2),%r8
1726
shrq $63,%r11
1727
orq %r10,%r8
1728
mulq %rax
1729
negq %r15
1730
adcq %rax,%rbx
1731
adcq %rdx,%r8
1732
movq %rbx,-16(%rdi)
1733
movq %r8,-8(%rdi)
1734
.byte 102,72,15,126,213
1735
__bn_sqr8x_reduction:
1736
xorq %rax,%rax
1737
leaq (%r9,%rbp,1),%rcx
1738
leaq 48+8(%rsp,%r9,2),%rdx
1739
movq %rcx,0+8(%rsp)
1740
leaq 48+8(%rsp,%r9,1),%rdi
1741
movq %rdx,8+8(%rsp)
1742
negq %r9
1743
jmp .L8x_reduction_loop
1744
1745
.align 32
1746
.L8x_reduction_loop:
1747
leaq (%rdi,%r9,1),%rdi
1748
.byte 0x66
1749
movq 0(%rdi),%rbx
1750
movq 8(%rdi),%r9
1751
movq 16(%rdi),%r10
1752
movq 24(%rdi),%r11
1753
movq 32(%rdi),%r12
1754
movq 40(%rdi),%r13
1755
movq 48(%rdi),%r14
1756
movq 56(%rdi),%r15
1757
movq %rax,(%rdx)
1758
leaq 64(%rdi),%rdi
1759
1760
.byte 0x67
1761
movq %rbx,%r8
1762
imulq 32+8(%rsp),%rbx
1763
movq 0(%rbp),%rax
1764
movl $8,%ecx
1765
jmp .L8x_reduce
1766
1767
.align 32
1768
.L8x_reduce:
1769
mulq %rbx
1770
movq 8(%rbp),%rax
1771
negq %r8
1772
movq %rdx,%r8
1773
adcq $0,%r8
1774
1775
mulq %rbx
1776
addq %rax,%r9
1777
movq 16(%rbp),%rax
1778
adcq $0,%rdx
1779
addq %r9,%r8
1780
movq %rbx,48-8+8(%rsp,%rcx,8)
1781
movq %rdx,%r9
1782
adcq $0,%r9
1783
1784
mulq %rbx
1785
addq %rax,%r10
1786
movq 24(%rbp),%rax
1787
adcq $0,%rdx
1788
addq %r10,%r9
1789
movq 32+8(%rsp),%rsi
1790
movq %rdx,%r10
1791
adcq $0,%r10
1792
1793
mulq %rbx
1794
addq %rax,%r11
1795
movq 32(%rbp),%rax
1796
adcq $0,%rdx
1797
imulq %r8,%rsi
1798
addq %r11,%r10
1799
movq %rdx,%r11
1800
adcq $0,%r11
1801
1802
mulq %rbx
1803
addq %rax,%r12
1804
movq 40(%rbp),%rax
1805
adcq $0,%rdx
1806
addq %r12,%r11
1807
movq %rdx,%r12
1808
adcq $0,%r12
1809
1810
mulq %rbx
1811
addq %rax,%r13
1812
movq 48(%rbp),%rax
1813
adcq $0,%rdx
1814
addq %r13,%r12
1815
movq %rdx,%r13
1816
adcq $0,%r13
1817
1818
mulq %rbx
1819
addq %rax,%r14
1820
movq 56(%rbp),%rax
1821
adcq $0,%rdx
1822
addq %r14,%r13
1823
movq %rdx,%r14
1824
adcq $0,%r14
1825
1826
mulq %rbx
1827
movq %rsi,%rbx
1828
addq %rax,%r15
1829
movq 0(%rbp),%rax
1830
adcq $0,%rdx
1831
addq %r15,%r14
1832
movq %rdx,%r15
1833
adcq $0,%r15
1834
1835
decl %ecx
1836
jnz .L8x_reduce
1837
1838
leaq 64(%rbp),%rbp
1839
xorq %rax,%rax
1840
movq 8+8(%rsp),%rdx
1841
cmpq 0+8(%rsp),%rbp
1842
jae .L8x_no_tail
1843
1844
.byte 0x66
1845
addq 0(%rdi),%r8
1846
adcq 8(%rdi),%r9
1847
adcq 16(%rdi),%r10
1848
adcq 24(%rdi),%r11
1849
adcq 32(%rdi),%r12
1850
adcq 40(%rdi),%r13
1851
adcq 48(%rdi),%r14
1852
adcq 56(%rdi),%r15
1853
sbbq %rsi,%rsi
1854
1855
movq 48+56+8(%rsp),%rbx
1856
movl $8,%ecx
1857
movq 0(%rbp),%rax
1858
jmp .L8x_tail
1859
1860
.align 32
1861
.L8x_tail:
1862
mulq %rbx
1863
addq %rax,%r8
1864
movq 8(%rbp),%rax
1865
movq %r8,(%rdi)
1866
movq %rdx,%r8
1867
adcq $0,%r8
1868
1869
mulq %rbx
1870
addq %rax,%r9
1871
movq 16(%rbp),%rax
1872
adcq $0,%rdx
1873
addq %r9,%r8
1874
leaq 8(%rdi),%rdi
1875
movq %rdx,%r9
1876
adcq $0,%r9
1877
1878
mulq %rbx
1879
addq %rax,%r10
1880
movq 24(%rbp),%rax
1881
adcq $0,%rdx
1882
addq %r10,%r9
1883
movq %rdx,%r10
1884
adcq $0,%r10
1885
1886
mulq %rbx
1887
addq %rax,%r11
1888
movq 32(%rbp),%rax
1889
adcq $0,%rdx
1890
addq %r11,%r10
1891
movq %rdx,%r11
1892
adcq $0,%r11
1893
1894
mulq %rbx
1895
addq %rax,%r12
1896
movq 40(%rbp),%rax
1897
adcq $0,%rdx
1898
addq %r12,%r11
1899
movq %rdx,%r12
1900
adcq $0,%r12
1901
1902
mulq %rbx
1903
addq %rax,%r13
1904
movq 48(%rbp),%rax
1905
adcq $0,%rdx
1906
addq %r13,%r12
1907
movq %rdx,%r13
1908
adcq $0,%r13
1909
1910
mulq %rbx
1911
addq %rax,%r14
1912
movq 56(%rbp),%rax
1913
adcq $0,%rdx
1914
addq %r14,%r13
1915
movq %rdx,%r14
1916
adcq $0,%r14
1917
1918
mulq %rbx
1919
movq 48-16+8(%rsp,%rcx,8),%rbx
1920
addq %rax,%r15
1921
adcq $0,%rdx
1922
addq %r15,%r14
1923
movq 0(%rbp),%rax
1924
movq %rdx,%r15
1925
adcq $0,%r15
1926
1927
decl %ecx
1928
jnz .L8x_tail
1929
1930
leaq 64(%rbp),%rbp
1931
movq 8+8(%rsp),%rdx
1932
cmpq 0+8(%rsp),%rbp
1933
jae .L8x_tail_done
1934
1935
movq 48+56+8(%rsp),%rbx
1936
negq %rsi
1937
movq 0(%rbp),%rax
1938
adcq 0(%rdi),%r8
1939
adcq 8(%rdi),%r9
1940
adcq 16(%rdi),%r10
1941
adcq 24(%rdi),%r11
1942
adcq 32(%rdi),%r12
1943
adcq 40(%rdi),%r13
1944
adcq 48(%rdi),%r14
1945
adcq 56(%rdi),%r15
1946
sbbq %rsi,%rsi
1947
1948
movl $8,%ecx
1949
jmp .L8x_tail
1950
1951
.align 32
1952
.L8x_tail_done:
1953
xorq %rax,%rax
1954
addq (%rdx),%r8
1955
adcq $0,%r9
1956
adcq $0,%r10
1957
adcq $0,%r11
1958
adcq $0,%r12
1959
adcq $0,%r13
1960
adcq $0,%r14
1961
adcq $0,%r15
1962
adcq $0,%rax
1963
1964
negq %rsi
1965
.L8x_no_tail:
1966
adcq 0(%rdi),%r8
1967
adcq 8(%rdi),%r9
1968
adcq 16(%rdi),%r10
1969
adcq 24(%rdi),%r11
1970
adcq 32(%rdi),%r12
1971
adcq 40(%rdi),%r13
1972
adcq 48(%rdi),%r14
1973
adcq 56(%rdi),%r15
1974
adcq $0,%rax
1975
movq -8(%rbp),%rcx
1976
xorq %rsi,%rsi
1977
1978
.byte 102,72,15,126,213
1979
1980
movq %r8,0(%rdi)
1981
movq %r9,8(%rdi)
1982
.byte 102,73,15,126,217
1983
movq %r10,16(%rdi)
1984
movq %r11,24(%rdi)
1985
movq %r12,32(%rdi)
1986
movq %r13,40(%rdi)
1987
movq %r14,48(%rdi)
1988
movq %r15,56(%rdi)
1989
leaq 64(%rdi),%rdi
1990
1991
cmpq %rdx,%rdi
1992
jb .L8x_reduction_loop
1993
.byte 0xf3,0xc3
1994
.cfi_endproc
1995
.size bn_sqr8x_internal,.-bn_sqr8x_internal
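
bn_sqr8x_internal exploits the symmetry of squaring: each off-diagonal
product a[i]*a[j] with i < j is computed once (.Lsqr4x_1st, .Lsqr4x_outer,
.Lsqr4x_inner), then the accumulated result is doubled one word at a time
while the diagonal squares a[i]^2 are folded in (.Lsqr4x_shift_n_add),
roughly halving the multiply count versus a general product. A C sketch of
that schedule, simplified into separate passes; illustrative only:

#include <stdint.h>
#include <stddef.h>

/* t (2*num words) = a^2: cross products once, double, add diagonal. */
static void sqr_sketch(uint64_t *t, const uint64_t *a, size_t num)
{
    for (size_t i = 0; i < 2 * num; i++) t[i] = 0;

    for (size_t i = 0; i < num; i++) {   /* each a[i]*a[j], i < j, once */
        unsigned __int128 c = 0;
        for (size_t j = i + 1; j < num; j++) {
            c += (unsigned __int128)a[i] * a[j] + t[i + j];
            t[i + j] = (uint64_t)c;
            c >>= 64;
        }
        t[i + num] = (uint64_t)c;
    }

    uint64_t carry = 0;                  /* double: shift left by one bit */
    for (size_t i = 0; i < 2 * num; i++) {
        uint64_t hi = t[i] >> 63;
        t[i] = (t[i] << 1) | carry;
        carry = hi;
    }

    unsigned __int128 c = 0;             /* fold in the diagonal squares */
    for (size_t i = 0; i < num; i++) {
        c += (unsigned __int128)a[i] * a[i] + t[2 * i];
        t[2 * i] = (uint64_t)c;
        c >>= 64;
        c += t[2 * i + 1];
        t[2 * i + 1] = (uint64_t)c;
        c >>= 64;
    }
}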
1996
.type __bn_post4x_internal,@function
1997
.align 32
1998
__bn_post4x_internal:
1999
.cfi_startproc
2000
movq 0(%rbp),%r12
2001
leaq (%rdi,%r9,1),%rbx
2002
movq %r9,%rcx
2003
.byte 102,72,15,126,207
2004
negq %rax
2005
.byte 102,72,15,126,206
2006
sarq $3+2,%rcx
2007
decq %r12
2008
xorq %r10,%r10
2009
movq 8(%rbp),%r13
2010
movq 16(%rbp),%r14
2011
movq 24(%rbp),%r15
2012
jmp .Lsqr4x_sub_entry
2013
2014
.align 16
2015
.Lsqr4x_sub:
2016
movq 0(%rbp),%r12
2017
movq 8(%rbp),%r13
2018
movq 16(%rbp),%r14
2019
movq 24(%rbp),%r15
2020
.Lsqr4x_sub_entry:
2021
leaq 32(%rbp),%rbp
2022
notq %r12
2023
notq %r13
2024
notq %r14
2025
notq %r15
2026
andq %rax,%r12
2027
andq %rax,%r13
2028
andq %rax,%r14
2029
andq %rax,%r15
2030
2031
negq %r10
2032
adcq 0(%rbx),%r12
2033
adcq 8(%rbx),%r13
2034
adcq 16(%rbx),%r14
2035
adcq 24(%rbx),%r15
2036
movq %r12,0(%rdi)
2037
leaq 32(%rbx),%rbx
2038
movq %r13,8(%rdi)
2039
sbbq %r10,%r10
2040
movq %r14,16(%rdi)
2041
movq %r15,24(%rdi)
2042
leaq 32(%rdi),%rdi
2043
2044
incq %rcx
2045
jnz .Lsqr4x_sub
2046
2047
movq %r9,%r10
2048
negq %r9
2049
.byte 0xf3,0xc3
2050
.cfi_endproc
2051
.size __bn_post4x_internal,.-__bn_post4x_internal
2052
.type bn_mulx4x_mont_gather5,@function
2053
.align 32
2054
bn_mulx4x_mont_gather5:
2055
.cfi_startproc
2056
movq %rsp,%rax
2057
.cfi_def_cfa_register %rax
2058
.Lmulx4x_enter:
2059
pushq %rbx
2060
.cfi_offset %rbx,-16
2061
pushq %rbp
2062
.cfi_offset %rbp,-24
2063
pushq %r12
2064
.cfi_offset %r12,-32
2065
pushq %r13
2066
.cfi_offset %r13,-40
2067
pushq %r14
2068
.cfi_offset %r14,-48
2069
pushq %r15
2070
.cfi_offset %r15,-56
2071
.Lmulx4x_prologue:
2072
2073
shll $3,%r9d
2074
leaq (%r9,%r9,2),%r10
2075
negq %r9
2076
movq (%r8),%r8
leaq -320(%rsp,%r9,2),%r11
2088
movq %rsp,%rbp
2089
subq %rdi,%r11
2090
andq $4095,%r11
2091
cmpq %r11,%r10
2092
jb .Lmulx4xsp_alt
2093
subq %r11,%rbp
2094
leaq -320(%rbp,%r9,2),%rbp
2095
jmp .Lmulx4xsp_done
2096
2097
.Lmulx4xsp_alt:
2098
leaq 4096-320(,%r9,2),%r10
2099
leaq -320(%rbp,%r9,2),%rbp
2100
subq %r10,%r11
2101
movq $0,%r10
2102
cmovcq %r10,%r11
2103
subq %r11,%rbp
2104
.Lmulx4xsp_done:
2105
andq $-64,%rbp
2106
movq %rsp,%r11
2107
subq %rbp,%r11
2108
andq $-4096,%r11
2109
leaq (%r11,%rbp,1),%rsp
2110
movq (%rsp),%r10
2111
cmpq %rbp,%rsp
2112
ja .Lmulx4x_page_walk
2113
jmp .Lmulx4x_page_walk_done
2114
2115
.Lmulx4x_page_walk:
2116
leaq -4096(%rsp),%rsp
2117
movq (%rsp),%r10
2118
cmpq %rbp,%rsp
2119
ja .Lmulx4x_page_walk
2120
.Lmulx4x_page_walk_done:
movq %r8,32(%rsp)
2135
movq %rax,40(%rsp)
2136
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2137
.Lmulx4x_body:
2138
call mulx4x_internal
2139
2140
movq 40(%rsp),%rsi
2141
.cfi_def_cfa %rsi,8
2142
movq $1,%rax
2143
2144
movq -48(%rsi),%r15
2145
.cfi_restore %r15
2146
movq -40(%rsi),%r14
2147
.cfi_restore %r14
2148
movq -32(%rsi),%r13
2149
.cfi_restore %r13
2150
movq -24(%rsi),%r12
2151
.cfi_restore %r12
2152
movq -16(%rsi),%rbp
2153
.cfi_restore %rbp
2154
movq -8(%rsi),%rbx
2155
.cfi_restore %rbx
2156
leaq (%rsi),%rsp
2157
.cfi_def_cfa_register %rsp
2158
.Lmulx4x_epilogue:
2159
.byte 0xf3,0xc3
2160
.cfi_endproc
2161
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
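
The x-suffixed kernels lean on BMI2's MULX, which multiplies without
touching the flags, and ADX's ADCX/ADOX, which carry through CF and OF
independently, so two carry chains (one for the low halves and one for the
high halves of the partial products) run in parallel instead of serializing
on a single carry flag. A sketch of one multiply-accumulate row with the
matching intrinsics; compile with -mbmi2 -madx; illustrative, with the
final carries folded naively:

#include <stddef.h>
#include <immintrin.h>

/* t[0..num] += a[0..num-1] * b, using two independent carry chains. */
static void mac_row_sketch(unsigned long long *t, const unsigned long long *a,
                           unsigned long long b, size_t num)
{
    unsigned char cf = 0, of = 0;
    unsigned long long lo, hi, prev_hi = 0;
    for (size_t i = 0; i < num; i++) {
        lo = _mulx_u64(a[i], b, &hi);        /* mulxq: flags untouched */
        cf = _addcarryx_u64(cf, t[i], lo, &t[i]);      /* adcxq chain (CF) */
        of = _addcarryx_u64(of, t[i], prev_hi, &t[i]); /* adoxq chain (OF) */
        prev_hi = hi;
    }
    /* naive tail: a real kernel keeps propagating these carries */
    t[num] += prev_hi + cf + of;
}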
2162
2163
.type mulx4x_internal,@function
2164
.align 32
2165
mulx4x_internal:
2166
.cfi_startproc
2167
movq %r9,8(%rsp)
2168
movq %r9,%r10
2169
negq %r9
2170
shlq $5,%r9
2171
negq %r10
2172
leaq 128(%rdx,%r9,1),%r13
2173
shrq $5+5,%r9
2174
movd 8(%rax),%xmm5
2175
subq $1,%r9
2176
leaq .Linc(%rip),%rax
2177
movq %r13,16+8(%rsp)
2178
movq %r9,24+8(%rsp)
2179
movq %rdi,56+8(%rsp)
2180
movdqa 0(%rax),%xmm0
2181
movdqa 16(%rax),%xmm1
2182
leaq 88-112(%rsp,%r10,1),%r10
2183
leaq 128(%rdx),%rdi
2184
2185
pshufd $0,%xmm5,%xmm5
2186
movdqa %xmm1,%xmm4
2187
.byte 0x67
2188
movdqa %xmm1,%xmm2
2189
.byte 0x67
2190
paddd %xmm0,%xmm1
2191
pcmpeqd %xmm5,%xmm0
2192
movdqa %xmm4,%xmm3
2193
paddd %xmm1,%xmm2
2194
pcmpeqd %xmm5,%xmm1
2195
movdqa %xmm0,112(%r10)
2196
movdqa %xmm4,%xmm0
2197
2198
paddd %xmm2,%xmm3
2199
pcmpeqd %xmm5,%xmm2
2200
movdqa %xmm1,128(%r10)
2201
movdqa %xmm4,%xmm1
2202
2203
paddd %xmm3,%xmm0
2204
pcmpeqd %xmm5,%xmm3
2205
movdqa %xmm2,144(%r10)
2206
movdqa %xmm4,%xmm2
2207
2208
paddd %xmm0,%xmm1
2209
pcmpeqd %xmm5,%xmm0
2210
movdqa %xmm3,160(%r10)
2211
movdqa %xmm4,%xmm3
2212
paddd %xmm1,%xmm2
2213
pcmpeqd %xmm5,%xmm1
2214
movdqa %xmm0,176(%r10)
2215
movdqa %xmm4,%xmm0
2216
2217
paddd %xmm2,%xmm3
2218
pcmpeqd %xmm5,%xmm2
2219
movdqa %xmm1,192(%r10)
2220
movdqa %xmm4,%xmm1
2221
2222
paddd %xmm3,%xmm0
2223
pcmpeqd %xmm5,%xmm3
2224
movdqa %xmm2,208(%r10)
2225
movdqa %xmm4,%xmm2
2226
2227
paddd %xmm0,%xmm1
2228
pcmpeqd %xmm5,%xmm0
2229
movdqa %xmm3,224(%r10)
2230
movdqa %xmm4,%xmm3
2231
paddd %xmm1,%xmm2
2232
pcmpeqd %xmm5,%xmm1
2233
movdqa %xmm0,240(%r10)
2234
movdqa %xmm4,%xmm0
2235
2236
paddd %xmm2,%xmm3
2237
pcmpeqd %xmm5,%xmm2
2238
movdqa %xmm1,256(%r10)
2239
movdqa %xmm4,%xmm1
2240
2241
paddd %xmm3,%xmm0
2242
pcmpeqd %xmm5,%xmm3
2243
movdqa %xmm2,272(%r10)
2244
movdqa %xmm4,%xmm2
2245
2246
paddd %xmm0,%xmm1
2247
pcmpeqd %xmm5,%xmm0
2248
movdqa %xmm3,288(%r10)
2249
movdqa %xmm4,%xmm3
2250
.byte 0x67
2251
paddd %xmm1,%xmm2
2252
pcmpeqd %xmm5,%xmm1
2253
movdqa %xmm0,304(%r10)
2254
2255
paddd %xmm2,%xmm3
2256
pcmpeqd %xmm5,%xmm2
2257
movdqa %xmm1,320(%r10)
2258
2259
pcmpeqd %xmm5,%xmm3
2260
movdqa %xmm2,336(%r10)
2261
2262
pand 64(%rdi),%xmm0
2263
pand 80(%rdi),%xmm1
2264
pand 96(%rdi),%xmm2
2265
movdqa %xmm3,352(%r10)
2266
pand 112(%rdi),%xmm3
2267
por %xmm2,%xmm0
2268
por %xmm3,%xmm1
2269
movdqa -128(%rdi),%xmm4
2270
movdqa -112(%rdi),%xmm5
2271
movdqa -96(%rdi),%xmm2
2272
pand 112(%r10),%xmm4
2273
movdqa -80(%rdi),%xmm3
2274
pand 128(%r10),%xmm5
2275
por %xmm4,%xmm0
2276
pand 144(%r10),%xmm2
2277
por %xmm5,%xmm1
2278
pand 160(%r10),%xmm3
2279
por %xmm2,%xmm0
2280
por %xmm3,%xmm1
2281
movdqa -64(%rdi),%xmm4
2282
movdqa -48(%rdi),%xmm5
2283
movdqa -32(%rdi),%xmm2
2284
pand 176(%r10),%xmm4
2285
movdqa -16(%rdi),%xmm3
2286
pand 192(%r10),%xmm5
2287
por %xmm4,%xmm0
2288
pand 208(%r10),%xmm2
2289
por %xmm5,%xmm1
2290
pand 224(%r10),%xmm3
2291
por %xmm2,%xmm0
2292
por %xmm3,%xmm1
2293
movdqa 0(%rdi),%xmm4
2294
movdqa 16(%rdi),%xmm5
2295
movdqa 32(%rdi),%xmm2
2296
pand 240(%r10),%xmm4
2297
movdqa 48(%rdi),%xmm3
2298
pand 256(%r10),%xmm5
2299
por %xmm4,%xmm0
2300
pand 272(%r10),%xmm2
2301
por %xmm5,%xmm1
2302
pand 288(%r10),%xmm3
2303
por %xmm2,%xmm0
2304
por %xmm3,%xmm1
2305
pxor %xmm1,%xmm0
2306
pshufd $0x4e,%xmm0,%xmm1
2307
por %xmm1,%xmm0
2308
leaq 256(%rdi),%rdi
2309
.byte 102,72,15,126,194
2310
leaq 64+32+8(%rsp),%rbx
2311
2312
movq %rdx,%r9
2313
mulxq 0(%rsi),%r8,%rax
2314
mulxq 8(%rsi),%r11,%r12
2315
addq %rax,%r11
2316
mulxq 16(%rsi),%rax,%r13
2317
adcq %rax,%r12
2318
adcq $0,%r13
2319
mulxq 24(%rsi),%rax,%r14
2320
2321
movq %r8,%r15
2322
imulq 32+8(%rsp),%r8
2323
xorq %rbp,%rbp
2324
movq %r8,%rdx
2325
2326
movq %rdi,8+8(%rsp)
2327
2328
leaq 32(%rsi),%rsi
2329
adcxq %rax,%r13
2330
adcxq %rbp,%r14
2331
2332
mulxq 0(%rcx),%rax,%r10
2333
adcxq %rax,%r15
2334
adoxq %r11,%r10
2335
mulxq 8(%rcx),%rax,%r11
2336
adcxq %rax,%r10
2337
adoxq %r12,%r11
2338
mulxq 16(%rcx),%rax,%r12
2339
movq 24+8(%rsp),%rdi
2340
movq %r10,-32(%rbx)
2341
adcxq %rax,%r11
2342
adoxq %r13,%r12
2343
mulxq 24(%rcx),%rax,%r15
2344
movq %r9,%rdx
2345
movq %r11,-24(%rbx)
2346
adcxq %rax,%r12
2347
adoxq %rbp,%r15
2348
leaq 32(%rcx),%rcx
2349
movq %r12,-16(%rbx)
2350
jmp .Lmulx4x_1st
2351
2352
.align 32
2353
.Lmulx4x_1st:
2354
adcxq %rbp,%r15
2355
mulxq 0(%rsi),%r10,%rax
2356
adcxq %r14,%r10
2357
mulxq 8(%rsi),%r11,%r14
2358
adcxq %rax,%r11
2359
mulxq 16(%rsi),%r12,%rax
2360
adcxq %r14,%r12
2361
mulxq 24(%rsi),%r13,%r14
2362
.byte 0x67,0x67
2363
movq %r8,%rdx
2364
adcxq %rax,%r13
2365
adcxq %rbp,%r14
2366
leaq 32(%rsi),%rsi
2367
leaq 32(%rbx),%rbx
2368
2369
adoxq %r15,%r10
2370
mulxq 0(%rcx),%rax,%r15
2371
adcxq %rax,%r10
2372
adoxq %r15,%r11
2373
mulxq 8(%rcx),%rax,%r15
2374
adcxq %rax,%r11
2375
adoxq %r15,%r12
2376
mulxq 16(%rcx),%rax,%r15
2377
movq %r10,-40(%rbx)
2378
adcxq %rax,%r12
2379
movq %r11,-32(%rbx)
2380
adoxq %r15,%r13
2381
mulxq 24(%rcx),%rax,%r15
2382
movq %r9,%rdx
2383
movq %r12,-24(%rbx)
2384
adcxq %rax,%r13
2385
adoxq %rbp,%r15
2386
leaq 32(%rcx),%rcx
2387
movq %r13,-16(%rbx)
2388
2389
decq %rdi
2390
jnz .Lmulx4x_1st
2391
2392
movq 8(%rsp),%rax
2393
adcq %rbp,%r15
2394
leaq (%rsi,%rax,1),%rsi
2395
addq %r15,%r14
2396
movq 8+8(%rsp),%rdi
2397
adcq %rbp,%rbp
2398
movq %r14,-8(%rbx)
2399
jmp .Lmulx4x_outer
2400
2401
.align 32
2402
.Lmulx4x_outer:
2403
leaq 16-256(%rbx),%r10
2404
pxor %xmm4,%xmm4
2405
.byte 0x67,0x67
2406
pxor %xmm5,%xmm5
2407
movdqa -128(%rdi),%xmm0
2408
movdqa -112(%rdi),%xmm1
2409
movdqa -96(%rdi),%xmm2
2410
pand 256(%r10),%xmm0
2411
movdqa -80(%rdi),%xmm3
2412
pand 272(%r10),%xmm1
2413
por %xmm0,%xmm4
2414
pand 288(%r10),%xmm2
2415
por %xmm1,%xmm5
2416
pand 304(%r10),%xmm3
2417
por %xmm2,%xmm4
2418
por %xmm3,%xmm5
2419
movdqa -64(%rdi),%xmm0
2420
movdqa -48(%rdi),%xmm1
2421
movdqa -32(%rdi),%xmm2
2422
pand 320(%r10),%xmm0
2423
movdqa -16(%rdi),%xmm3
2424
pand 336(%r10),%xmm1
2425
por %xmm0,%xmm4
2426
pand 352(%r10),%xmm2
2427
por %xmm1,%xmm5
2428
pand 368(%r10),%xmm3
2429
por %xmm2,%xmm4
2430
por %xmm3,%xmm5
2431
movdqa 0(%rdi),%xmm0
2432
movdqa 16(%rdi),%xmm1
2433
movdqa 32(%rdi),%xmm2
2434
pand 384(%r10),%xmm0
2435
movdqa 48(%rdi),%xmm3
2436
pand 400(%r10),%xmm1
2437
por %xmm0,%xmm4
2438
pand 416(%r10),%xmm2
2439
por %xmm1,%xmm5
2440
pand 432(%r10),%xmm3
2441
por %xmm2,%xmm4
2442
por %xmm3,%xmm5
2443
movdqa 64(%rdi),%xmm0
2444
movdqa 80(%rdi),%xmm1
2445
movdqa 96(%rdi),%xmm2
2446
pand 448(%r10),%xmm0
2447
movdqa 112(%rdi),%xmm3
2448
pand 464(%r10),%xmm1
2449
por %xmm0,%xmm4
2450
pand 480(%r10),%xmm2
2451
por %xmm1,%xmm5
2452
pand 496(%r10),%xmm3
2453
por %xmm2,%xmm4
2454
por %xmm3,%xmm5
2455
por %xmm5,%xmm4
2456
pshufd $0x4e,%xmm4,%xmm0
2457
por %xmm4,%xmm0
2458
leaq 256(%rdi),%rdi
2459
.byte 102,72,15,126,194
2460
2461
movq %rbp,(%rbx)
2462
leaq 32(%rbx,%rax,1),%rbx
2463
mulxq 0(%rsi),%r8,%r11
2464
xorq %rbp,%rbp
2465
movq %rdx,%r9
2466
mulxq 8(%rsi),%r14,%r12
2467
adoxq -32(%rbx),%r8
2468
adcxq %r14,%r11
2469
mulxq 16(%rsi),%r15,%r13
2470
adoxq -24(%rbx),%r11
2471
adcxq %r15,%r12
2472
mulxq 24(%rsi),%rdx,%r14
2473
adoxq -16(%rbx),%r12
2474
adcxq %rdx,%r13
2475
leaq (%rcx,%rax,1),%rcx
2476
leaq 32(%rsi),%rsi
2477
adoxq -8(%rbx),%r13
2478
adcxq %rbp,%r14
2479
adoxq %rbp,%r14
2480
2481
movq %r8,%r15
2482
imulq 32+8(%rsp),%r8
2483
2484
movq %r8,%rdx
2485
xorq %rbp,%rbp
2486
movq %rdi,8+8(%rsp)
2487
2488
mulxq 0(%rcx),%rax,%r10
2489
adcxq %rax,%r15
2490
adoxq %r11,%r10
2491
mulxq 8(%rcx),%rax,%r11
2492
adcxq %rax,%r10
2493
adoxq %r12,%r11
2494
mulxq 16(%rcx),%rax,%r12
2495
adcxq %rax,%r11
2496
adoxq %r13,%r12
2497
mulxq 24(%rcx),%rax,%r15
2498
movq %r9,%rdx
2499
movq 24+8(%rsp),%rdi
2500
movq %r10,-32(%rbx)
2501
adcxq %rax,%r12
2502
movq %r11,-24(%rbx)
2503
adoxq %rbp,%r15
2504
movq %r12,-16(%rbx)
2505
leaq 32(%rcx),%rcx
2506
jmp .Lmulx4x_inner
2507
2508
.align 32
2509
.Lmulx4x_inner:
2510
mulxq 0(%rsi),%r10,%rax
2511
adcxq %rbp,%r15
2512
adoxq %r14,%r10
2513
mulxq 8(%rsi),%r11,%r14
2514
adcxq 0(%rbx),%r10
2515
adoxq %rax,%r11
2516
mulxq 16(%rsi),%r12,%rax
2517
adcxq 8(%rbx),%r11
2518
adoxq %r14,%r12
2519
mulxq 24(%rsi),%r13,%r14
2520
movq %r8,%rdx
2521
adcxq 16(%rbx),%r12
2522
adoxq %rax,%r13
2523
adcxq 24(%rbx),%r13
2524
adoxq %rbp,%r14
2525
leaq 32(%rsi),%rsi
2526
leaq 32(%rbx),%rbx
2527
adcxq %rbp,%r14
2528
2529
adoxq %r15,%r10
2530
mulxq 0(%rcx),%rax,%r15
2531
adcxq %rax,%r10
2532
adoxq %r15,%r11
2533
mulxq 8(%rcx),%rax,%r15
2534
adcxq %rax,%r11
2535
adoxq %r15,%r12
2536
mulxq 16(%rcx),%rax,%r15
2537
movq %r10,-40(%rbx)
2538
adcxq %rax,%r12
2539
adoxq %r15,%r13
2540
movq %r11,-32(%rbx)
2541
mulxq 24(%rcx),%rax,%r15
2542
movq %r9,%rdx
2543
leaq 32(%rcx),%rcx
2544
movq %r12,-24(%rbx)
2545
adcxq %rax,%r13
2546
adoxq %rbp,%r15
2547
movq %r13,-16(%rbx)
2548
2549
decq %rdi
2550
jnz .Lmulx4x_inner
2551
2552
movq 0+8(%rsp),%rax
2553
adcq %rbp,%r15
2554
subq 0(%rbx),%rdi
2555
movq 8+8(%rsp),%rdi
2556
movq 16+8(%rsp),%r10
2557
adcq %r15,%r14
2558
leaq (%rsi,%rax,1),%rsi
2559
adcq %rbp,%rbp
2560
movq %r14,-8(%rbx)
2561
2562
cmpq %r10,%rdi
2563
jb .Lmulx4x_outer
2564
2565
movq -8(%rcx),%r10
2566
movq %rbp,%r8
2567
movq (%rcx,%rax,1),%r12
2568
leaq (%rcx,%rax,1),%rbp
2569
movq %rax,%rcx
2570
leaq (%rbx,%rax,1),%rdi
2571
xorl %eax,%eax
2572
xorq %r15,%r15
2573
subq %r14,%r10
2574
adcq %r15,%r15
2575
orq %r15,%r8
2576
sarq $3+2,%rcx
2577
subq %r8,%rax
2578
movq 56+8(%rsp),%rdx
2579
decq %r12
2580
movq 8(%rbp),%r13
2581
xorq %r8,%r8
2582
movq 16(%rbp),%r14
2583
movq 24(%rbp),%r15
2584
jmp .Lsqrx4x_sub_entry
2585
.cfi_endproc
2586
.size mulx4x_internal,.-mulx4x_internal
2587
.type bn_powerx5,@function
2588
.align 32
2589
bn_powerx5:
2590
.cfi_startproc
2591
movq %rsp,%rax
2592
.cfi_def_cfa_register %rax
2593
.Lpowerx5_enter:
2594
pushq %rbx
2595
.cfi_offset %rbx,-16
2596
pushq %rbp
2597
.cfi_offset %rbp,-24
2598
pushq %r12
2599
.cfi_offset %r12,-32
2600
pushq %r13
2601
.cfi_offset %r13,-40
2602
pushq %r14
2603
.cfi_offset %r14,-48
2604
pushq %r15
2605
.cfi_offset %r15,-56
2606
.Lpowerx5_prologue:
2607
2608
shll $3,%r9d
2609
leaq (%r9,%r9,2),%r10
2610
negq %r9
2611
movq (%r8),%r8
leaq -320(%rsp,%r9,2),%r11
2621
movq %rsp,%rbp
2622
subq %rdi,%r11
2623
andq $4095,%r11
2624
cmpq %r11,%r10
2625
jb .Lpwrx_sp_alt
2626
subq %r11,%rbp
2627
leaq -320(%rbp,%r9,2),%rbp
2628
jmp .Lpwrx_sp_done
2629
2630
.align 32
2631
.Lpwrx_sp_alt:
2632
leaq 4096-320(,%r9,2),%r10
2633
leaq -320(%rbp,%r9,2),%rbp
2634
subq %r10,%r11
2635
movq $0,%r10
2636
cmovcq %r10,%r11
2637
subq %r11,%rbp
2638
.Lpwrx_sp_done:
2639
andq $-64,%rbp
2640
movq %rsp,%r11
2641
subq %rbp,%r11
2642
andq $-4096,%r11
2643
leaq (%r11,%rbp,1),%rsp
2644
movq (%rsp),%r10
2645
cmpq %rbp,%rsp
2646
ja .Lpwrx_page_walk
2647
jmp .Lpwrx_page_walk_done
2648
2649
.Lpwrx_page_walk:
2650
leaq -4096(%rsp),%rsp
2651
movq (%rsp),%r10
2652
cmpq %rbp,%rsp
2653
ja .Lpwrx_page_walk
2654
.Lpwrx_page_walk_done:
2655
2656
movq %r9,%r10
2657
negq %r9
pxor %xmm0,%xmm0
2671
.byte 102,72,15,110,207
2672
.byte 102,72,15,110,209
2673
.byte 102,73,15,110,218
2674
.byte 102,72,15,110,226
2675
movq %r8,32(%rsp)
2676
movq %rax,40(%rsp)
2677
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2678
.Lpowerx5_body:
2679
2680
call __bn_sqrx8x_internal
2681
call __bn_postx4x_internal
2682
call __bn_sqrx8x_internal
2683
call __bn_postx4x_internal
2684
call __bn_sqrx8x_internal
2685
call __bn_postx4x_internal
2686
call __bn_sqrx8x_internal
2687
call __bn_postx4x_internal
2688
call __bn_sqrx8x_internal
2689
call __bn_postx4x_internal
2690
2691
movq %r10,%r9
2692
movq %rsi,%rdi
2693
.byte 102,72,15,126,209
2694
.byte 102,72,15,126,226
2695
movq 40(%rsp),%rax
2696
2697
call mulx4x_internal
2698
2699
movq 40(%rsp),%rsi
2700
.cfi_def_cfa %rsi,8
2701
movq $1,%rax
2702
2703
movq -48(%rsi),%r15
2704
.cfi_restore %r15
2705
movq -40(%rsi),%r14
2706
.cfi_restore %r14
2707
movq -32(%rsi),%r13
2708
.cfi_restore %r13
2709
movq -24(%rsi),%r12
2710
.cfi_restore %r12
2711
movq -16(%rsi),%rbp
2712
.cfi_restore %rbp
2713
movq -8(%rsi),%rbx
2714
.cfi_restore %rbx
2715
leaq (%rsi),%rsp
2716
.cfi_def_cfa_register %rsp
2717
.Lpowerx5_epilogue:
2718
.byte 0xf3,0xc3
2719
.cfi_endproc
2720
.size bn_powerx5,.-bn_powerx5
2721
2722
.globl bn_sqrx8x_internal
2723
.hidden bn_sqrx8x_internal
2724
.type bn_sqrx8x_internal,@function
2725
.align 32
2726
bn_sqrx8x_internal:
2727
__bn_sqrx8x_internal:
2728
.cfi_startproc
leaq 48+8(%rsp),%rdi
2770
leaq (%rsi,%r9,1),%rbp
2771
movq %r9,0+8(%rsp)
2772
movq %rbp,8+8(%rsp)
2773
jmp .Lsqr8x_zero_start
2774
2775
.align 32
2776
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2777
.Lsqrx8x_zero:
2778
.byte 0x3e
2779
movdqa %xmm0,0(%rdi)
2780
movdqa %xmm0,16(%rdi)
2781
movdqa %xmm0,32(%rdi)
2782
movdqa %xmm0,48(%rdi)
2783
.Lsqr8x_zero_start:
2784
movdqa %xmm0,64(%rdi)
2785
movdqa %xmm0,80(%rdi)
2786
movdqa %xmm0,96(%rdi)
2787
movdqa %xmm0,112(%rdi)
2788
leaq 128(%rdi),%rdi
2789
subq $64,%r9
2790
jnz .Lsqrx8x_zero
2791
2792
movq 0(%rsi),%rdx
2793
2794
xorq %r10,%r10
2795
xorq %r11,%r11
2796
xorq %r12,%r12
2797
xorq %r13,%r13
2798
xorq %r14,%r14
2799
xorq %r15,%r15
2800
leaq 48+8(%rsp),%rdi
2801
xorq %rbp,%rbp
2802
jmp .Lsqrx8x_outer_loop
2803
2804
.align 32
2805
.Lsqrx8x_outer_loop:
2806
mulxq 8(%rsi),%r8,%rax
2807
adcxq %r9,%r8
2808
adoxq %rax,%r10
2809
mulxq 16(%rsi),%r9,%rax
2810
adcxq %r10,%r9
2811
adoxq %rax,%r11
2812
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2813
adcxq %r11,%r10
2814
adoxq %rax,%r12
2815
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2816
adcxq %r12,%r11
2817
adoxq %rax,%r13
2818
mulxq 40(%rsi),%r12,%rax
2819
adcxq %r13,%r12
2820
adoxq %rax,%r14
2821
mulxq 48(%rsi),%r13,%rax
2822
adcxq %r14,%r13
2823
adoxq %r15,%rax
2824
mulxq 56(%rsi),%r14,%r15
2825
movq 8(%rsi),%rdx
2826
adcxq %rax,%r14
2827
adoxq %rbp,%r15
2828
adcq 64(%rdi),%r15
2829
movq %r8,8(%rdi)
2830
movq %r9,16(%rdi)
2831
sbbq %rcx,%rcx
2832
xorq %rbp,%rbp
2833
2834
2835
mulxq 16(%rsi),%r8,%rbx
2836
mulxq 24(%rsi),%r9,%rax
2837
adcxq %r10,%r8
2838
adoxq %rbx,%r9
2839
mulxq 32(%rsi),%r10,%rbx
2840
adcxq %r11,%r9
2841
adoxq %rax,%r10
2842
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2843
adcxq %r12,%r10
2844
adoxq %rbx,%r11
2845
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2846
adcxq %r13,%r11
2847
adoxq %r14,%r12
2848
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2849
movq 16(%rsi),%rdx
2850
adcxq %rax,%r12
2851
adoxq %rbx,%r13
2852
adcxq %r15,%r13
2853
adoxq %rbp,%r14
2854
adcxq %rbp,%r14
2855
2856
movq %r8,24(%rdi)
2857
movq %r9,32(%rdi)
2858
2859
mulxq 24(%rsi),%r8,%rbx
2860
mulxq 32(%rsi),%r9,%rax
2861
adcxq %r10,%r8
2862
adoxq %rbx,%r9
2863
mulxq 40(%rsi),%r10,%rbx
2864
adcxq %r11,%r9
2865
adoxq %rax,%r10
2866
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2867
adcxq %r12,%r10
2868
adoxq %r13,%r11
2869
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2870
.byte 0x3e
2871
movq 24(%rsi),%rdx
2872
adcxq %rbx,%r11
2873
adoxq %rax,%r12
2874
adcxq %r14,%r12
2875
movq %r8,40(%rdi)
2876
movq %r9,48(%rdi)
2877
mulxq 32(%rsi),%r8,%rax
2878
adoxq %rbp,%r13
2879
adcxq %rbp,%r13
2880
2881
mulxq 40(%rsi),%r9,%rbx
2882
adcxq %r10,%r8
2883
adoxq %rax,%r9
2884
mulxq 48(%rsi),%r10,%rax
2885
adcxq %r11,%r9
2886
adoxq %r12,%r10
2887
mulxq 56(%rsi),%r11,%r12
2888
movq 32(%rsi),%rdx
2889
movq 40(%rsi),%r14
2890
adcxq %rbx,%r10
2891
adoxq %rax,%r11
2892
movq 48(%rsi),%r15
2893
adcxq %r13,%r11
2894
adoxq %rbp,%r12
2895
adcxq %rbp,%r12
2896
2897
movq %r8,56(%rdi)
2898
movq %r9,64(%rdi)
2899
2900
mulxq %r14,%r9,%rax
2901
movq 56(%rsi),%r8
2902
adcxq %r10,%r9
2903
mulxq %r15,%r10,%rbx
2904
adoxq %rax,%r10
2905
adcxq %r11,%r10
2906
mulxq %r8,%r11,%rax
2907
movq %r14,%rdx
2908
adoxq %rbx,%r11
2909
adcxq %r12,%r11
2910
2911
adcxq %rbp,%rax
2912
2913
mulxq %r15,%r14,%rbx
2914
mulxq %r8,%r12,%r13
2915
movq %r15,%rdx
2916
leaq 64(%rsi),%rsi
2917
adcxq %r14,%r11
2918
adoxq %rbx,%r12
2919
adcxq %rax,%r12
2920
adoxq %rbp,%r13
2921
2922
.byte 0x67,0x67
2923
mulxq %r8,%r8,%r14
2924
adcxq %r8,%r13
2925
adcxq %rbp,%r14
2926
2927
cmpq 8+8(%rsp),%rsi
2928
je .Lsqrx8x_outer_break
2929
2930
negq %rcx
2931
movq $-8,%rcx
2932
movq %rbp,%r15
2933
movq 64(%rdi),%r8
2934
adcxq 72(%rdi),%r9
2935
adcxq 80(%rdi),%r10
2936
adcxq 88(%rdi),%r11
2937
adcq 96(%rdi),%r12
2938
adcq 104(%rdi),%r13
2939
adcq 112(%rdi),%r14
2940
adcq 120(%rdi),%r15
2941
leaq (%rsi),%rbp
2942
leaq 128(%rdi),%rdi
2943
sbbq %rax,%rax
2944
2945
movq -64(%rsi),%rdx
2946
movq %rax,16+8(%rsp)
2947
movq %rdi,24+8(%rsp)
2948
2949
2950
xorl %eax,%eax
2951
jmp .Lsqrx8x_loop
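
# Inner multiply-accumulate: each iteration multiplies one limb from
# %rsi by the eight-limb block at %rbp, folds the products into the
# running window in %r8-%r15 and retires one word of t[] per trip.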
.align 32
.Lsqrx8x_loop:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8

mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9

mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10

mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
adcxq %rax,%r11
adoxq %r13,%r12

mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13

mulxq 48(%rbp),%rax,%r14
movq %rbx,(%rdi,%rcx,8)
movl $0,%ebx
adcxq %rax,%r13
adoxq %r15,%r14

.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
movq 8(%rsi,%rcx,8),%rdx
adcxq %rax,%r14
adoxq %rbx,%r15
adcxq %rbx,%r15

.byte 0x67
incq %rcx
jnz .Lsqrx8x_loop

leaq 64(%rbp),%rbp
movq $-8,%rcx
cmpq 8+8(%rsp),%rbp
je .Lsqrx8x_break

subq 16+8(%rsp),%rbx
.byte 0x66
movq -64(%rsi),%rdx
adcxq 0(%rdi),%r8
adcxq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
.byte 0x67
sbbq %rax,%rax
xorl %ebx,%ebx
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_loop

.align 32
.Lsqrx8x_break:
xorq %rbp,%rbp
subq 16+8(%rsp),%rbx
adcxq %rbp,%r8
movq 24+8(%rsp),%rcx
adcxq %rbp,%r9
movq 0(%rsi),%rdx
adcq $0,%r10
movq %r8,0(%rdi)
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
cmpq %rcx,%rdi
je .Lsqrx8x_outer_loop

movq %r9,8(%rdi)
movq 8(%rcx),%r9
movq %r10,16(%rdi)
movq 16(%rcx),%r10
movq %r11,24(%rdi)
movq 24(%rcx),%r11
movq %r12,32(%rdi)
movq 32(%rcx),%r12
movq %r13,40(%rdi)
movq 40(%rcx),%r13
movq %r14,48(%rdi)
movq 48(%rcx),%r14
movq %r15,56(%rdi)
movq 56(%rcx),%r15
movq %rcx,%rdi
jmp .Lsqrx8x_outer_loop

.align 32
.Lsqrx8x_outer_break:
movq %r9,72(%rdi)
.byte 102,72,15,126,217
movq %r10,80(%rdi)
movq %r11,88(%rdi)
movq %r12,96(%rdi)
movq %r13,104(%rdi)
movq %r14,112(%rdi)
leaq 48+8(%rsp),%rdi
movq (%rsi,%rcx,1),%rdx

movq 8(%rdi),%r11
xorq %r10,%r10
movq 0+8(%rsp),%r9
adoxq %r11,%r11
movq 16(%rdi),%r12
movq 24(%rdi),%r13
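
# Shift-and-add: double the accumulated cross products (each
# adox reg,reg adds a word to itself through the OF chain) and add in
# the squared limbs from mulx %rdx (a[i]^2), four input limbs
# (eight t[] words) per iteration.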
.align 32
.Lsqrx4x_shift_n_add:
mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
adoxq %r13,%r13
adcxq %r11,%rbx
movq 40(%rdi),%r11
movq %rax,0(%rdi)
movq %rbx,8(%rdi)

mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
movq 16(%rsi,%rcx,1),%rdx
movq 48(%rdi),%r12
adoxq %r11,%r11
adcxq %r13,%rbx
movq 56(%rdi),%r13
movq %rax,16(%rdi)
movq %rbx,24(%rdi)

mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
movq 24(%rsi,%rcx,1),%rdx
leaq 32(%rcx),%rcx
movq 64(%rdi),%r10
adoxq %r13,%r13
adcxq %r11,%rbx
movq 72(%rdi),%r11
movq %rax,32(%rdi)
movq %rbx,40(%rdi)

mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
jrcxz .Lsqrx4x_shift_n_add_break
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
adoxq %r11,%r11
adcxq %r13,%rbx
movq 80(%rdi),%r12
movq 88(%rdi),%r13
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
nop
jmp .Lsqrx4x_shift_n_add

.align 32
.Lsqrx4x_shift_n_add_break:
adcxq %r13,%rbx
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
.byte 102,72,15,126,213
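
# Montgomery reduction of the double-width square in t[].  n0 is kept
# at 32+8(%rsp); the .byte 102,72,15,126,213 above decodes as
# movq %xmm2,%rbp, recovering the modulus pointer stashed in an xmm
# register by the caller.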
__bn_sqrx8x_reduction:
xorl %eax,%eax
movq 32+8(%rsp),%rbx
movq 48+8(%rsp),%rdx
leaq -64(%rbp,%r9,1),%rcx

movq %rcx,0+8(%rsp)
movq %rdi,8+8(%rsp)

leaq 48+8(%rsp),%rdi
jmp .Lsqrx8x_reduction_loop

.align 32
.Lsqrx8x_reduction_loop:
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq %rdx,%r8
imulq %rbx,%rdx
movq 40(%rdi),%r13
movq 48(%rdi),%r14
movq 56(%rdi),%r15
movq %rax,24+8(%rsp)

leaq 64(%rdi),%rdi
xorq %rsi,%rsi
movq $-8,%rcx
jmp .Lsqrx8x_reduce
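
# One reduction word per iteration: the current m = t[i]*n0 (mod 2^64)
# is saved at 64+48+8(%rsp,%rcx,8) for the tail pass below, while
# m times the first eight modulus limbs is folded into the window.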
.align 32
.Lsqrx8x_reduce:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax
adoxq %r9,%r8

mulxq 8(%rbp),%rbx,%r9
adcxq %rbx,%r8
adoxq %r10,%r9

mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10

mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11

.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
movq %rdx,%rax
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12

mulxq 32+8(%rsp),%rbx,%rdx
movq %rax,%rdx
movq %rax,64+48+8(%rsp,%rcx,8)

mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13

mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14

mulxq 56(%rbp),%rax,%r15
movq %rbx,%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
adcxq %rsi,%r15

.byte 0x67,0x67,0x67
incq %rcx
jnz .Lsqrx8x_reduce

movq %rsi,%rax
cmpq 0+8(%rsp),%rbp
jae .Lsqrx8x_no_tail

movq 48+8(%rsp),%rdx
addq 0(%rdi),%r8
leaq 64(%rbp),%rbp
movq $-8,%rcx
adcxq 8(%rdi),%r9
adcxq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
sbbq %rax,%rax

xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail
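
# Tail pass: replay the saved m values (reloaded from
# 72+48+8(%rsp,%rcx,8)) against the remaining 64-byte blocks of the
# modulus, propagating the reduction across the upper half of t[].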
.align 32
.Lsqrx8x_tail:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8

mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9

mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10

mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
adcxq %rax,%r11
adoxq %r13,%r12

mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13

mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14

mulxq 56(%rbp),%rax,%r15
movq 72+48+8(%rsp,%rcx,8),%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
movq %rbx,(%rdi,%rcx,8)
movq %r8,%rbx
adcxq %rsi,%r15

incq %rcx
jnz .Lsqrx8x_tail

cmpq 0+8(%rsp),%rbp
jae .Lsqrx8x_tail_done

subq 16+8(%rsp),%rsi
movq 48+8(%rsp),%rdx
leaq 64(%rbp),%rbp
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
sbbq %rax,%rax
subq $8,%rcx

xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail

.align 32
.Lsqrx8x_tail_done:
xorq %rax,%rax
addq 24+8(%rsp),%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
adcq $0,%rax

subq 16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
adcq 0(%rdi),%r8
.byte 102,72,15,126,217
adcq 8(%rdi),%r9
movq 56(%rbp),%rsi
.byte 102,72,15,126,213
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax

movq 32+8(%rsp),%rbx
movq 64(%rdi,%rcx,1),%rdx

movq %r8,0(%rdi)
leaq 64(%rdi),%r8
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)

leaq 64(%rdi,%rcx,1),%rdi
cmpq 8+8(%rsp),%r8
jb .Lsqrx8x_reduction_loop
.byte 0xf3,0xc3
.cfi_endproc
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
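
# __bn_postx4x_internal: constant-time final subtraction.  The andn
# instructions mask the modulus words with the borrow mask in %rax, so
# t - n is computed exactly when required without a secret-dependent
# branch (the .byte runs below decode as movq %xmm1,%rdx and
# movq %xmm1,%rsi, fetching the result pointer).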
3342
.align 32
3343
__bn_postx4x_internal:
3344
.cfi_startproc
3345
movq 0(%rbp),%r12
3346
movq %rcx,%r10
3347
movq %rcx,%r9
3348
negq %rax
3349
sarq $3+2,%rcx
3350
3351
.byte 102,72,15,126,202
3352
.byte 102,72,15,126,206
3353
decq %r12
3354
movq 8(%rbp),%r13
3355
xorq %r8,%r8
3356
movq 16(%rbp),%r14
3357
movq 24(%rbp),%r15
3358
jmp .Lsqrx4x_sub_entry
3359
3360
.align 16
3361
.Lsqrx4x_sub:
3362
movq 0(%rbp),%r12
3363
movq 8(%rbp),%r13
3364
movq 16(%rbp),%r14
3365
movq 24(%rbp),%r15
3366
.Lsqrx4x_sub_entry:
3367
andnq %rax,%r12,%r12
3368
leaq 32(%rbp),%rbp
3369
andnq %rax,%r13,%r13
3370
andnq %rax,%r14,%r14
3371
andnq %rax,%r15,%r15
3372
3373
negq %r8
3374
adcq 0(%rdi),%r12
3375
adcq 8(%rdi),%r13
3376
adcq 16(%rdi),%r14
3377
adcq 24(%rdi),%r15
3378
movq %r12,0(%rdx)
3379
leaq 32(%rdi),%rdi
3380
movq %r13,8(%rdx)
3381
sbbq %r8,%r8
3382
movq %r14,16(%rdx)
3383
movq %r15,24(%rdx)
3384
leaq 32(%rdx),%rdx
3385
3386
incq %rcx
3387
jnz .Lsqrx4x_sub
3388
3389
negq %r9
3390
3391
.byte 0xf3,0xc3
3392
.cfi_endproc
3393
.size __bn_postx4x_internal,.-__bn_postx4x_internal
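
# bn_get_bits5(ap, off): return the 5-bit window starting at bit |off|
# of |ap|.  The cmov pair shifts the 16-bit load window one byte up
# whenever the requested bits would otherwise straddle it, so a single
# movzwl always covers the window.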
3394
.globl bn_get_bits5
3395
.type bn_get_bits5,@function
3396
.align 16
3397
bn_get_bits5:
3398
.cfi_startproc
3399
leaq 0(%rdi),%r10
3400
leaq 1(%rdi),%r11
3401
movl %esi,%ecx
3402
shrl $4,%esi
3403
andl $15,%ecx
3404
leal -8(%rcx),%eax
3405
cmpl $11,%ecx
3406
cmovaq %r11,%r10
3407
cmoval %eax,%ecx
3408
movzwl (%r10,%rsi,2),%eax
3409
shrl %cl,%eax
3410
andl $31,%eax
3411
.byte 0xf3,0xc3
3412
.cfi_endproc
3413
.size bn_get_bits5,.-bn_get_bits5
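
# bn_scatter5(inp, num, tbl, idx): write the num-word input into
# column |idx| of the power table, one word every 256 bytes, so the 32
# columns of each row are interleaved across cache lines.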
3414
3415
.globl bn_scatter5
3416
.type bn_scatter5,@function
3417
.align 16
3418
bn_scatter5:
3419
.cfi_startproc
3420
cmpl $0,%esi
3421
jz .Lscatter_epilogue
3422
leaq (%rdx,%rcx,8),%rdx
3423
.Lscatter:
3424
movq (%rdi),%rax
3425
leaq 8(%rdi),%rdi
3426
movq %rax,(%rdx)
3427
leaq 256(%rdx),%rdx
3428
subl $1,%esi
3429
jnz .Lscatter
3430
.Lscatter_epilogue:
3431
.byte 0xf3,0xc3
3432
.cfi_endproc
3433
.size bn_scatter5,.-bn_scatter5
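
# bn_gather5(out, num, tbl, idx): constant-time table lookup.  pcmpeqd
# against counters seeded from .Linc expands |idx| into sixteen
# 128-bit select masks on the stack; each .Lgather pass then reads an
# entire 256-byte row and reduces it with pand/por, so the memory
# access pattern never depends on the secret index.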
3434
3435
.globl bn_gather5
3436
.type bn_gather5,@function
3437
.align 32
3438
bn_gather5:
3439
.LSEH_begin_bn_gather5:
3440
.cfi_startproc
3441
3442
.byte 0x4c,0x8d,0x14,0x24
3443
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
3444
leaq .Linc(%rip),%rax
3445
andq $-16,%rsp
3446
3447
movd %ecx,%xmm5
3448
movdqa 0(%rax),%xmm0
3449
movdqa 16(%rax),%xmm1
3450
leaq 128(%rdx),%r11
3451
leaq 128(%rsp),%rax
3452
3453
pshufd $0,%xmm5,%xmm5
3454
movdqa %xmm1,%xmm4
3455
movdqa %xmm1,%xmm2
3456
paddd %xmm0,%xmm1
3457
pcmpeqd %xmm5,%xmm0
3458
movdqa %xmm4,%xmm3
3459
3460
paddd %xmm1,%xmm2
3461
pcmpeqd %xmm5,%xmm1
3462
movdqa %xmm0,-128(%rax)
3463
movdqa %xmm4,%xmm0
3464
3465
paddd %xmm2,%xmm3
3466
pcmpeqd %xmm5,%xmm2
3467
movdqa %xmm1,-112(%rax)
3468
movdqa %xmm4,%xmm1
3469
3470
paddd %xmm3,%xmm0
3471
pcmpeqd %xmm5,%xmm3
3472
movdqa %xmm2,-96(%rax)
3473
movdqa %xmm4,%xmm2
3474
paddd %xmm0,%xmm1
3475
pcmpeqd %xmm5,%xmm0
3476
movdqa %xmm3,-80(%rax)
3477
movdqa %xmm4,%xmm3
3478
3479
paddd %xmm1,%xmm2
3480
pcmpeqd %xmm5,%xmm1
3481
movdqa %xmm0,-64(%rax)
3482
movdqa %xmm4,%xmm0
3483
3484
paddd %xmm2,%xmm3
3485
pcmpeqd %xmm5,%xmm2
3486
movdqa %xmm1,-48(%rax)
3487
movdqa %xmm4,%xmm1
3488
3489
paddd %xmm3,%xmm0
3490
pcmpeqd %xmm5,%xmm3
3491
movdqa %xmm2,-32(%rax)
3492
movdqa %xmm4,%xmm2
3493
paddd %xmm0,%xmm1
3494
pcmpeqd %xmm5,%xmm0
3495
movdqa %xmm3,-16(%rax)
3496
movdqa %xmm4,%xmm3
3497
3498
paddd %xmm1,%xmm2
3499
pcmpeqd %xmm5,%xmm1
3500
movdqa %xmm0,0(%rax)
3501
movdqa %xmm4,%xmm0
3502
3503
paddd %xmm2,%xmm3
3504
pcmpeqd %xmm5,%xmm2
3505
movdqa %xmm1,16(%rax)
3506
movdqa %xmm4,%xmm1
3507
3508
paddd %xmm3,%xmm0
3509
pcmpeqd %xmm5,%xmm3
3510
movdqa %xmm2,32(%rax)
3511
movdqa %xmm4,%xmm2
3512
paddd %xmm0,%xmm1
3513
pcmpeqd %xmm5,%xmm0
3514
movdqa %xmm3,48(%rax)
3515
movdqa %xmm4,%xmm3
3516
3517
paddd %xmm1,%xmm2
3518
pcmpeqd %xmm5,%xmm1
3519
movdqa %xmm0,64(%rax)
3520
movdqa %xmm4,%xmm0
3521
3522
paddd %xmm2,%xmm3
3523
pcmpeqd %xmm5,%xmm2
3524
movdqa %xmm1,80(%rax)
3525
movdqa %xmm4,%xmm1
3526
3527
paddd %xmm3,%xmm0
3528
pcmpeqd %xmm5,%xmm3
3529
movdqa %xmm2,96(%rax)
3530
movdqa %xmm4,%xmm2
3531
movdqa %xmm3,112(%rax)
3532
jmp .Lgather
3533
3534
.align 32
3535
.Lgather:
3536
pxor %xmm4,%xmm4
3537
pxor %xmm5,%xmm5
3538
movdqa -128(%r11),%xmm0
3539
movdqa -112(%r11),%xmm1
3540
movdqa -96(%r11),%xmm2
3541
pand -128(%rax),%xmm0
3542
movdqa -80(%r11),%xmm3
3543
pand -112(%rax),%xmm1
3544
por %xmm0,%xmm4
3545
pand -96(%rax),%xmm2
3546
por %xmm1,%xmm5
3547
pand -80(%rax),%xmm3
3548
por %xmm2,%xmm4
3549
por %xmm3,%xmm5
3550
movdqa -64(%r11),%xmm0
3551
movdqa -48(%r11),%xmm1
3552
movdqa -32(%r11),%xmm2
3553
pand -64(%rax),%xmm0
3554
movdqa -16(%r11),%xmm3
3555
pand -48(%rax),%xmm1
3556
por %xmm0,%xmm4
3557
pand -32(%rax),%xmm2
3558
por %xmm1,%xmm5
3559
pand -16(%rax),%xmm3
3560
por %xmm2,%xmm4
3561
por %xmm3,%xmm5
3562
movdqa 0(%r11),%xmm0
3563
movdqa 16(%r11),%xmm1
3564
movdqa 32(%r11),%xmm2
3565
pand 0(%rax),%xmm0
3566
movdqa 48(%r11),%xmm3
3567
pand 16(%rax),%xmm1
3568
por %xmm0,%xmm4
3569
pand 32(%rax),%xmm2
3570
por %xmm1,%xmm5
3571
pand 48(%rax),%xmm3
3572
por %xmm2,%xmm4
3573
por %xmm3,%xmm5
3574
movdqa 64(%r11),%xmm0
3575
movdqa 80(%r11),%xmm1
3576
movdqa 96(%r11),%xmm2
3577
pand 64(%rax),%xmm0
3578
movdqa 112(%r11),%xmm3
3579
pand 80(%rax),%xmm1
3580
por %xmm0,%xmm4
3581
pand 96(%rax),%xmm2
3582
por %xmm1,%xmm5
3583
pand 112(%rax),%xmm3
3584
por %xmm2,%xmm4
3585
por %xmm3,%xmm5
3586
por %xmm5,%xmm4
3587
leaq 256(%r11),%r11
3588
pshufd $0x4e,%xmm4,%xmm0
3589
por %xmm4,%xmm0
3590
movq %xmm0,(%rdi)
3591
leaq 8(%rdi),%rdi
3592
subl $1,%esi
3593
jnz .Lgather
3594
3595
leaq (%r10),%rsp
3596
.byte 0xf3,0xc3
3597
.LSEH_end_bn_gather5:
3598
.cfi_endproc
3599
.size bn_gather5,.-bn_gather5
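
# .Linc below seeds bn_gather5's counters: {0,0,1,1} are the first two
# column indices (each duplicated across a 64-bit lane) and {2,2,2,2}
# is the per-mask increment.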
.section .rodata
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.previous
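
# GNU property note: type 0xc0000002 is GNU_PROPERTY_X86_FEATURE_1_AND
# and the value 3 marks the object as compatible with both IBT and
# SHSTK (Intel CET).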
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4: