/* FreeBSD source tree: sys/crypto/openssl/amd64/rsaz-2k-avx512.S
 * (GitHub repository freebsd/freebsd-src, blob/main) */
/* Do not modify. This file is auto-generated from rsaz-2k-avx512.pl. */

# int ossl_rsaz_avx512ifma_eligible(void)
#
# Returns non-zero iff the CPU advertises every feature needed by the
# AVX-512 IFMA code paths below, 0 otherwise.  Reads the third dword of
# OPENSSL_ia32cap_P (CPUID leaf 7 EBX) and requires mask 0x80230000:
# bit16 AVX512F | bit17 AVX512DQ | bit21 AVX512IFMA | bit31 AVX512VL.
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	xorl	%eax,%eax			# default return: 0 (not eligible)
	andl	$2149777408,%ecx		# 0x80230000: required feature bits
	cmpl	$2149777408,%ecx
	cmovel	%ecx,%eax			# all bits set -> return non-zero mask
	.byte	0xf3,0xc3			# rep ret
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text

# void ossl_rsaz_amm52x20_x1_ifma256(uint64_t res[20], const uint64_t a[20],
#                                    const uint64_t b[20], const uint64_t m[20],
#                                    uint64_t k0)
#
# Almost Montgomery multiplication of 20-digit numbers in radix 2^52
# (2048-bit operands) using AVX-512 IFMA on 256-bit vectors.
# SysV AMD64 ABI: rdi=res, rsi=a, rdx=b, rcx=m, r8=k0 (-m^-1 mod 2^64).
#
# Register roles inside the loop:
#   ymm3,ymm16-ymm19  accumulator, 20 digits (4 qwords per register)
#   r9                scalar accumulator carrying digit 0 overflow
#   r11               cursor into b (rdx is clobbered by mulx)
#   rax               52-bit mask 0xfffffffffffff
#   ymm1/ymm2         broadcast of b[i] / of the reduction multiplier y
.globl	ossl_rsaz_amm52x20_x1_ifma256
.type	ossl_rsaz_amm52x20_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x1_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x1_ifma256_body:

	# Zero the accumulators.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

	xorl	%r9d,%r9d		# scalar accumulator for digit 0

	movq	%rdx,%r11		# b
	movq	$0xfffffffffffff,%rax	# 52-bit mask

	movl	$5,%ebx			# 5 iterations x 4 unrolled digits = 20

.align 32
.Lloop5:
	# ---- process b[4*j+0] ----
	movq	0(%r11),%r13		# b[i]

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12		# a[0]*b[i] -> r12:r13
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13		# y = (acc0 * k0) mod 2^52
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12		# m[0]*y -> r12:r13
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9			# carry acc0 into next digit
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	# Shift the accumulator right by one digit.
	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9		# fold shifted-out digit into acc0

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- process b[4*j+1] ----
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- process b[4*j+2] ----
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- process b[4*j+3] ----
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	leaq	32(%r11),%r11		# advance b by 4 digits
	decl	%ebx
	jne	.Lloop5

	# Fold the scalar digit-0 accumulator into the low qword of ymm3.
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	# Carry propagation: extract bits 52+ of every digit...
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm16,%ymm1
	vpsrlq	$52,%ymm17,%ymm2
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26

	# ...shift the carries up by one digit...
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	# ...truncate digits to 52 bits and add the carries in.
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm16,%ymm16
	vpaddq	%ymm2,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19

	# Some digits may now exceed 2^52-1 by one.  Build per-digit
	# "greater than mask" (predicate 6 = NLE) ...
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	# ...and "equal to mask" (predicate 0 = EQ) bitmaps in GPRs.
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	# Pack the 20 per-digit flags into 3 bytes (4 bits per kmov result).
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	# Ripple the carries through runs of all-ones digits:
	# (gt<<1) + eq, then xor with eq, yields the set of digits that
	# must absorb a carry (classic carry-select addition on bitmaps).
	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	# Subtract 2^52-1 from exactly the flagged digits, then re-mask.
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k5}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# Store the 20-digit result.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
.section .rodata
.align 32
# Four copies of 2^52-1: per-lane mask selecting the low 52 bits of a qword.
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text
# void ossl_rsaz_amm52x20_x2_ifma256(uint64_t out[40], const uint64_t a[40],
#                                    const uint64_t b[40], const uint64_t m[40],
#                                    const uint64_t k0[2])
#
# Two independent 20x52-bit almost Montgomery multiplications computed in
# parallel: out[0..19] = AMM(a[0..19], b[0..19], m[0..19], k0[0]) and
# out[20..39] = AMM(a[20..39], b[20..39], m[20..39], k0[1]).
# Same algorithm as ossl_rsaz_amm52x20_x1_ifma256, but one b-digit per loop
# iteration (20 iterations) with the two halves interleaved:
#   lower half: ymm3,ymm16-ymm19 accumulator, r9 scalar acc, k0[0]
#   upper half: ymm4,ymm20-ymm23 accumulator, r15 scalar acc, k0[1]
.globl	ossl_rsaz_amm52x20_x2_ifma256
.type	ossl_rsaz_amm52x20_x2_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x2_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x2_ifma256_body:

	# Zero both accumulators.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	vmovdqa64	%ymm0,%ymm23

	xorl	%r9d,%r9d		# scalar acc, lower half
	xorl	%r15d,%r15d		# scalar acc, upper half

	movq	%rdx,%r11		# b
	movq	$0xfffffffffffff,%rax	# 52-bit mask

	movl	$20,%ebx		# one b digit per iteration

.align 32
.Lloop20:
	# ---- lower half: digit b[i] ----
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12		# a[0]*b[i]
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13		# y = acc0 * k0[0] mod 2^52
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12		# m[0]*y
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- upper half: digit b[20+i] ----
	movq	160(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13		# y = acc0 * k0[1] mod 2^52
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	160(%rsi),%ymm1,%ymm4
	vpmadd52luq	192(%rsi),%ymm1,%ymm20
	vpmadd52luq	224(%rsi),%ymm1,%ymm21
	vpmadd52luq	256(%rsi),%ymm1,%ymm22
	vpmadd52luq	288(%rsi),%ymm1,%ymm23

	vpmadd52luq	160(%rcx),%ymm2,%ymm4
	vpmadd52luq	192(%rcx),%ymm2,%ymm20
	vpmadd52luq	224(%rcx),%ymm2,%ymm21
	vpmadd52luq	256(%rcx),%ymm2,%ymm22
	vpmadd52luq	288(%rcx),%ymm2,%ymm23

	valignq	$1,%ymm4,%ymm20,%ymm4
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm23,%ymm22
	valignq	$1,%ymm23,%ymm0,%ymm23

	vmovq	%xmm4,%r13
	addq	%r13,%r15

	vpmadd52huq	160(%rsi),%ymm1,%ymm4
	vpmadd52huq	192(%rsi),%ymm1,%ymm20
	vpmadd52huq	224(%rsi),%ymm1,%ymm21
	vpmadd52huq	256(%rsi),%ymm1,%ymm22
	vpmadd52huq	288(%rsi),%ymm1,%ymm23

	vpmadd52huq	160(%rcx),%ymm2,%ymm4
	vpmadd52huq	192(%rcx),%ymm2,%ymm20
	vpmadd52huq	224(%rcx),%ymm2,%ymm21
	vpmadd52huq	256(%rcx),%ymm2,%ymm22
	vpmadd52huq	288(%rcx),%ymm2,%ymm23
	leaq	8(%r11),%r11		# next b digit
	decl	%ebx
	jne	.Lloop20

	# Normalize the lower half (same carry-select scheme as the x1 kernel).
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm16,%ymm1
	vpsrlq	$52,%ymm17,%ymm2
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26

	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm16,%ymm16
	vpaddq	%ymm2,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k5}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# Normalize the upper half.
	vpbroadcastq	%r15,%ymm0
	vpblendd	$3,%ymm0,%ymm4,%ymm4

	vpsrlq	$52,%ymm4,%ymm0
	vpsrlq	$52,%ymm20,%ymm1
	vpsrlq	$52,%ymm21,%ymm2
	vpsrlq	$52,%ymm22,%ymm25
	vpsrlq	$52,%ymm23,%ymm26

	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpandq	.Lmask52x4(%rip),%ymm23,%ymm23

	vpaddq	%ymm0,%ymm4,%ymm4
	vpaddq	%ymm1,%ymm20,%ymm20
	vpaddq	%ymm2,%ymm21,%ymm21
	vpaddq	%ymm25,%ymm22,%ymm22
	vpaddq	%ymm26,%ymm23,%ymm23

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm23,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm23,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm23,%ymm23{%k5}

	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpandq	.Lmask52x4(%rip),%ymm23,%ymm23

	# Store both 20-digit results.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm4,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
.text

# void ossl_extract_multiplier_2x20_win5(uint64_t *out, const uint64_t *red_table,
#                                        int red_table_idx1, int red_table_idx2)
#
# Constant-time gather of two 20-qword multipliers from a window-5 table of
# 32 entries x 320 bytes (leaq 10240(%rsi) = 32*320 end pointer).  Every
# entry is read and conditionally blended in (vpcmpq/vpblendmq), so the
# memory access pattern is independent of the secret indices.
# rdi=out, rsi=red_table, rdx=idx1 (selects qwords 0..19),
# rcx=idx2 (selects qwords 20..39).
# NOTE(review): no vzeroupper before ret here (matches the generator's
# output) — presumably acceptable for the callers in this module; confirm
# against rsaz-2k-avx512.pl before changing.
.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	vmovdqa64	.Lones(%rip),%ymm24	# per-lane counter increment
	vpbroadcastq	%rdx,%ymm22		# idx1 in every lane
	vpbroadcastq	%rcx,%ymm23		# idx2 in every lane
	leaq	10240(%rsi),%rax		# end of table

	# Zero the counter (ymm21) and the ten destination accumulators.
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

.align 32
.Lloop:
	vpcmpq	$0,%ymm21,%ymm22,%k1	# k1 = (counter == idx1)
	vpcmpq	$0,%ymm21,%ymm23,%k2	# k2 = (counter == idx2)
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k2}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k2}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k2}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k2}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k2}
	vpaddq	%ymm24,%ymm21,%ymm21	# counter++
	addq	$320,%rsi		# next table entry
	cmpq	%rsi,%rax
	jne	.Lloop
	# Write out the two selected 20-qword multipliers.
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
.section .rodata
.align 32
# Vector constants used above: all-ones counter increment and the zero
# vector referenced by the valignq carry shifts.
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0
# NT_GNU_PROPERTY_TYPE_0 note marking this object as compatible with
# Intel CET (IBT + SHSTK), matching the endbr64 at each entry point.
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f			# name size
.long 4f - 1f			# descriptor size
.long 5				# type: NT_GNU_PROPERTY_TYPE_0
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002		# GNU_PROPERTY_X86_FEATURE_1_AND
.long 3f - 2f
2:
.long 3				# IBT (bit 0) | SHSTK (bit 1)
3:
.p2align 3
4: