Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S
39483 views
1
/* Do not modify. This file is auto-generated from rsaz-3k-avxifma.pl. */
2
.text
3
4
/*
 * ossl_rsaz_amm52x30_x1_avxifma256 -- Almost Montgomery Multiplication
 * in a redundant 2^52 radix, 30 digits (~RSA-3K operand size), using
 * AVX-IFMA (VEX-encoded vpmadd52luq/vpmadd52huq on 256-bit vectors).
 *
 * ABI: SysV AMD64. Register roles as used below:
 *   rdi = res   (output: 30 digits written as 8 x 32-byte vectors)
 *   rsi = a     (multiplicand, 52-bit digits in 64-bit lanes)
 *   rdx = b     (multiplier; moved to r11 because mulx consumes rdx)
 *   rcx = m     (modulus, same digit layout)
 *   r8  = k0    (Montgomery factor -m^{-1} mod 2^52, passed by value)
 * Presumed C prototype (from the generating script's naming):
 *   void ossl_rsaz_amm52x30_x1_avxifma256(uint64_t *res, const uint64_t *a,
 *            const uint64_t *b, const uint64_t *m, uint64_t k0)
 * -- TODO confirm against rsaz-3k-avxifma.pl.
 *
 * Structure: 7 iterations of .Lloop7, each consuming 4 digits of b,
 * plus 2 unrolled tail digits = 30 Montgomery steps total, followed by
 * a branchless carry-propagation/normalization pass.
 *
 * Clobbers: rax, rbx(saved), r9-r15(saved as needed), ymm0-ymm15, flags.
 * NOTE: file is auto-generated -- do not hand-modify the instructions.
 */
.globl	ossl_rsaz_amm52x30_x1_avxifma256
.type	ossl_rsaz_amm52x30_x1_avxifma256,@function
.align	32
ossl_rsaz_amm52x30_x1_avxifma256:
.cfi_startproc
.byte	243,15,30,250		# endbr64 (CET/IBT landing pad)
# Save all callee-saved GPRs used below.
pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

# Zero the 8 vector accumulators ymm3..ymm10 (4 digits each = 32 digits,
# of which 30 are significant).
vpxor	%ymm0,%ymm0,%ymm0
vmovapd	%ymm0,%ymm3
vmovapd	%ymm0,%ymm4
vmovapd	%ymm0,%ymm5
vmovapd	%ymm0,%ymm6
vmovapd	%ymm0,%ymm7
vmovapd	%ymm0,%ymm8
vmovapd	%ymm0,%ymm9
vmovapd	%ymm0,%ymm10

# r9 = scalar accumulator for the low (digit 0) position.
xorl	%r9d,%r9d

movq	%rdx,%r11		# r11 = b (rdx is implicit mulx source)
movq	$0xfffffffffffff,%rax	# rax = 2^52 - 1, digit mask

movl	$7,%ebx			# 7 iterations x 4 digits of b

.align 32
.Lloop7:
# ---- Montgomery step for digit b[4*i+0] ----
movq	0(%r11),%r13		# r13 = b[j]

vpbroadcastq	0(%r11),%ymm1	# ymm1 = {b[j] x4}
movq	0(%rsi),%rdx
mulxq	%r13,%r13,%r12		# r12:r13 = a[0] * b[j]
addq	%r13,%r9		# fold low product into scalar acc
movq	%r12,%r10
adcq	$0,%r10			# r10 = high half + carry

# m' = k0 * acc mod 2^52 (the per-step Montgomery quotient digit).
movq	%r8,%r13
imulq	%r9,%r13
andq	%rax,%r13

vmovq	%r13,%xmm2
vpbroadcastq	%xmm2,%ymm2	# ymm2 = {m' x4}
movq	0(%rcx),%rdx
mulxq	%r13,%r13,%r12		# r12:r13 = m[0] * m'
addq	%r13,%r9		# acc += low; low 52 bits become zero
adcq	%r12,%r10

# acc >>= 52 (combine scalar high/low halves).
shrq	$52,%r9
salq	$12,%r10
orq	%r10,%r9

leaq	-264(%rsp),%rsp		# scratch: 8x32B accumulators + 1 qword

# Vector accumulate low 52-bit halves: acc += a * b[j] (lo) ...
{vex} vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq	224(%rsi),%ymm1,%ymm10

# ... and acc += m * m' (lo).
{vex} vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq	224(%rcx),%ymm2,%ymm10

# Spill the 32-digit accumulator plus a zero guard qword, then reload at
# byte offset 8: this shifts the whole accumulator down by one 64-bit
# lane (the per-step "divide by one digit" of Montgomery reduction).
vmovdqu	%ymm3,0(%rsp)
vmovdqu	%ymm4,32(%rsp)
vmovdqu	%ymm5,64(%rsp)
vmovdqu	%ymm6,96(%rsp)
vmovdqu	%ymm7,128(%rsp)
vmovdqu	%ymm8,160(%rsp)
vmovdqu	%ymm9,192(%rsp)
vmovdqu	%ymm10,224(%rsp)
movq	$0,256(%rsp)

vmovdqu	8(%rsp),%ymm3
vmovdqu	40(%rsp),%ymm4
vmovdqu	72(%rsp),%ymm5
vmovdqu	104(%rsp),%ymm6
vmovdqu	136(%rsp),%ymm7
vmovdqu	168(%rsp),%ymm8
vmovdqu	200(%rsp),%ymm9
vmovdqu	232(%rsp),%ymm10

addq	8(%rsp),%r9		# fold shifted-out digit 1 into scalar acc

# High 52-bit halves of the same products.
{vex} vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq	224(%rcx),%ymm2,%ymm10

leaq	264(%rsp),%rsp		# release scratch
# ---- digit b[4*i+1]: identical step, offset 8 into b ----
movq	8(%r11),%r13

vpbroadcastq	8(%r11),%ymm1
movq	0(%rsi),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
movq	%r12,%r10
adcq	$0,%r10

movq	%r8,%r13
imulq	%r9,%r13
andq	%rax,%r13

vmovq	%r13,%xmm2
vpbroadcastq	%xmm2,%ymm2
movq	0(%rcx),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
adcq	%r12,%r10

shrq	$52,%r9
salq	$12,%r10
orq	%r10,%r9

leaq	-264(%rsp),%rsp

{vex} vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq	224(%rcx),%ymm2,%ymm10

vmovdqu	%ymm3,0(%rsp)
vmovdqu	%ymm4,32(%rsp)
vmovdqu	%ymm5,64(%rsp)
vmovdqu	%ymm6,96(%rsp)
vmovdqu	%ymm7,128(%rsp)
vmovdqu	%ymm8,160(%rsp)
vmovdqu	%ymm9,192(%rsp)
vmovdqu	%ymm10,224(%rsp)
movq	$0,256(%rsp)

vmovdqu	8(%rsp),%ymm3
vmovdqu	40(%rsp),%ymm4
vmovdqu	72(%rsp),%ymm5
vmovdqu	104(%rsp),%ymm6
vmovdqu	136(%rsp),%ymm7
vmovdqu	168(%rsp),%ymm8
vmovdqu	200(%rsp),%ymm9
vmovdqu	232(%rsp),%ymm10

addq	8(%rsp),%r9

{vex} vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq	224(%rcx),%ymm2,%ymm10

leaq	264(%rsp),%rsp
# ---- digit b[4*i+2]: identical step, offset 16 into b ----
movq	16(%r11),%r13

vpbroadcastq	16(%r11),%ymm1
movq	0(%rsi),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
movq	%r12,%r10
adcq	$0,%r10

movq	%r8,%r13
imulq	%r9,%r13
andq	%rax,%r13

vmovq	%r13,%xmm2
vpbroadcastq	%xmm2,%ymm2
movq	0(%rcx),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
adcq	%r12,%r10

shrq	$52,%r9
salq	$12,%r10
orq	%r10,%r9

leaq	-264(%rsp),%rsp

{vex} vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq	224(%rcx),%ymm2,%ymm10

vmovdqu	%ymm3,0(%rsp)
vmovdqu	%ymm4,32(%rsp)
vmovdqu	%ymm5,64(%rsp)
vmovdqu	%ymm6,96(%rsp)
vmovdqu	%ymm7,128(%rsp)
vmovdqu	%ymm8,160(%rsp)
vmovdqu	%ymm9,192(%rsp)
vmovdqu	%ymm10,224(%rsp)
movq	$0,256(%rsp)

vmovdqu	8(%rsp),%ymm3
vmovdqu	40(%rsp),%ymm4
vmovdqu	72(%rsp),%ymm5
vmovdqu	104(%rsp),%ymm6
vmovdqu	136(%rsp),%ymm7
vmovdqu	168(%rsp),%ymm8
vmovdqu	200(%rsp),%ymm9
vmovdqu	232(%rsp),%ymm10

addq	8(%rsp),%r9

{vex} vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq	224(%rcx),%ymm2,%ymm10

leaq	264(%rsp),%rsp
# ---- digit b[4*i+3]: identical step, offset 24 into b ----
movq	24(%r11),%r13

vpbroadcastq	24(%r11),%ymm1
movq	0(%rsi),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
movq	%r12,%r10
adcq	$0,%r10

movq	%r8,%r13
imulq	%r9,%r13
andq	%rax,%r13

vmovq	%r13,%xmm2
vpbroadcastq	%xmm2,%ymm2
movq	0(%rcx),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
adcq	%r12,%r10

shrq	$52,%r9
salq	$12,%r10
orq	%r10,%r9

leaq	-264(%rsp),%rsp

{vex} vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq	224(%rcx),%ymm2,%ymm10

vmovdqu	%ymm3,0(%rsp)
vmovdqu	%ymm4,32(%rsp)
vmovdqu	%ymm5,64(%rsp)
vmovdqu	%ymm6,96(%rsp)
vmovdqu	%ymm7,128(%rsp)
vmovdqu	%ymm8,160(%rsp)
vmovdqu	%ymm9,192(%rsp)
vmovdqu	%ymm10,224(%rsp)
movq	$0,256(%rsp)

vmovdqu	8(%rsp),%ymm3
vmovdqu	40(%rsp),%ymm4
vmovdqu	72(%rsp),%ymm5
vmovdqu	104(%rsp),%ymm6
vmovdqu	136(%rsp),%ymm7
vmovdqu	168(%rsp),%ymm8
vmovdqu	200(%rsp),%ymm9
vmovdqu	232(%rsp),%ymm10

addq	8(%rsp),%r9

{vex} vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq	224(%rcx),%ymm2,%ymm10

leaq	264(%rsp),%rsp
leaq	32(%r11),%r11		# b += 4 digits
decl	%ebx
jne	.Lloop7

# ---- tail digit b[28]: same step shape as inside the loop ----
movq	0(%r11),%r13

vpbroadcastq	0(%r11),%ymm1
movq	0(%rsi),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
movq	%r12,%r10
adcq	$0,%r10

movq	%r8,%r13
imulq	%r9,%r13
andq	%rax,%r13

vmovq	%r13,%xmm2
vpbroadcastq	%xmm2,%ymm2
movq	0(%rcx),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
adcq	%r12,%r10

shrq	$52,%r9
salq	$12,%r10
orq	%r10,%r9

leaq	-264(%rsp),%rsp

{vex} vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq	224(%rcx),%ymm2,%ymm10

vmovdqu	%ymm3,0(%rsp)
vmovdqu	%ymm4,32(%rsp)
vmovdqu	%ymm5,64(%rsp)
vmovdqu	%ymm6,96(%rsp)
vmovdqu	%ymm7,128(%rsp)
vmovdqu	%ymm8,160(%rsp)
vmovdqu	%ymm9,192(%rsp)
vmovdqu	%ymm10,224(%rsp)
movq	$0,256(%rsp)

vmovdqu	8(%rsp),%ymm3
vmovdqu	40(%rsp),%ymm4
vmovdqu	72(%rsp),%ymm5
vmovdqu	104(%rsp),%ymm6
vmovdqu	136(%rsp),%ymm7
vmovdqu	168(%rsp),%ymm8
vmovdqu	200(%rsp),%ymm9
vmovdqu	232(%rsp),%ymm10

addq	8(%rsp),%r9

{vex} vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq	224(%rcx),%ymm2,%ymm10

leaq	264(%rsp),%rsp
# ---- tail digit b[29] ----
movq	8(%r11),%r13

vpbroadcastq	8(%r11),%ymm1
movq	0(%rsi),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
movq	%r12,%r10
adcq	$0,%r10

movq	%r8,%r13
imulq	%r9,%r13
andq	%rax,%r13

vmovq	%r13,%xmm2
vpbroadcastq	%xmm2,%ymm2
movq	0(%rcx),%rdx
mulxq	%r13,%r13,%r12
addq	%r13,%r9
adcq	%r12,%r10

shrq	$52,%r9
salq	$12,%r10
orq	%r10,%r9

leaq	-264(%rsp),%rsp

{vex} vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq	224(%rcx),%ymm2,%ymm10

vmovdqu	%ymm3,0(%rsp)
vmovdqu	%ymm4,32(%rsp)
vmovdqu	%ymm5,64(%rsp)
vmovdqu	%ymm6,96(%rsp)
vmovdqu	%ymm7,128(%rsp)
vmovdqu	%ymm8,160(%rsp)
vmovdqu	%ymm9,192(%rsp)
vmovdqu	%ymm10,224(%rsp)
movq	$0,256(%rsp)

vmovdqu	8(%rsp),%ymm3
vmovdqu	40(%rsp),%ymm4
vmovdqu	72(%rsp),%ymm5
vmovdqu	104(%rsp),%ymm6
vmovdqu	136(%rsp),%ymm7
vmovdqu	168(%rsp),%ymm8
vmovdqu	200(%rsp),%ymm9
vmovdqu	232(%rsp),%ymm10

addq	8(%rsp),%r9

{vex} vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex} vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq	224(%rcx),%ymm2,%ymm10

leaq	264(%rsp),%rsp

# ---- normalization: fold the scalar accumulator into lane 0 ----
vmovq	%r9,%xmm0
vpbroadcastq	%xmm0,%ymm0
vpblendd	$3,%ymm0,%ymm3,%ymm3	# digit 0 <- r9

# Extract per-digit carries (bits >= 52) from every accumulator.
vpsrlq	$52,%ymm3,%ymm0
vpsrlq	$52,%ymm4,%ymm1
vpsrlq	$52,%ymm5,%ymm2
vpsrlq	$52,%ymm6,%ymm11
vpsrlq	$52,%ymm7,%ymm12
vpsrlq	$52,%ymm8,%ymm13
vpsrlq	$52,%ymm9,%ymm14
vpsrlq	$52,%ymm10,%ymm15

leaq	-32(%rsp),%rsp
vmovupd	%ymm3,(%rsp)		# ymm3 is needed as scratch below

# Shift the carry vector up by one 64-bit lane across the 8-register
# chain: vpermq $144 rotates lanes up within a register, vpermq $3 +
# vblendpd $1 pulls the top lane of the next-lower register into lane 0.
vpermq	$144,%ymm15,%ymm15
vpermq	$3,%ymm14,%ymm3
vblendpd	$1,%ymm3,%ymm15,%ymm15

vpermq	$144,%ymm14,%ymm14
vpermq	$3,%ymm13,%ymm3
vblendpd	$1,%ymm3,%ymm14,%ymm14

vpermq	$144,%ymm13,%ymm13
vpermq	$3,%ymm12,%ymm3
vblendpd	$1,%ymm3,%ymm13,%ymm13

vpermq	$144,%ymm12,%ymm12
vpermq	$3,%ymm11,%ymm3
vblendpd	$1,%ymm3,%ymm12,%ymm12

vpermq	$144,%ymm11,%ymm11
vpermq	$3,%ymm2,%ymm3
vblendpd	$1,%ymm3,%ymm11,%ymm11

vpermq	$144,%ymm2,%ymm2
vpermq	$3,%ymm1,%ymm3
vblendpd	$1,%ymm3,%ymm2,%ymm2

vpermq	$144,%ymm1,%ymm1
vpermq	$3,%ymm0,%ymm3
vblendpd	$1,%ymm3,%ymm1,%ymm1

vpermq	$144,%ymm0,%ymm0
vpand	.Lhigh64x3(%rip),%ymm0,%ymm0	# lowest digit has no carry-in

vmovupd	(%rsp),%ymm3		# restore digit vector 0
leaq	32(%rsp),%rsp

# Truncate every digit to 52 bits ...
vpand	.Lmask52x4(%rip),%ymm3,%ymm3
vpand	.Lmask52x4(%rip),%ymm4,%ymm4
vpand	.Lmask52x4(%rip),%ymm5,%ymm5
vpand	.Lmask52x4(%rip),%ymm6,%ymm6
vpand	.Lmask52x4(%rip),%ymm7,%ymm7
vpand	.Lmask52x4(%rip),%ymm8,%ymm8
vpand	.Lmask52x4(%rip),%ymm9,%ymm9
vpand	.Lmask52x4(%rip),%ymm10,%ymm10

# ... and add in the shifted carries.
vpaddq	%ymm0,%ymm3,%ymm3
vpaddq	%ymm1,%ymm4,%ymm4
vpaddq	%ymm2,%ymm5,%ymm5
vpaddq	%ymm11,%ymm6,%ymm6
vpaddq	%ymm12,%ymm7,%ymm7
vpaddq	%ymm13,%ymm8,%ymm8
vpaddq	%ymm14,%ymm9,%ymm9
vpaddq	%ymm15,%ymm10,%ymm10

# Branchless second-level carry propagation: build a 32-bit map of
# digits that overflowed (> 2^52-1) ...
vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
vpcmpgtq	.Lmask52x4(%rip),%ymm4,%ymm1
vmovmskpd	%ymm0,%r14d
vmovmskpd	%ymm1,%r13d
shlb	$4,%r13b
orb	%r13b,%r14b

vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm2
vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm11
vmovmskpd	%ymm2,%r13d
vmovmskpd	%ymm11,%r12d
shlb	$4,%r12b
orb	%r12b,%r13b

vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm12
vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd	%ymm12,%r12d
vmovmskpd	%ymm13,%r11d
shlb	$4,%r11b
orb	%r11b,%r12b

vpcmpgtq	.Lmask52x4(%rip),%ymm9,%ymm14
vpcmpgtq	.Lmask52x4(%rip),%ymm10,%ymm15
vmovmskpd	%ymm14,%r11d
vmovmskpd	%ymm15,%r10d
shlb	$4,%r10b
orb	%r10b,%r11b

# Shift the overflow map up one bit with carry across the 4 bytes.
addb	%r14b,%r14b
adcb	%r13b,%r13b
adcb	%r12b,%r12b
adcb	%r11b,%r11b

# ... and a map of digits equal to 2^52-1 (which propagate a carry-in).
vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
vpcmpeqq	.Lmask52x4(%rip),%ymm4,%ymm1
vmovmskpd	%ymm0,%r9d
vmovmskpd	%ymm1,%r8d
shlb	$4,%r8b
orb	%r8b,%r9b

vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm2
vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm11
vmovmskpd	%ymm2,%r8d
vmovmskpd	%ymm11,%edx
shlb	$4,%dl
orb	%dl,%r8b

vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm12
vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd	%ymm12,%edx
vmovmskpd	%ymm13,%ecx
shlb	$4,%cl
orb	%cl,%dl

vpcmpeqq	.Lmask52x4(%rip),%ymm9,%ymm14
vpcmpeqq	.Lmask52x4(%rip),%ymm10,%ymm15
vmovmskpd	%ymm14,%ecx
vmovmskpd	%ymm15,%ebx
shlb	$4,%bl
orb	%bl,%cl

# Classic (carry + propagate) ^ propagate trick: resulting bitmaps mark
# every digit that must absorb a +1 (i.e. be incremented mod 2^52).
addb	%r9b,%r14b
adcb	%r8b,%r13b
adcb	%dl,%r12b
adcb	%cl,%r11b

xorb	%r9b,%r14b
xorb	%r8b,%r13b
xorb	%dl,%r12b
xorb	%cl,%r11b

leaq	.Lkmasklut(%rip),%rdx	# 16 x 32B: 4-bit mask -> qword lane mask

# For each accumulator: look up the blend mask for its 4-bit nibble and
# select digit-(2^52-1) (i.e. digit+1 after the final 52-bit AND) in the
# lanes that absorb a carry.
movb	%r14b,%r10b
andq	$0xf,%r14
vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
shlq	$5,%r14
vmovapd	(%rdx,%r14,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3

shrb	$4,%r10b
andq	$0xf,%r10
vpsubq	.Lmask52x4(%rip),%ymm4,%ymm0
shlq	$5,%r10
vmovapd	(%rdx,%r10,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm4,%ymm4

movb	%r13b,%r10b
andq	$0xf,%r13
vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
shlq	$5,%r13
vmovapd	(%rdx,%r13,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5

shrb	$4,%r10b
andq	$0xf,%r10
vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
shlq	$5,%r10
vmovapd	(%rdx,%r10,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6

movb	%r12b,%r10b
andq	$0xf,%r12
vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
shlq	$5,%r12
vmovapd	(%rdx,%r12,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7

shrb	$4,%r10b
andq	$0xf,%r10
vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
shlq	$5,%r10
vmovapd	(%rdx,%r10,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8

movb	%r11b,%r10b
andq	$0xf,%r11
vpsubq	.Lmask52x4(%rip),%ymm9,%ymm0
shlq	$5,%r11
vmovapd	(%rdx,%r11,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm9,%ymm9

shrb	$4,%r10b
andq	$0xf,%r10
vpsubq	.Lmask52x4(%rip),%ymm10,%ymm0
shlq	$5,%r10
vmovapd	(%rdx,%r10,1),%ymm2
vblendvpd	%ymm2,%ymm0,%ymm10,%ymm10

# Final truncation of every digit to 52 bits.
vpand	.Lmask52x4(%rip),%ymm3,%ymm3
vpand	.Lmask52x4(%rip),%ymm4,%ymm4
vpand	.Lmask52x4(%rip),%ymm5,%ymm5
vpand	.Lmask52x4(%rip),%ymm6,%ymm6
vpand	.Lmask52x4(%rip),%ymm7,%ymm7
vpand	.Lmask52x4(%rip),%ymm8,%ymm8
vpand	.Lmask52x4(%rip),%ymm9,%ymm9

vpand	.Lmask52x4(%rip),%ymm10,%ymm10

# Store 30 (padded to 32) result digits.
vmovdqu	%ymm3,0(%rdi)
vmovdqu	%ymm4,32(%rdi)
vmovdqu	%ymm5,64(%rdi)
vmovdqu	%ymm6,96(%rdi)
vmovdqu	%ymm7,128(%rdi)
vmovdqu	%ymm8,160(%rdi)
vmovdqu	%ymm9,192(%rdi)
vmovdqu	%ymm10,224(%rdi)

vzeroupper			# AVX->SSE transition hygiene before ret
leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
movq	0(%rax),%r15
.cfi_restore	%r15
movq	8(%rax),%r14
.cfi_restore	%r14
movq	16(%rax),%r13
.cfi_restore	%r13
movq	24(%rax),%r12
.cfi_restore	%r12
movq	32(%rax),%rbp
.cfi_restore	%rbp
movq	40(%rax),%rbx
.cfi_restore	%rbx
leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x30_x1_avxifma256_epilogue:
.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x30_x1_avxifma256, .-ossl_rsaz_amm52x30_x1_avxifma256
808
.section	.rodata
.align	32
# Four copies of the 52-bit digit mask (2^52 - 1), one per qword lane.
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
# {0, -1, -1, -1}: clears lane 0 of the shifted carry vector (the lowest
# digit has no carry-in).
.Lhigh64x3:
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
# 16-entry LUT, 32 bytes each: entry i expands the 4-bit mask i into a
# 4-lane qword blend mask (bit j of i set => lane j = all-ones). Used to
# turn the per-nibble carry bitmaps into vblendvpd selectors.
.Lkmasklut:

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.text
902
903
.globl ossl_rsaz_amm52x30_x2_avxifma256
904
.type ossl_rsaz_amm52x30_x2_avxifma256,@function
905
.align 32
906
ossl_rsaz_amm52x30_x2_avxifma256:
907
.cfi_startproc
908
.byte 243,15,30,250
909
pushq %rbx
910
.cfi_adjust_cfa_offset 8
911
.cfi_offset %rbx,-16
912
pushq %rbp
913
.cfi_adjust_cfa_offset 8
914
.cfi_offset %rbp,-24
915
pushq %r12
916
.cfi_adjust_cfa_offset 8
917
.cfi_offset %r12,-32
918
pushq %r13
919
.cfi_adjust_cfa_offset 8
920
.cfi_offset %r13,-40
921
pushq %r14
922
.cfi_adjust_cfa_offset 8
923
.cfi_offset %r14,-48
924
pushq %r15
925
.cfi_adjust_cfa_offset 8
926
.cfi_offset %r15,-56
927
928
vpxor %ymm0,%ymm0,%ymm0
929
vmovapd %ymm0,%ymm3
930
vmovapd %ymm0,%ymm4
931
vmovapd %ymm0,%ymm5
932
vmovapd %ymm0,%ymm6
933
vmovapd %ymm0,%ymm7
934
vmovapd %ymm0,%ymm8
935
vmovapd %ymm0,%ymm9
936
vmovapd %ymm0,%ymm10
937
938
xorl %r9d,%r9d
939
940
movq %rdx,%r11
941
movq $0xfffffffffffff,%rax
942
943
movl $30,%ebx
944
945
.align 32
946
.Lloop30:
947
movq 0(%r11),%r13
948
949
vpbroadcastq 0(%r11),%ymm1
950
movq 0(%rsi),%rdx
951
mulxq %r13,%r13,%r12
952
addq %r13,%r9
953
movq %r12,%r10
954
adcq $0,%r10
955
956
movq (%r8),%r13
957
imulq %r9,%r13
958
andq %rax,%r13
959
960
vmovq %r13,%xmm2
961
vpbroadcastq %xmm2,%ymm2
962
movq 0(%rcx),%rdx
963
mulxq %r13,%r13,%r12
964
addq %r13,%r9
965
adcq %r12,%r10
966
967
shrq $52,%r9
968
salq $12,%r10
969
orq %r10,%r9
970
971
leaq -264(%rsp),%rsp
972
973
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
974
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
975
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
976
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
977
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
978
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
979
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
980
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
981
982
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
983
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
984
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
985
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
986
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
987
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
988
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
989
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
990
991
992
vmovdqu %ymm3,0(%rsp)
993
vmovdqu %ymm4,32(%rsp)
994
vmovdqu %ymm5,64(%rsp)
995
vmovdqu %ymm6,96(%rsp)
996
vmovdqu %ymm7,128(%rsp)
997
vmovdqu %ymm8,160(%rsp)
998
vmovdqu %ymm9,192(%rsp)
999
vmovdqu %ymm10,224(%rsp)
1000
movq $0,256(%rsp)
1001
1002
vmovdqu 8(%rsp),%ymm3
1003
vmovdqu 40(%rsp),%ymm4
1004
vmovdqu 72(%rsp),%ymm5
1005
vmovdqu 104(%rsp),%ymm6
1006
vmovdqu 136(%rsp),%ymm7
1007
vmovdqu 168(%rsp),%ymm8
1008
vmovdqu 200(%rsp),%ymm9
1009
vmovdqu 232(%rsp),%ymm10
1010
1011
addq 8(%rsp),%r9
1012
1013
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
1014
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
1015
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
1016
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
1017
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
1018
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
1019
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
1020
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
1021
1022
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
1023
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
1024
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
1025
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
1026
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
1027
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
1028
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
1029
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
1030
1031
leaq 264(%rsp),%rsp
1032
leaq 8(%r11),%r11
1033
decl %ebx
1034
jne .Lloop30
1035
1036
pushq %r11
1037
pushq %rsi
1038
pushq %rcx
1039
pushq %r8
1040
1041
vmovq %r9,%xmm0
1042
vpbroadcastq %xmm0,%ymm0
1043
vpblendd $3,%ymm0,%ymm3,%ymm3
1044
1045
1046
1047
vpsrlq $52,%ymm3,%ymm0
1048
vpsrlq $52,%ymm4,%ymm1
1049
vpsrlq $52,%ymm5,%ymm2
1050
vpsrlq $52,%ymm6,%ymm11
1051
vpsrlq $52,%ymm7,%ymm12
1052
vpsrlq $52,%ymm8,%ymm13
1053
vpsrlq $52,%ymm9,%ymm14
1054
vpsrlq $52,%ymm10,%ymm15
1055
1056
leaq -32(%rsp),%rsp
1057
vmovupd %ymm3,(%rsp)
1058
1059
1060
vpermq $144,%ymm15,%ymm15
1061
vpermq $3,%ymm14,%ymm3
1062
vblendpd $1,%ymm3,%ymm15,%ymm15
1063
1064
vpermq $144,%ymm14,%ymm14
1065
vpermq $3,%ymm13,%ymm3
1066
vblendpd $1,%ymm3,%ymm14,%ymm14
1067
1068
vpermq $144,%ymm13,%ymm13
1069
vpermq $3,%ymm12,%ymm3
1070
vblendpd $1,%ymm3,%ymm13,%ymm13
1071
1072
vpermq $144,%ymm12,%ymm12
1073
vpermq $3,%ymm11,%ymm3
1074
vblendpd $1,%ymm3,%ymm12,%ymm12
1075
1076
vpermq $144,%ymm11,%ymm11
1077
vpermq $3,%ymm2,%ymm3
1078
vblendpd $1,%ymm3,%ymm11,%ymm11
1079
1080
vpermq $144,%ymm2,%ymm2
1081
vpermq $3,%ymm1,%ymm3
1082
vblendpd $1,%ymm3,%ymm2,%ymm2
1083
1084
vpermq $144,%ymm1,%ymm1
1085
vpermq $3,%ymm0,%ymm3
1086
vblendpd $1,%ymm3,%ymm1,%ymm1
1087
1088
vpermq $144,%ymm0,%ymm0
1089
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
1090
1091
vmovupd (%rsp),%ymm3
1092
leaq 32(%rsp),%rsp
1093
1094
1095
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1096
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1097
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1098
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1099
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1100
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1101
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1102
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1103
1104
1105
vpaddq %ymm0,%ymm3,%ymm3
1106
vpaddq %ymm1,%ymm4,%ymm4
1107
vpaddq %ymm2,%ymm5,%ymm5
1108
vpaddq %ymm11,%ymm6,%ymm6
1109
vpaddq %ymm12,%ymm7,%ymm7
1110
vpaddq %ymm13,%ymm8,%ymm8
1111
vpaddq %ymm14,%ymm9,%ymm9
1112
vpaddq %ymm15,%ymm10,%ymm10
1113
1114
1115
1116
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
1117
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1
1118
vmovmskpd %ymm0,%r14d
1119
vmovmskpd %ymm1,%r13d
1120
shlb $4,%r13b
1121
orb %r13b,%r14b
1122
1123
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2
1124
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11
1125
vmovmskpd %ymm2,%r13d
1126
vmovmskpd %ymm11,%r12d
1127
shlb $4,%r12b
1128
orb %r12b,%r13b
1129
1130
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12
1131
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
1132
vmovmskpd %ymm12,%r12d
1133
vmovmskpd %ymm13,%r11d
1134
shlb $4,%r11b
1135
orb %r11b,%r12b
1136
1137
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14
1138
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15
1139
vmovmskpd %ymm14,%r11d
1140
vmovmskpd %ymm15,%r10d
1141
shlb $4,%r10b
1142
orb %r10b,%r11b
1143
1144
addb %r14b,%r14b
1145
adcb %r13b,%r13b
1146
adcb %r12b,%r12b
1147
adcb %r11b,%r11b
1148
1149
1150
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
1151
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1
1152
vmovmskpd %ymm0,%r9d
1153
vmovmskpd %ymm1,%r8d
1154
shlb $4,%r8b
1155
orb %r8b,%r9b
1156
1157
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2
1158
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11
1159
vmovmskpd %ymm2,%r8d
1160
vmovmskpd %ymm11,%edx
1161
shlb $4,%dl
1162
orb %dl,%r8b
1163
1164
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12
1165
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
1166
vmovmskpd %ymm12,%edx
1167
vmovmskpd %ymm13,%ecx
1168
shlb $4,%cl
1169
orb %cl,%dl
1170
1171
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14
1172
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15
1173
vmovmskpd %ymm14,%ecx
1174
vmovmskpd %ymm15,%ebx
1175
shlb $4,%bl
1176
orb %bl,%cl
1177
1178
addb %r9b,%r14b
1179
adcb %r8b,%r13b
1180
adcb %dl,%r12b
1181
adcb %cl,%r11b
1182
1183
xorb %r9b,%r14b
1184
xorb %r8b,%r13b
1185
xorb %dl,%r12b
1186
xorb %cl,%r11b
1187
1188
leaq .Lkmasklut(%rip),%rdx
1189
1190
movb %r14b,%r10b
1191
andq $0xf,%r14
1192
vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
1193
shlq $5,%r14
1194
vmovapd (%rdx,%r14,1),%ymm2
1195
vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
1196
1197
shrb $4,%r10b
1198
andq $0xf,%r10
1199
vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
1200
shlq $5,%r10
1201
vmovapd (%rdx,%r10,1),%ymm2
1202
vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
1203
1204
movb %r13b,%r10b
1205
andq $0xf,%r13
1206
vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
1207
shlq $5,%r13
1208
vmovapd (%rdx,%r13,1),%ymm2
1209
vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
1210
1211
shrb $4,%r10b
1212
andq $0xf,%r10
1213
vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
1214
shlq $5,%r10
1215
vmovapd (%rdx,%r10,1),%ymm2
1216
vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
1217
1218
movb %r12b,%r10b
1219
andq $0xf,%r12
1220
vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
1221
shlq $5,%r12
1222
vmovapd (%rdx,%r12,1),%ymm2
1223
vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
1224
1225
shrb $4,%r10b
1226
andq $0xf,%r10
1227
vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
1228
shlq $5,%r10
1229
vmovapd (%rdx,%r10,1),%ymm2
1230
vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
1231
1232
movb %r11b,%r10b
1233
andq $0xf,%r11
1234
vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
1235
shlq $5,%r11
1236
vmovapd (%rdx,%r11,1),%ymm2
1237
vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
1238
1239
shrb $4,%r10b
1240
andq $0xf,%r10
1241
vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
1242
shlq $5,%r10
1243
vmovapd (%rdx,%r10,1),%ymm2
1244
vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
1245
1246
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1247
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1248
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1249
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1250
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1251
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1252
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1253
1254
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1255
popq %r8
1256
popq %rcx
1257
popq %rsi
1258
popq %r11
1259
1260
vmovdqu %ymm3,0(%rdi)
1261
vmovdqu %ymm4,32(%rdi)
1262
vmovdqu %ymm5,64(%rdi)
1263
vmovdqu %ymm6,96(%rdi)
1264
vmovdqu %ymm7,128(%rdi)
1265
vmovdqu %ymm8,160(%rdi)
1266
vmovdqu %ymm9,192(%rdi)
1267
vmovdqu %ymm10,224(%rdi)
1268
1269
xorl %r15d,%r15d
1270
1271
leaq 16(%r11),%r11
1272
movq $0xfffffffffffff,%rax
1273
1274
movl $30,%ebx
1275
1276
vpxor %ymm0,%ymm0,%ymm0
1277
vmovapd %ymm0,%ymm3
1278
vmovapd %ymm0,%ymm4
1279
vmovapd %ymm0,%ymm5
1280
vmovapd %ymm0,%ymm6
1281
vmovapd %ymm0,%ymm7
1282
vmovapd %ymm0,%ymm8
1283
vmovapd %ymm0,%ymm9
1284
vmovapd %ymm0,%ymm10
1285
.align 32
1286
.Lloop40:
1287
movq 0(%r11),%r13
1288
1289
vpbroadcastq 0(%r11),%ymm1
1290
movq 256(%rsi),%rdx
1291
mulxq %r13,%r13,%r12
1292
addq %r13,%r9
1293
movq %r12,%r10
1294
adcq $0,%r10
1295
1296
movq 8(%r8),%r13
1297
imulq %r9,%r13
1298
andq %rax,%r13
1299
1300
vmovq %r13,%xmm2
1301
vpbroadcastq %xmm2,%ymm2
1302
movq 256(%rcx),%rdx
1303
mulxq %r13,%r13,%r12
1304
addq %r13,%r9
1305
adcq %r12,%r10
1306
1307
shrq $52,%r9
1308
salq $12,%r10
1309
orq %r10,%r9
1310
1311
leaq -264(%rsp),%rsp
1312
1313
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm3
1314
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm4
1315
{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm5
1316
{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm6
1317
{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm7
1318
{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm8
1319
{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm9
1320
{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm10
1321
1322
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm3
1323
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm4
1324
{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm5
1325
{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm6
1326
{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm7
1327
{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm8
1328
{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm9
1329
{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm10
1330
1331
1332
vmovdqu %ymm3,0(%rsp)
1333
vmovdqu %ymm4,32(%rsp)
1334
vmovdqu %ymm5,64(%rsp)
1335
vmovdqu %ymm6,96(%rsp)
1336
vmovdqu %ymm7,128(%rsp)
1337
vmovdqu %ymm8,160(%rsp)
1338
vmovdqu %ymm9,192(%rsp)
1339
vmovdqu %ymm10,224(%rsp)
1340
movq $0,256(%rsp)
1341
1342
vmovdqu 8(%rsp),%ymm3
1343
vmovdqu 40(%rsp),%ymm4
1344
vmovdqu 72(%rsp),%ymm5
1345
vmovdqu 104(%rsp),%ymm6
1346
vmovdqu 136(%rsp),%ymm7
1347
vmovdqu 168(%rsp),%ymm8
1348
vmovdqu 200(%rsp),%ymm9
1349
vmovdqu 232(%rsp),%ymm10
1350
1351
addq 8(%rsp),%r9
1352
1353
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm3
1354
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm4
1355
{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm5
1356
{vex} vpmadd52huq 352(%rsi),%ymm1,%ymm6
1357
{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm7
1358
{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm8
1359
{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm9
1360
{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm10
1361
1362
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm3
1363
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm4
1364
{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm5
1365
{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm6
1366
{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm7
1367
{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm8
1368
{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm9
1369
{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm10
1370
1371
leaq 264(%rsp),%rsp
1372
leaq 8(%r11),%r11
1373
decl %ebx
1374
jne .Lloop40
1375
1376
vmovq %r9,%xmm0
1377
vpbroadcastq %xmm0,%ymm0
1378
vpblendd $3,%ymm0,%ymm3,%ymm3
1379
1380
1381
1382
vpsrlq $52,%ymm3,%ymm0
1383
vpsrlq $52,%ymm4,%ymm1
1384
vpsrlq $52,%ymm5,%ymm2
1385
vpsrlq $52,%ymm6,%ymm11
1386
vpsrlq $52,%ymm7,%ymm12
1387
vpsrlq $52,%ymm8,%ymm13
1388
vpsrlq $52,%ymm9,%ymm14
1389
vpsrlq $52,%ymm10,%ymm15
1390
1391
leaq -32(%rsp),%rsp
1392
vmovupd %ymm3,(%rsp)
1393
1394
1395
vpermq $144,%ymm15,%ymm15
1396
vpermq $3,%ymm14,%ymm3
1397
vblendpd $1,%ymm3,%ymm15,%ymm15
1398
1399
vpermq $144,%ymm14,%ymm14
1400
vpermq $3,%ymm13,%ymm3
1401
vblendpd $1,%ymm3,%ymm14,%ymm14
1402
1403
vpermq $144,%ymm13,%ymm13
1404
vpermq $3,%ymm12,%ymm3
1405
vblendpd $1,%ymm3,%ymm13,%ymm13
1406
1407
vpermq $144,%ymm12,%ymm12
1408
vpermq $3,%ymm11,%ymm3
1409
vblendpd $1,%ymm3,%ymm12,%ymm12
1410
1411
vpermq $144,%ymm11,%ymm11
1412
vpermq $3,%ymm2,%ymm3
1413
vblendpd $1,%ymm3,%ymm11,%ymm11
1414
1415
vpermq $144,%ymm2,%ymm2
1416
vpermq $3,%ymm1,%ymm3
1417
vblendpd $1,%ymm3,%ymm2,%ymm2
1418
1419
vpermq $144,%ymm1,%ymm1
1420
vpermq $3,%ymm0,%ymm3
1421
vblendpd $1,%ymm3,%ymm1,%ymm1
1422
1423
vpermq $144,%ymm0,%ymm0
1424
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
1425
1426
vmovupd (%rsp),%ymm3
1427
leaq 32(%rsp),%rsp
1428
1429
1430
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1431
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1432
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1433
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1434
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1435
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1436
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1437
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1438
1439
1440
vpaddq %ymm0,%ymm3,%ymm3
1441
vpaddq %ymm1,%ymm4,%ymm4
1442
vpaddq %ymm2,%ymm5,%ymm5
1443
vpaddq %ymm11,%ymm6,%ymm6
1444
vpaddq %ymm12,%ymm7,%ymm7
1445
vpaddq %ymm13,%ymm8,%ymm8
1446
vpaddq %ymm14,%ymm9,%ymm9
1447
vpaddq %ymm15,%ymm10,%ymm10
1448
1449
1450
1451
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
1452
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1
1453
vmovmskpd %ymm0,%r14d
1454
vmovmskpd %ymm1,%r13d
1455
shlb $4,%r13b
1456
orb %r13b,%r14b
1457
1458
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2
1459
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11
1460
vmovmskpd %ymm2,%r13d
1461
vmovmskpd %ymm11,%r12d
1462
shlb $4,%r12b
1463
orb %r12b,%r13b
1464
1465
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12
1466
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
1467
vmovmskpd %ymm12,%r12d
1468
vmovmskpd %ymm13,%r11d
1469
shlb $4,%r11b
1470
orb %r11b,%r12b
1471
1472
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14
1473
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15
1474
vmovmskpd %ymm14,%r11d
1475
vmovmskpd %ymm15,%r10d
1476
shlb $4,%r10b
1477
orb %r10b,%r11b
1478
1479
addb %r14b,%r14b
1480
adcb %r13b,%r13b
1481
adcb %r12b,%r12b
1482
adcb %r11b,%r11b
1483
1484
1485
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
1486
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1
1487
vmovmskpd %ymm0,%r9d
1488
vmovmskpd %ymm1,%r8d
1489
shlb $4,%r8b
1490
orb %r8b,%r9b
1491
1492
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2
1493
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11
1494
vmovmskpd %ymm2,%r8d
1495
vmovmskpd %ymm11,%edx
1496
shlb $4,%dl
1497
orb %dl,%r8b
1498
1499
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12
1500
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
1501
vmovmskpd %ymm12,%edx
1502
vmovmskpd %ymm13,%ecx
1503
shlb $4,%cl
1504
orb %cl,%dl
1505
1506
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14
1507
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15
1508
vmovmskpd %ymm14,%ecx
1509
vmovmskpd %ymm15,%ebx
1510
shlb $4,%bl
1511
orb %bl,%cl
1512
1513
addb %r9b,%r14b
1514
adcb %r8b,%r13b
1515
adcb %dl,%r12b
1516
adcb %cl,%r11b
1517
1518
xorb %r9b,%r14b
1519
xorb %r8b,%r13b
1520
xorb %dl,%r12b
1521
xorb %cl,%r11b
1522
1523
leaq .Lkmasklut(%rip),%rdx
1524
1525
movb %r14b,%r10b
1526
andq $0xf,%r14
1527
vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
1528
shlq $5,%r14
1529
vmovapd (%rdx,%r14,1),%ymm2
1530
vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
1531
1532
shrb $4,%r10b
1533
andq $0xf,%r10
1534
vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
1535
shlq $5,%r10
1536
vmovapd (%rdx,%r10,1),%ymm2
1537
vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
1538
1539
movb %r13b,%r10b
1540
andq $0xf,%r13
1541
vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
1542
shlq $5,%r13
1543
vmovapd (%rdx,%r13,1),%ymm2
1544
vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
1545
1546
shrb $4,%r10b
1547
andq $0xf,%r10
1548
vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
1549
shlq $5,%r10
1550
vmovapd (%rdx,%r10,1),%ymm2
1551
vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
1552
1553
movb %r12b,%r10b
1554
andq $0xf,%r12
1555
vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
1556
shlq $5,%r12
1557
vmovapd (%rdx,%r12,1),%ymm2
1558
vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
1559
1560
shrb $4,%r10b
1561
andq $0xf,%r10
1562
vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
1563
shlq $5,%r10
1564
vmovapd (%rdx,%r10,1),%ymm2
1565
vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
1566
1567
movb %r11b,%r10b
1568
andq $0xf,%r11
1569
vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
1570
shlq $5,%r11
1571
vmovapd (%rdx,%r11,1),%ymm2
1572
vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
1573
1574
shrb $4,%r10b
1575
andq $0xf,%r10
1576
vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
1577
shlq $5,%r10
1578
vmovapd (%rdx,%r10,1),%ymm2
1579
vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
1580
1581
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1582
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1583
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1584
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1585
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1586
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1587
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1588
1589
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1590
1591
vmovdqu %ymm3,256(%rdi)
1592
vmovdqu %ymm4,288(%rdi)
1593
vmovdqu %ymm5,320(%rdi)
1594
vmovdqu %ymm6,352(%rdi)
1595
vmovdqu %ymm7,384(%rdi)
1596
vmovdqu %ymm8,416(%rdi)
1597
vmovdqu %ymm9,448(%rdi)
1598
vmovdqu %ymm10,480(%rdi)
1599
1600
vzeroupper
1601
leaq (%rsp),%rax
1602
.cfi_def_cfa_register %rax
1603
movq 0(%rax),%r15
1604
.cfi_restore %r15
1605
movq 8(%rax),%r14
1606
.cfi_restore %r14
1607
movq 16(%rax),%r13
1608
.cfi_restore %r13
1609
movq 24(%rax),%r12
1610
.cfi_restore %r12
1611
movq 32(%rax),%rbp
1612
.cfi_restore %rbp
1613
movq 40(%rax),%rbx
1614
.cfi_restore %rbx
1615
leaq 48(%rax),%rsp
1616
.cfi_def_cfa %rsp,8
1617
.Lossl_rsaz_amm52x30_x2_avxifma256_epilogue:
1618
.byte 0xf3,0xc3
1619
.cfi_endproc
1620
.size ossl_rsaz_amm52x30_x2_avxifma256, .-ossl_rsaz_amm52x30_x2_avxifma256
1621
.text

#------------------------------------------------------------------------
# ossl_extract_multiplier_2x30_win5_avx
#
# Constant-time extraction of two multipliers from a window-5 (2^5 = 32
# entry) precomputed table.  Every one of the 32 entries is read and the
# wanted one is selected with branchless blends, so the memory access
# pattern is independent of the (secret) indices.
#
# ABI:  SysV AMD64
# In:   rdi = out       (destination, 2 x 256 bytes written)
#       rsi = red_table (base of table; 32 entries x 512 bytes = 16384)
#       rdx = index of first multiplier  (selected in .Lloop)
#       rcx = index of second multiplier (selected in .Lloop_8_15)
# Out:  out[0..255]   = red_table[rdx] first half
#       out[256..511] = red_table[rcx] second half
# Clobb: rax, rsi, ymm0-ymm13, flags
#------------------------------------------------------------------------
.align 32
.globl ossl_extract_multiplier_2x30_win5_avx
.type ossl_extract_multiplier_2x30_win5_avx,@function
ossl_extract_multiplier_2x30_win5_avx:
.cfi_startproc
.byte 243,15,30,250                     # endbr64 (CET/IBT landing pad)
vmovapd .Lones(%rip),%ymm12             # ymm12 = {1,1,1,1}, per-iteration counter step
vmovq %rdx,%xmm8
vpbroadcastq %xmm8,%ymm10               # ymm10 = broadcast(first index)
vmovq %rcx,%xmm8
vpbroadcastq %xmm8,%ymm11               # ymm11 = broadcast(second index)
leaq 16384(%rsi),%rax                   # rax = one past end of table (32 * 512 bytes)


# Zero the entry counter (ymm9) and the eight 32-byte accumulators
# (ymm0-ymm7) that will receive the selected entry's first 256 bytes.
vpxor %xmm0,%xmm0,%xmm0
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm1
vmovapd %ymm0,%ymm2
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7

# First pass: scan all 32 entries; when the running counter equals the
# first index, the compare yields an all-ones mask and the blends latch
# this entry's data into ymm0-ymm7.  All entries are touched regardless.
.align 32
.Lloop:
vpcmpeqq %ymm9,%ymm10,%ymm13            # ymm13 = (counter == index1) ? ~0 : 0
vmovdqu 0(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm0,%ymm0      # accumulate entry iff mask set
vmovdqu 32(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm1,%ymm1
vmovdqu 64(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm2,%ymm2
vmovdqu 96(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm3,%ymm3
vmovdqu 128(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm4,%ymm4
vmovdqu 160(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm5,%ymm5
vmovdqu 192(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm6,%ymm6
vmovdqu 224(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm7,%ymm7
vpaddq %ymm12,%ymm9,%ymm9               # counter += 1 in every lane
addq $512,%rsi                          # next table entry (512-byte stride)
cmpq %rsi,%rax
jne .Lloop
vmovdqu %ymm0,0(%rdi)                   # store selected first multiplier
vmovdqu %ymm1,32(%rdi)
vmovdqu %ymm2,64(%rdi)
vmovdqu %ymm3,96(%rdi)
vmovdqu %ymm4,128(%rdi)
vmovdqu %ymm5,160(%rdi)
vmovdqu %ymm6,192(%rdi)
vmovdqu %ymm7,224(%rdi)
leaq -16384(%rax),%rsi                  # rewind rsi to table base


# Reset counter and accumulators for the second pass.
vpxor %xmm0,%xmm0,%xmm0
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm0                     # no-op (generator artifact)
vmovapd %ymm0,%ymm1
vmovapd %ymm0,%ymm2
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7

# Second pass: same constant-time scan, but matching the second index
# (ymm11) and reading each entry's second 256-byte half (offsets
# 256..480 within the 512-byte entry).
.align 32
.Lloop_8_15:
vpcmpeqq %ymm9,%ymm11,%ymm13            # ymm13 = (counter == index2) ? ~0 : 0
vmovdqu 256(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm0,%ymm0
vmovdqu 288(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm1,%ymm1
vmovdqu 320(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm2,%ymm2
vmovdqu 352(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm3,%ymm3
vmovdqu 384(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm4,%ymm4
vmovdqu 416(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm5,%ymm5
vmovdqu 448(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm6,%ymm6
vmovdqu 480(%rsi),%ymm8

vblendvpd %ymm13,%ymm8,%ymm7,%ymm7
vpaddq %ymm12,%ymm9,%ymm9
addq $512,%rsi
cmpq %rsi,%rax
jne .Lloop_8_15
vmovdqu %ymm0,256(%rdi)                 # store selected second multiplier
vmovdqu %ymm1,288(%rdi)
vmovdqu %ymm2,320(%rdi)
vmovdqu %ymm3,352(%rdi)
vmovdqu %ymm4,384(%rdi)
vmovdqu %ymm5,416(%rdi)
vmovdqu %ymm6,448(%rdi)
vmovdqu %ymm7,480(%rdi)

# NOTE(review): no vzeroupper before returning to C code — matches the
# upstream generated output; callers are presumed AVX-aware. Confirm.
.byte 0xf3,0xc3                         # ret
.cfi_endproc
.size ossl_extract_multiplier_2x30_win5_avx, .-ossl_extract_multiplier_2x30_win5_avx
1743
.section .rodata
.align 32
# Four 64-bit ones: per-lane increment for the constant-time
# table-scan counter in ossl_extract_multiplier_2x30_win5_avx.
.Lones:
.quad 1,1,1,1
# Four 64-bit zeros: an all-zero 256-bit constant (referenced
# elsewhere in this generated file).
.Lzeros:
.quad 0,0,0,0
1749
# GNU program-property note (NT_GNU_PROPERTY_TYPE_0) advertising
# x86 CET support so the linker/loader can enable IBT and SHSTK
# for objects built from this file.
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f                           # n_namesz
.long 4f - 1f                           # n_descsz
.long 5                                 # n_type = NT_GNU_PROPERTY_TYPE_0
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002                        # GNU_PROPERTY_X86_FEATURE_1_AND
.long 3f - 2f                           # property data size
2:
.long 3                                 # IBT | SHSTK
3:
.p2align 3
4:
1770
1771