Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/amd64/rsaz-4k-avxifma.S
39483 views
1
/* Do not modify. This file is auto-generated from rsaz-4k-avxifma.pl. */
2
.text
3
4
/*
 * ossl_rsaz_amm52x40_x1_avxifma256
 *
 * Register usage as visible in this routine (System V AMD64 argument order):
 *   %rdi  destination; 320 bytes (10 x 32) are stored there at the end.
 *   %rsi  320-byte input vector; multiplied by one qword of the %rdx input
 *         per step.
 *   %rdx  input consumed one qword at a time (40 qwords over the whole loop);
 *         copied to %r11 because mulx needs %rdx as its implicit operand.
 *   %rcx  320-byte input vector; multiplied by the per-step reduction factor.
 *   %r8   64-bit value used directly as a multiplier for the reduction factor.
 * NOTE(review): judging by the name ("amm", 52-bit digits x 40) these are
 * presumably res, a, b, m and k0 of an almost-Montgomery multiplication;
 * confirm against the C prototype.
 *
 * %rbx, %rbp and %r12-%r15 are pushed here and reloaded in the epilogue.
 */
.globl ossl_rsaz_amm52x40_x1_avxifma256
5
.type ossl_rsaz_amm52x40_x1_avxifma256,@function
6
.align 32
7
ossl_rsaz_amm52x40_x1_avxifma256:
8
.cfi_startproc
9
/* Bytes F3 0F 1E FA: the encoding of endbr64. */
.byte 243,15,30,250
10
/* Save six registers (48 bytes); the CFI notes record each save slot. */
pushq %rbx
11
.cfi_adjust_cfa_offset 8
12
.cfi_offset %rbx,-16
13
pushq %rbp
14
.cfi_adjust_cfa_offset 8
15
.cfi_offset %rbp,-24
16
pushq %r12
17
.cfi_adjust_cfa_offset 8
18
.cfi_offset %r12,-32
19
pushq %r13
20
.cfi_adjust_cfa_offset 8
21
.cfi_offset %r13,-40
22
pushq %r14
23
.cfi_adjust_cfa_offset 8
24
.cfi_offset %r14,-48
25
pushq %r15
26
.cfi_adjust_cfa_offset 8
27
.cfi_offset %r15,-56
28
29
/* Zero the ten 256-bit accumulators %ymm3-%ymm12 (40 qword lanes in total). */
vpxor %ymm0,%ymm0,%ymm0
30
vmovapd %ymm0,%ymm3
31
vmovapd %ymm0,%ymm4
32
vmovapd %ymm0,%ymm5
33
vmovapd %ymm0,%ymm6
34
vmovapd %ymm0,%ymm7
35
vmovapd %ymm0,%ymm8
36
vmovapd %ymm0,%ymm9
37
vmovapd %ymm0,%ymm10
38
vmovapd %ymm0,%ymm11
39
vmovapd %ymm0,%ymm12
40
41
/* %r9 = 0: scalar accumulator for the lowest digit. */
xorl %r9d,%r9d
42
43
/* Keep the third argument in %r11; %rdx is reloaded before every mulx. */
movq %rdx,%r11
44
/* %rax = 2^52 - 1, used to mask the reduction factor to 52 bits. */
movq $0xfffffffffffff,%rax
45
46
47
/* Loop counter: 10 passes, each consuming four qwords from (%r11). */
movl $10,%ebx
48
49
/*
 * Main loop: 10 passes, each unrolled four times (qwords at 0, 8, 16 and 24
 * bytes from %r11), so 40 qwords of the %r11 input are consumed in total.
 *
 * Every unrolled step does the following:
 *   1. Scalar part for the lowest digit: %r10:%r9 accumulates the full
 *      128-bit products of qword 0 of (%rsi) with the current %r11 qword and
 *      of qword 0 of (%rcx) with the reduction factor, then is shifted
 *      right by 52 bits.
 *   2. The reduction factor is (%r8 * %r9) masked to 52 bits (%rax holds
 *      2^52 - 1); it is broadcast into %ymm2, the %r11 qword into %ymm1.
 *   3. vpmadd52luq adds the low 52 bits of each lane product into
 *      %ymm3-%ymm12.
 *   4. The accumulators are spilled to a 328-byte stack scratch area, a zero
 *      qword is appended, and they are reloaded from offset 8: the whole
 *      40-qword vector moves down by one qword.
 *   5. vpmadd52huq adds the high 52 bits of each lane product into the
 *      shifted accumulators.
 * NOTE(review): this matches the usual shape of a word-serial Montgomery
 * multiplication; confirm against the generator script.
 */
.align 32
50
.Lloop10:
51
/* Step for the qword at 0(%r11): %r13 = scalar copy, %ymm1 = broadcast. */
movq 0(%r11),%r13
52
53
vpbroadcastq 0(%r11),%ymm1
54
/* %r12:%r13 = qword 0 of (%rsi) * %r13 (mulx multiplies by %rdx, flags untouched). */
movq 0(%rsi),%rdx
55
mulxq %r13,%r13,%r12
56
/* %r10:%r9 = %r9 + product. */
addq %r13,%r9
57
movq %r12,%r10
58
adcq $0,%r10
59
60
/* Reduction factor: %r13 = (%r8 * %r9) & (2^52 - 1). */
movq %r8,%r13
61
imulq %r9,%r13
62
andq %rax,%r13
63
64
/* Broadcast the reduction factor into all four lanes of %ymm2. */
vmovq %r13,%xmm2
65
vpbroadcastq %xmm2,%ymm2
66
/* %r10:%r9 += qword 0 of (%rcx) * reduction factor. */
movq 0(%rcx),%rdx
67
mulxq %r13,%r13,%r12
68
addq %r13,%r9
69
adcq %r12,%r10
70
71
/* %r9 = (%r10:%r9) >> 52. */
shrq $52,%r9
72
salq $12,%r10
73
orq %r10,%r9
74
75
/* Reserve 328 bytes of stack scratch (320 for the accumulators + 8). */
leaq -328(%rsp),%rsp
76
77
/* Add the low 52 bits of (%rsi)[j] * %ymm1 into the accumulators. */
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
78
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
79
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
80
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
81
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
82
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
83
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
84
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
85
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
86
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
87
88
/* Add the low 52 bits of (%rcx)[j] * %ymm2 into the accumulators. */
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
89
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
90
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
91
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
92
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
93
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
94
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
95
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
96
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
97
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
98
/* Spill the 40 accumulator qwords to the scratch area ... */
vmovdqu %ymm3,0(%rsp)
99
vmovdqu %ymm4,32(%rsp)
100
vmovdqu %ymm5,64(%rsp)
101
vmovdqu %ymm6,96(%rsp)
102
vmovdqu %ymm7,128(%rsp)
103
vmovdqu %ymm8,160(%rsp)
104
vmovdqu %ymm9,192(%rsp)
105
vmovdqu %ymm10,224(%rsp)
106
vmovdqu %ymm11,256(%rsp)
107
vmovdqu %ymm12,288(%rsp)
108
/* ... append a zero qword so the top lane becomes 0 after the shift ... */
movq $0,320(%rsp)
109
110
/* ... and reload 8 bytes higher: every lane moves down by one qword. */
vmovdqu 8(%rsp),%ymm3
111
vmovdqu 40(%rsp),%ymm4
112
vmovdqu 72(%rsp),%ymm5
113
vmovdqu 104(%rsp),%ymm6
114
vmovdqu 136(%rsp),%ymm7
115
vmovdqu 168(%rsp),%ymm8
116
vmovdqu 200(%rsp),%ymm9
117
vmovdqu 232(%rsp),%ymm10
118
vmovdqu 264(%rsp),%ymm11
119
vmovdqu 296(%rsp),%ymm12
120
121
/* Fold the qword that just became lane 0 into the scalar accumulator. */
addq 8(%rsp),%r9
122
123
/* Add the high 52 bits of (%rsi)[j] * %ymm1 into the shifted accumulators. */
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
124
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
125
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
126
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
127
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
128
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
129
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
130
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
131
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
132
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
133
134
/* Add the high 52 bits of (%rcx)[j] * %ymm2 into the shifted accumulators. */
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
135
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
136
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
137
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
138
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
139
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
140
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
141
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
142
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
143
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
144
/* Release the scratch area. */
leaq 328(%rsp),%rsp
145
/* Second unrolled step: identical sequence for the qword at 8(%r11). */
movq 8(%r11),%r13
146
147
vpbroadcastq 8(%r11),%ymm1
148
movq 0(%rsi),%rdx
149
mulxq %r13,%r13,%r12
150
addq %r13,%r9
151
movq %r12,%r10
152
adcq $0,%r10
153
154
/* Reduction factor for this step. */
movq %r8,%r13
155
imulq %r9,%r13
156
andq %rax,%r13
157
158
vmovq %r13,%xmm2
159
vpbroadcastq %xmm2,%ymm2
160
movq 0(%rcx),%rdx
161
mulxq %r13,%r13,%r12
162
addq %r13,%r9
163
adcq %r12,%r10
164
165
/* %r9 = (%r10:%r9) >> 52. */
shrq $52,%r9
166
salq $12,%r10
167
orq %r10,%r9
168
169
leaq -328(%rsp),%rsp
170
171
/* Low-half products. */
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
172
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
173
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
174
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
175
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
176
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
177
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
178
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
179
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
180
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
181
182
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
183
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
184
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
185
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
186
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
187
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
188
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
189
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
190
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
191
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
192
/* Shift the accumulator vector down by one qword via the stack. */
vmovdqu %ymm3,0(%rsp)
193
vmovdqu %ymm4,32(%rsp)
194
vmovdqu %ymm5,64(%rsp)
195
vmovdqu %ymm6,96(%rsp)
196
vmovdqu %ymm7,128(%rsp)
197
vmovdqu %ymm8,160(%rsp)
198
vmovdqu %ymm9,192(%rsp)
199
vmovdqu %ymm10,224(%rsp)
200
vmovdqu %ymm11,256(%rsp)
201
vmovdqu %ymm12,288(%rsp)
202
movq $0,320(%rsp)
203
204
vmovdqu 8(%rsp),%ymm3
205
vmovdqu 40(%rsp),%ymm4
206
vmovdqu 72(%rsp),%ymm5
207
vmovdqu 104(%rsp),%ymm6
208
vmovdqu 136(%rsp),%ymm7
209
vmovdqu 168(%rsp),%ymm8
210
vmovdqu 200(%rsp),%ymm9
211
vmovdqu 232(%rsp),%ymm10
212
vmovdqu 264(%rsp),%ymm11
213
vmovdqu 296(%rsp),%ymm12
214
215
addq 8(%rsp),%r9
216
217
/* High-half products. */
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
218
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
219
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
220
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
221
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
222
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
223
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
224
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
225
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
226
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
227
228
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
229
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
230
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
231
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
232
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
233
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
234
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
235
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
236
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
237
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
238
leaq 328(%rsp),%rsp
239
/* Third unrolled step: identical sequence for the qword at 16(%r11). */
movq 16(%r11),%r13
240
241
vpbroadcastq 16(%r11),%ymm1
242
movq 0(%rsi),%rdx
243
mulxq %r13,%r13,%r12
244
addq %r13,%r9
245
movq %r12,%r10
246
adcq $0,%r10
247
248
/* Reduction factor for this step. */
movq %r8,%r13
249
imulq %r9,%r13
250
andq %rax,%r13
251
252
vmovq %r13,%xmm2
253
vpbroadcastq %xmm2,%ymm2
254
movq 0(%rcx),%rdx
255
mulxq %r13,%r13,%r12
256
addq %r13,%r9
257
adcq %r12,%r10
258
259
/* %r9 = (%r10:%r9) >> 52. */
shrq $52,%r9
260
salq $12,%r10
261
orq %r10,%r9
262
263
leaq -328(%rsp),%rsp
264
265
/* Low-half products. */
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
266
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
267
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
268
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
269
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
270
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
271
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
272
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
273
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
274
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
275
276
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
277
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
278
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
279
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
280
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
281
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
282
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
283
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
284
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
285
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
286
/* Shift the accumulator vector down by one qword via the stack. */
vmovdqu %ymm3,0(%rsp)
287
vmovdqu %ymm4,32(%rsp)
288
vmovdqu %ymm5,64(%rsp)
289
vmovdqu %ymm6,96(%rsp)
290
vmovdqu %ymm7,128(%rsp)
291
vmovdqu %ymm8,160(%rsp)
292
vmovdqu %ymm9,192(%rsp)
293
vmovdqu %ymm10,224(%rsp)
294
vmovdqu %ymm11,256(%rsp)
295
vmovdqu %ymm12,288(%rsp)
296
movq $0,320(%rsp)
297
298
vmovdqu 8(%rsp),%ymm3
299
vmovdqu 40(%rsp),%ymm4
300
vmovdqu 72(%rsp),%ymm5
301
vmovdqu 104(%rsp),%ymm6
302
vmovdqu 136(%rsp),%ymm7
303
vmovdqu 168(%rsp),%ymm8
304
vmovdqu 200(%rsp),%ymm9
305
vmovdqu 232(%rsp),%ymm10
306
vmovdqu 264(%rsp),%ymm11
307
vmovdqu 296(%rsp),%ymm12
308
309
addq 8(%rsp),%r9
310
311
/* High-half products. */
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
312
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
313
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
314
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
315
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
316
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
317
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
318
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
319
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
320
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
321
322
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
323
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
324
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
325
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
326
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
327
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
328
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
329
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
330
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
331
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
332
leaq 328(%rsp),%rsp
333
/* Fourth unrolled step: identical sequence for the qword at 24(%r11). */
movq 24(%r11),%r13
334
335
vpbroadcastq 24(%r11),%ymm1
336
movq 0(%rsi),%rdx
337
mulxq %r13,%r13,%r12
338
addq %r13,%r9
339
movq %r12,%r10
340
adcq $0,%r10
341
342
/* Reduction factor for this step. */
movq %r8,%r13
343
imulq %r9,%r13
344
andq %rax,%r13
345
346
vmovq %r13,%xmm2
347
vpbroadcastq %xmm2,%ymm2
348
movq 0(%rcx),%rdx
349
mulxq %r13,%r13,%r12
350
addq %r13,%r9
351
adcq %r12,%r10
352
353
/* %r9 = (%r10:%r9) >> 52. */
shrq $52,%r9
354
salq $12,%r10
355
orq %r10,%r9
356
357
leaq -328(%rsp),%rsp
358
359
/* Low-half products. */
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
360
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
361
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
362
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
363
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
364
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
365
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
366
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
367
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
368
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
369
370
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
371
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
372
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
373
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
374
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
375
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
376
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
377
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
378
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
379
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
380
/* Shift the accumulator vector down by one qword via the stack. */
vmovdqu %ymm3,0(%rsp)
381
vmovdqu %ymm4,32(%rsp)
382
vmovdqu %ymm5,64(%rsp)
383
vmovdqu %ymm6,96(%rsp)
384
vmovdqu %ymm7,128(%rsp)
385
vmovdqu %ymm8,160(%rsp)
386
vmovdqu %ymm9,192(%rsp)
387
vmovdqu %ymm10,224(%rsp)
388
vmovdqu %ymm11,256(%rsp)
389
vmovdqu %ymm12,288(%rsp)
390
movq $0,320(%rsp)
391
392
vmovdqu 8(%rsp),%ymm3
393
vmovdqu 40(%rsp),%ymm4
394
vmovdqu 72(%rsp),%ymm5
395
vmovdqu 104(%rsp),%ymm6
396
vmovdqu 136(%rsp),%ymm7
397
vmovdqu 168(%rsp),%ymm8
398
vmovdqu 200(%rsp),%ymm9
399
vmovdqu 232(%rsp),%ymm10
400
vmovdqu 264(%rsp),%ymm11
401
vmovdqu 296(%rsp),%ymm12
402
403
addq 8(%rsp),%r9
404
405
/* High-half products. */
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
406
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
407
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
408
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
409
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
410
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
411
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
412
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
413
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
414
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
415
416
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
417
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
418
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
419
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
420
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
421
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
422
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
423
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
424
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
425
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
426
leaq 328(%rsp),%rsp
427
/* Advance the %r11 input by the four qwords just consumed and loop. */
leaq 32(%r11),%r11
428
decl %ebx
429
jne .Lloop10
430
431
/*
 * Post-loop normalization of the 40 accumulator lanes in %ymm3-%ymm12 to
 * 52-bit digits, followed by the store to (%rdi).
 *
 * First, lane 0 of %ymm3 is replaced by the scalar accumulator %r9
 * (vpblendd $3 takes the two low dwords from %ymm0).
 */
vmovq %r9,%xmm0
432
vpbroadcastq %xmm0,%ymm0
433
vpblendd $3,%ymm0,%ymm3,%ymm3
434
435
/* 640 bytes of scratch: raw lanes at 0..319, shifted carries at 320..639. */
leaq -640(%rsp),%rsp
436
vmovupd %ymm3,0(%rsp)
437
vmovupd %ymm4,32(%rsp)
438
vmovupd %ymm5,64(%rsp)
439
vmovupd %ymm6,96(%rsp)
440
vmovupd %ymm7,128(%rsp)
441
vmovupd %ymm8,160(%rsp)
442
vmovupd %ymm9,192(%rsp)
443
vmovupd %ymm10,224(%rsp)
444
vmovupd %ymm11,256(%rsp)
445
vmovupd %ymm12,288(%rsp)
446
447
448
449
/* Keep only the bits above bit 51 of every lane (the per-lane carry). */
vpsrlq $52,%ymm3,%ymm3
450
vpsrlq $52,%ymm4,%ymm4
451
vpsrlq $52,%ymm5,%ymm5
452
vpsrlq $52,%ymm6,%ymm6
453
vpsrlq $52,%ymm7,%ymm7
454
vpsrlq $52,%ymm8,%ymm8
455
vpsrlq $52,%ymm9,%ymm9
456
vpsrlq $52,%ymm10,%ymm10
457
vpsrlq $52,%ymm11,%ymm11
458
vpsrlq $52,%ymm12,%ymm12
459
460
461
/*
 * Move every carry up by one lane across the whole 40-lane vector:
 * vpermq $144 shifts lanes 0..2 into 1..3, vpermq $3 puts lane 3 of the
 * previous register into lane 0, and vblendpd $1 merges that lane 0 in.
 * Registers are processed from the top down so each source is still intact.
 */
vpermq $144,%ymm12,%ymm12
462
vpermq $3,%ymm11,%ymm13
463
vblendpd $1,%ymm13,%ymm12,%ymm12
464
465
vpermq $144,%ymm11,%ymm11
466
vpermq $3,%ymm10,%ymm13
467
vblendpd $1,%ymm13,%ymm11,%ymm11
468
469
vpermq $144,%ymm10,%ymm10
470
vpermq $3,%ymm9,%ymm13
471
vblendpd $1,%ymm13,%ymm10,%ymm10
472
473
vpermq $144,%ymm9,%ymm9
474
vpermq $3,%ymm8,%ymm13
475
vblendpd $1,%ymm13,%ymm9,%ymm9
476
477
vpermq $144,%ymm8,%ymm8
478
vpermq $3,%ymm7,%ymm13
479
vblendpd $1,%ymm13,%ymm8,%ymm8
480
481
vpermq $144,%ymm7,%ymm7
482
vpermq $3,%ymm6,%ymm13
483
vblendpd $1,%ymm13,%ymm7,%ymm7
484
485
vpermq $144,%ymm6,%ymm6
486
vpermq $3,%ymm5,%ymm13
487
vblendpd $1,%ymm13,%ymm6,%ymm6
488
489
vpermq $144,%ymm5,%ymm5
490
vpermq $3,%ymm4,%ymm13
491
vblendpd $1,%ymm13,%ymm5,%ymm5
492
493
vpermq $144,%ymm4,%ymm4
494
vpermq $3,%ymm3,%ymm13
495
vblendpd $1,%ymm13,%ymm4,%ymm4
496
497
/* The lowest lane has no incoming carry: clear lane 0 of %ymm3. */
vpermq $144,%ymm3,%ymm3
498
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
499
500
/* Park the shifted carries in the upper half of the scratch area. */
vmovupd %ymm3,320(%rsp)
501
vmovupd %ymm4,352(%rsp)
502
vmovupd %ymm5,384(%rsp)
503
vmovupd %ymm6,416(%rsp)
504
vmovupd %ymm7,448(%rsp)
505
vmovupd %ymm8,480(%rsp)
506
vmovupd %ymm9,512(%rsp)
507
vmovupd %ymm10,544(%rsp)
508
vmovupd %ymm11,576(%rsp)
509
vmovupd %ymm12,608(%rsp)
510
511
/* Reload the raw lanes ... */
vmovupd 0(%rsp),%ymm3
512
vmovupd 32(%rsp),%ymm4
513
vmovupd 64(%rsp),%ymm5
514
vmovupd 96(%rsp),%ymm6
515
vmovupd 128(%rsp),%ymm7
516
vmovupd 160(%rsp),%ymm8
517
vmovupd 192(%rsp),%ymm9
518
vmovupd 224(%rsp),%ymm10
519
vmovupd 256(%rsp),%ymm11
520
vmovupd 288(%rsp),%ymm12
521
522
523
/* ... keep their low 52 bits ... */
vpand .Lmask52x4(%rip),%ymm3,%ymm3
524
vpand .Lmask52x4(%rip),%ymm4,%ymm4
525
vpand .Lmask52x4(%rip),%ymm5,%ymm5
526
vpand .Lmask52x4(%rip),%ymm6,%ymm6
527
vpand .Lmask52x4(%rip),%ymm7,%ymm7
528
vpand .Lmask52x4(%rip),%ymm8,%ymm8
529
vpand .Lmask52x4(%rip),%ymm9,%ymm9
530
vpand .Lmask52x4(%rip),%ymm10,%ymm10
531
vpand .Lmask52x4(%rip),%ymm11,%ymm11
532
vpand .Lmask52x4(%rip),%ymm12,%ymm12
533
534
535
/* ... and add the carry that came from the lane below. */
vpaddq 320(%rsp),%ymm3,%ymm3
536
vpaddq 352(%rsp),%ymm4,%ymm4
537
vpaddq 384(%rsp),%ymm5,%ymm5
538
vpaddq 416(%rsp),%ymm6,%ymm6
539
vpaddq 448(%rsp),%ymm7,%ymm7
540
vpaddq 480(%rsp),%ymm8,%ymm8
541
vpaddq 512(%rsp),%ymm9,%ymm9
542
vpaddq 544(%rsp),%ymm10,%ymm10
543
vpaddq 576(%rsp),%ymm11,%ymm11
544
vpaddq 608(%rsp),%ymm12,%ymm12
545
546
leaq 640(%rsp),%rsp
547
548
549
550
/*
 * Lanes may again exceed 52 bits after the addition. Build a 40-bit mask
 * of lanes that are greater than 2^52 - 1, one bit per lane, packed two
 * registers per byte into %r14b, %r13b, %r12b, %r11b, %r10b (low to high).
 */
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
551
vmovmskpd %ymm13,%r14d
552
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
553
vmovmskpd %ymm13,%r13d
554
shlb $4,%r13b
555
orb %r13b,%r14b
556
557
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
558
vmovmskpd %ymm13,%r13d
559
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
560
vmovmskpd %ymm13,%r12d
561
shlb $4,%r12b
562
orb %r12b,%r13b
563
564
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
565
vmovmskpd %ymm13,%r12d
566
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
567
vmovmskpd %ymm13,%r11d
568
shlb $4,%r11b
569
orb %r11b,%r12b
570
571
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
572
vmovmskpd %ymm13,%r11d
573
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
574
vmovmskpd %ymm13,%r10d
575
shlb $4,%r10b
576
orb %r10b,%r11b
577
578
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
579
vmovmskpd %ymm13,%r10d
580
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
581
vmovmskpd %ymm13,%r9d
582
shlb $4,%r9b
583
orb %r9b,%r10b
584
585
/* Shift the 40-bit "greater" mask left by one bit (add it to itself). */
addb %r14b,%r14b
586
adcb %r13b,%r13b
587
adcb %r12b,%r12b
588
adcb %r11b,%r11b
589
adcb %r10b,%r10b
590
591
592
/*
 * Same packing for lanes that are exactly 2^52 - 1; result in
 * %r9b, %r8b, %dl, %cl, %bl (low to high). This clobbers the argument
 * registers %r8, %rdx and %rcx and the constant in %rax, none of which
 * is read again before the function returns.
 */
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
593
vmovmskpd %ymm13,%r9d
594
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
595
vmovmskpd %ymm13,%r8d
596
shlb $4,%r8b
597
orb %r8b,%r9b
598
599
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
600
vmovmskpd %ymm13,%r8d
601
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
602
vmovmskpd %ymm13,%edx
603
shlb $4,%dl
604
orb %dl,%r8b
605
606
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
607
vmovmskpd %ymm13,%edx
608
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
609
vmovmskpd %ymm13,%ecx
610
shlb $4,%cl
611
orb %cl,%dl
612
613
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
614
vmovmskpd %ymm13,%ecx
615
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
616
vmovmskpd %ymm13,%ebx
617
shlb $4,%bl
618
orb %bl,%cl
619
620
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
621
vmovmskpd %ymm13,%ebx
622
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
623
vmovmskpd %ymm13,%eax
624
shlb $4,%al
625
orb %al,%bl
626
627
/*
 * (greater << 1) + equal, then XOR with equal: the integer addition lets a
 * carry ripple through runs of lanes that are all ones, and the XOR leaves
 * a bit set for every lane whose value must be adjusted below.
 */
addb %r9b,%r14b
628
adcb %r8b,%r13b
629
adcb %dl,%r12b
630
adcb %cl,%r11b
631
adcb %bl,%r10b
632
633
xorb %r9b,%r14b
634
xorb %r8b,%r13b
635
xorb %dl,%r12b
636
xorb %cl,%r11b
637
xorb %bl,%r10b
638
639
/* %r8 becomes the lookup-table base and %r9 a nibble scratch; save both. */
pushq %r9
640
pushq %r8
641
642
leaq .Lkmasklut(%rip),%r8
643
644
/*
 * For each register: take the matching 4-bit nibble, scale it by 32 to
 * index the 32-byte entries of .Lkmasklut, and use the loaded lane mask to
 * select (lane - (2^52 - 1)) in place of the lane. After the final AND
 * with 2^52 - 1 that equals (lane + 1) modulo 2^52 for the selected lanes.
 * Low nibble of %r14b -> %ymm3, high nibble -> %ymm4.
 */
movb %r14b,%r9b
645
andq $0xf,%r14
646
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
647
shlq $5,%r14
648
vmovapd (%r8,%r14,1),%ymm14
649
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
650
651
shrb $4,%r9b
652
andq $0xf,%r9
653
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
654
shlq $5,%r9
655
vmovapd (%r8,%r9,1),%ymm14
656
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
657
658
/* %r13b: low nibble -> %ymm5, high nibble -> %ymm6. */
movb %r13b,%r9b
659
andq $0xf,%r13
660
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
661
shlq $5,%r13
662
vmovapd (%r8,%r13,1),%ymm14
663
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
664
665
shrb $4,%r9b
666
andq $0xf,%r9
667
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
668
shlq $5,%r9
669
vmovapd (%r8,%r9,1),%ymm14
670
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
671
672
/* %r12b: low nibble -> %ymm7, high nibble -> %ymm8. */
movb %r12b,%r9b
673
andq $0xf,%r12
674
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
675
shlq $5,%r12
676
vmovapd (%r8,%r12,1),%ymm14
677
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
678
679
shrb $4,%r9b
680
andq $0xf,%r9
681
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
682
shlq $5,%r9
683
vmovapd (%r8,%r9,1),%ymm14
684
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
685
686
/* %r11b: low nibble -> %ymm9, high nibble -> %ymm10. */
movb %r11b,%r9b
687
andq $0xf,%r11
688
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
689
shlq $5,%r11
690
vmovapd (%r8,%r11,1),%ymm14
691
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
692
693
shrb $4,%r9b
694
andq $0xf,%r9
695
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
696
shlq $5,%r9
697
vmovapd (%r8,%r9,1),%ymm14
698
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
699
700
/* %r10b: low nibble -> %ymm11, high nibble -> %ymm12. */
movb %r10b,%r9b
701
andq $0xf,%r10
702
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
703
shlq $5,%r10
704
vmovapd (%r8,%r10,1),%ymm14
705
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
706
707
shrb $4,%r9b
708
andq $0xf,%r9
709
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
710
shlq $5,%r9
711
vmovapd (%r8,%r9,1),%ymm14
712
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
713
714
/* Restore the two registers saved around the table lookups. */
popq %r8
715
popq %r9
716
717
/* Final reduction of every lane to 52 bits. */
vpand .Lmask52x4(%rip),%ymm3,%ymm3
718
vpand .Lmask52x4(%rip),%ymm4,%ymm4
719
vpand .Lmask52x4(%rip),%ymm5,%ymm5
720
vpand .Lmask52x4(%rip),%ymm6,%ymm6
721
vpand .Lmask52x4(%rip),%ymm7,%ymm7
722
vpand .Lmask52x4(%rip),%ymm8,%ymm8
723
vpand .Lmask52x4(%rip),%ymm9,%ymm9
724
725
vpand .Lmask52x4(%rip),%ymm10,%ymm10
726
vpand .Lmask52x4(%rip),%ymm11,%ymm11
727
vpand .Lmask52x4(%rip),%ymm12,%ymm12
728
729
/* Write the 40 normalized digits (320 bytes) to the destination in %rdi. */
vmovdqu %ymm3,0(%rdi)
730
vmovdqu %ymm4,32(%rdi)
731
vmovdqu %ymm5,64(%rdi)
732
vmovdqu %ymm6,96(%rdi)
733
vmovdqu %ymm7,128(%rdi)
734
vmovdqu %ymm8,160(%rdi)
735
vmovdqu %ymm9,192(%rdi)
736
vmovdqu %ymm10,224(%rdi)
737
vmovdqu %ymm11,256(%rdi)
738
vmovdqu %ymm12,288(%rdi)
739
740
/*
 * Function exit: clear the upper halves of the vector registers, reload the
 * six registers pushed at entry, and return.
 */
vzeroupper
741
/* Use %rax as the frame base while %rsp is being unwound; CFI follows it. */
leaq (%rsp),%rax
742
.cfi_def_cfa_register %rax
743
/* Reload in reverse push order: %r15 was pushed last, so it sits at offset 0. */
movq 0(%rax),%r15
744
.cfi_restore %r15
745
movq 8(%rax),%r14
746
.cfi_restore %r14
747
movq 16(%rax),%r13
748
.cfi_restore %r13
749
movq 24(%rax),%r12
750
.cfi_restore %r12
751
movq 32(%rax),%rbp
752
.cfi_restore %rbp
753
movq 40(%rax),%rbx
754
.cfi_restore %rbx
755
/* Drop the 48 bytes of saved registers; %rsp now points at the return address. */
leaq 48(%rax),%rsp
756
.cfi_def_cfa %rsp,8
757
.Lossl_rsaz_amm52x40_x1_avxifma256_epilogue:
758
759
/* Bytes F3 C3: "rep ret". */
.byte 0xf3,0xc3
760
.cfi_endproc
761
.size ossl_rsaz_amm52x40_x1_avxifma256, .-ossl_rsaz_amm52x40_x1_avxifma256
762
/*
 * Read-only constants. The section is 32-byte aligned and every table below
 * is a multiple of 32 bytes long, so each table starts on a 32-byte boundary.
 */
.section .rodata
763
.align 32
764
/* Four qword lanes of 2^52 - 1: the low-52-bit digit mask. */
.Lmask52x4:
765
.quad 0xfffffffffffff
766
.quad 0xfffffffffffff
767
.quad 0xfffffffffffff
768
.quad 0xfffffffffffff
769
/* AND mask that clears qword lane 0 and keeps lanes 1-3 unchanged. */
.Lhigh64x3:
770
.quad 0x0
771
.quad 0xffffffffffffffff
772
.quad 0xffffffffffffffff
773
.quad 0xffffffffffffffff
774
/*
 * Lookup table that expands a 4-bit mask into a 256-bit lane mask.
 * It has 16 entries of four qwords each (32 bytes per entry); in entry i,
 * qword j is all ones when bit j of i is set and zero otherwise.
 * The code indexes it as base + (nibble << 5).
 */
.Lkmasklut:
775
776
/* Entry 0 (binary 0000). */
.quad 0x0
777
.quad 0x0
778
.quad 0x0
779
.quad 0x0
780
781
/* Entry 1 (binary 0001). */
.quad 0xffffffffffffffff
782
.quad 0x0
783
.quad 0x0
784
.quad 0x0
785
786
/* Entry 2 (binary 0010). */
.quad 0x0
787
.quad 0xffffffffffffffff
788
.quad 0x0
789
.quad 0x0
790
791
/* Entry 3 (binary 0011). */
.quad 0xffffffffffffffff
792
.quad 0xffffffffffffffff
793
.quad 0x0
794
.quad 0x0
795
796
/* Entry 4 (binary 0100). */
.quad 0x0
797
.quad 0x0
798
.quad 0xffffffffffffffff
799
.quad 0x0
800
801
/* Entry 5 (binary 0101). */
.quad 0xffffffffffffffff
802
.quad 0x0
803
.quad 0xffffffffffffffff
804
.quad 0x0
805
806
/* Entry 6 (binary 0110). */
.quad 0x0
807
.quad 0xffffffffffffffff
808
.quad 0xffffffffffffffff
809
.quad 0x0
810
811
/* Entry 7 (binary 0111). */
.quad 0xffffffffffffffff
812
.quad 0xffffffffffffffff
813
.quad 0xffffffffffffffff
814
.quad 0x0
815
816
/* Entry 8 (binary 1000). */
.quad 0x0
817
.quad 0x0
818
.quad 0x0
819
.quad 0xffffffffffffffff
820
821
/* Entry 9 (binary 1001). */
.quad 0xffffffffffffffff
822
.quad 0x0
823
.quad 0x0
824
.quad 0xffffffffffffffff
825
826
/* Entry 10 (binary 1010). */
.quad 0x0
827
.quad 0xffffffffffffffff
828
.quad 0x0
829
.quad 0xffffffffffffffff
830
831
/* Entry 11 (binary 1011). */
.quad 0xffffffffffffffff
832
.quad 0xffffffffffffffff
833
.quad 0x0
834
.quad 0xffffffffffffffff
835
836
/* Entry 12 (binary 1100). */
.quad 0x0
837
.quad 0x0
838
.quad 0xffffffffffffffff
839
.quad 0xffffffffffffffff
840
841
/* Entry 13 (binary 1101). */
.quad 0xffffffffffffffff
842
.quad 0x0
843
.quad 0xffffffffffffffff
844
.quad 0xffffffffffffffff
845
846
/* Entry 14 (binary 1110). */
.quad 0x0
847
.quad 0xffffffffffffffff
848
.quad 0xffffffffffffffff
849
.quad 0xffffffffffffffff
850
851
/* Entry 15 (binary 1111). */
.quad 0xffffffffffffffff
852
.quad 0xffffffffffffffff
853
.quad 0xffffffffffffffff
854
.quad 0xffffffffffffffff
855
.text
856
857
.globl ossl_rsaz_amm52x40_x2_avxifma256
858
.type ossl_rsaz_amm52x40_x2_avxifma256,@function
859
.align 32
860
ossl_rsaz_amm52x40_x2_avxifma256:
861
.cfi_startproc
862
.byte 243,15,30,250
863
pushq %rbx
864
.cfi_adjust_cfa_offset 8
865
.cfi_offset %rbx,-16
866
pushq %rbp
867
.cfi_adjust_cfa_offset 8
868
.cfi_offset %rbp,-24
869
pushq %r12
870
.cfi_adjust_cfa_offset 8
871
.cfi_offset %r12,-32
872
pushq %r13
873
.cfi_adjust_cfa_offset 8
874
.cfi_offset %r13,-40
875
pushq %r14
876
.cfi_adjust_cfa_offset 8
877
.cfi_offset %r14,-48
878
pushq %r15
879
.cfi_adjust_cfa_offset 8
880
.cfi_offset %r15,-56
881
882
vpxor %ymm0,%ymm0,%ymm0
883
vmovapd %ymm0,%ymm3
884
vmovapd %ymm0,%ymm4
885
vmovapd %ymm0,%ymm5
886
vmovapd %ymm0,%ymm6
887
vmovapd %ymm0,%ymm7
888
vmovapd %ymm0,%ymm8
889
vmovapd %ymm0,%ymm9
890
vmovapd %ymm0,%ymm10
891
vmovapd %ymm0,%ymm11
892
vmovapd %ymm0,%ymm12
893
894
xorl %r9d,%r9d
895
896
movq %rdx,%r11
897
movq $0xfffffffffffff,%rax
898
899
movl $40,%ebx
900
901
.align 32
902
.Lloop40:
903
movq 0(%r11),%r13
904
905
vpbroadcastq 0(%r11),%ymm1
906
movq 0(%rsi),%rdx
907
mulxq %r13,%r13,%r12
908
addq %r13,%r9
909
movq %r12,%r10
910
adcq $0,%r10
911
912
movq (%r8),%r13
913
imulq %r9,%r13
914
andq %rax,%r13
915
916
vmovq %r13,%xmm2
917
vpbroadcastq %xmm2,%ymm2
918
movq 0(%rcx),%rdx
919
mulxq %r13,%r13,%r12
920
addq %r13,%r9
921
adcq %r12,%r10
922
923
shrq $52,%r9
924
salq $12,%r10
925
orq %r10,%r9
926
927
leaq -328(%rsp),%rsp
928
929
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
930
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
931
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
932
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
933
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
934
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
935
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
936
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
937
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
938
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
939
940
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
941
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
942
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
943
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
944
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
945
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
946
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
947
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
948
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
949
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
950
vmovdqu %ymm3,0(%rsp)
951
vmovdqu %ymm4,32(%rsp)
952
vmovdqu %ymm5,64(%rsp)
953
vmovdqu %ymm6,96(%rsp)
954
vmovdqu %ymm7,128(%rsp)
955
vmovdqu %ymm8,160(%rsp)
956
vmovdqu %ymm9,192(%rsp)
957
vmovdqu %ymm10,224(%rsp)
958
vmovdqu %ymm11,256(%rsp)
959
vmovdqu %ymm12,288(%rsp)
960
movq $0,320(%rsp)
961
962
vmovdqu 8(%rsp),%ymm3
963
vmovdqu 40(%rsp),%ymm4
964
vmovdqu 72(%rsp),%ymm5
965
vmovdqu 104(%rsp),%ymm6
966
vmovdqu 136(%rsp),%ymm7
967
vmovdqu 168(%rsp),%ymm8
968
vmovdqu 200(%rsp),%ymm9
969
vmovdqu 232(%rsp),%ymm10
970
vmovdqu 264(%rsp),%ymm11
971
vmovdqu 296(%rsp),%ymm12
972
973
addq 8(%rsp),%r9
974
975
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
976
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
977
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
978
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
979
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
980
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
981
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
982
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
983
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
984
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
985
986
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
987
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
988
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
989
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
990
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
991
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
992
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
993
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
994
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
995
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
996
leaq 328(%rsp),%rsp
997
leaq 8(%r11),%r11
998
decl %ebx
999
jne .Lloop40
1000
1001
pushq %r11
1002
pushq %rsi
1003
pushq %rcx
1004
pushq %r8
1005
1006
vmovq %r9,%xmm0
1007
vpbroadcastq %xmm0,%ymm0
1008
vpblendd $3,%ymm0,%ymm3,%ymm3
1009
1010
leaq -640(%rsp),%rsp
1011
vmovupd %ymm3,0(%rsp)
1012
vmovupd %ymm4,32(%rsp)
1013
vmovupd %ymm5,64(%rsp)
1014
vmovupd %ymm6,96(%rsp)
1015
vmovupd %ymm7,128(%rsp)
1016
vmovupd %ymm8,160(%rsp)
1017
vmovupd %ymm9,192(%rsp)
1018
vmovupd %ymm10,224(%rsp)
1019
vmovupd %ymm11,256(%rsp)
1020
vmovupd %ymm12,288(%rsp)
1021
1022
1023
1024
vpsrlq $52,%ymm3,%ymm3
1025
vpsrlq $52,%ymm4,%ymm4
1026
vpsrlq $52,%ymm5,%ymm5
1027
vpsrlq $52,%ymm6,%ymm6
1028
vpsrlq $52,%ymm7,%ymm7
1029
vpsrlq $52,%ymm8,%ymm8
1030
vpsrlq $52,%ymm9,%ymm9
1031
vpsrlq $52,%ymm10,%ymm10
1032
vpsrlq $52,%ymm11,%ymm11
1033
vpsrlq $52,%ymm12,%ymm12
1034
1035
1036
vpermq $144,%ymm12,%ymm12
1037
vpermq $3,%ymm11,%ymm13
1038
vblendpd $1,%ymm13,%ymm12,%ymm12
1039
1040
vpermq $144,%ymm11,%ymm11
1041
vpermq $3,%ymm10,%ymm13
1042
vblendpd $1,%ymm13,%ymm11,%ymm11
1043
1044
vpermq $144,%ymm10,%ymm10
1045
vpermq $3,%ymm9,%ymm13
1046
vblendpd $1,%ymm13,%ymm10,%ymm10
1047
1048
vpermq $144,%ymm9,%ymm9
1049
vpermq $3,%ymm8,%ymm13
1050
vblendpd $1,%ymm13,%ymm9,%ymm9
1051
1052
vpermq $144,%ymm8,%ymm8
1053
vpermq $3,%ymm7,%ymm13
1054
vblendpd $1,%ymm13,%ymm8,%ymm8
1055
1056
vpermq $144,%ymm7,%ymm7
1057
vpermq $3,%ymm6,%ymm13
1058
vblendpd $1,%ymm13,%ymm7,%ymm7
1059
1060
vpermq $144,%ymm6,%ymm6
1061
vpermq $3,%ymm5,%ymm13
1062
vblendpd $1,%ymm13,%ymm6,%ymm6
1063
1064
vpermq $144,%ymm5,%ymm5
1065
vpermq $3,%ymm4,%ymm13
1066
vblendpd $1,%ymm13,%ymm5,%ymm5
1067
1068
vpermq $144,%ymm4,%ymm4
1069
vpermq $3,%ymm3,%ymm13
1070
vblendpd $1,%ymm13,%ymm4,%ymm4
1071
1072
vpermq $144,%ymm3,%ymm3
1073
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
1074
1075
vmovupd %ymm3,320(%rsp)
1076
vmovupd %ymm4,352(%rsp)
1077
vmovupd %ymm5,384(%rsp)
1078
vmovupd %ymm6,416(%rsp)
1079
vmovupd %ymm7,448(%rsp)
1080
vmovupd %ymm8,480(%rsp)
1081
vmovupd %ymm9,512(%rsp)
1082
vmovupd %ymm10,544(%rsp)
1083
vmovupd %ymm11,576(%rsp)
1084
vmovupd %ymm12,608(%rsp)
1085
1086
vmovupd 0(%rsp),%ymm3
1087
vmovupd 32(%rsp),%ymm4
1088
vmovupd 64(%rsp),%ymm5
1089
vmovupd 96(%rsp),%ymm6
1090
vmovupd 128(%rsp),%ymm7
1091
vmovupd 160(%rsp),%ymm8
1092
vmovupd 192(%rsp),%ymm9
1093
vmovupd 224(%rsp),%ymm10
1094
vmovupd 256(%rsp),%ymm11
1095
vmovupd 288(%rsp),%ymm12
1096
1097
1098
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1099
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1100
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1101
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1102
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1103
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1104
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1105
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1106
vpand .Lmask52x4(%rip),%ymm11,%ymm11
1107
vpand .Lmask52x4(%rip),%ymm12,%ymm12
1108
1109
1110
vpaddq 320(%rsp),%ymm3,%ymm3
1111
vpaddq 352(%rsp),%ymm4,%ymm4
1112
vpaddq 384(%rsp),%ymm5,%ymm5
1113
vpaddq 416(%rsp),%ymm6,%ymm6
1114
vpaddq 448(%rsp),%ymm7,%ymm7
1115
vpaddq 480(%rsp),%ymm8,%ymm8
1116
vpaddq 512(%rsp),%ymm9,%ymm9
1117
vpaddq 544(%rsp),%ymm10,%ymm10
1118
vpaddq 576(%rsp),%ymm11,%ymm11
1119
vpaddq 608(%rsp),%ymm12,%ymm12
1120
1121
leaq 640(%rsp),%rsp
1122
1123
1124
1125
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
1126
vmovmskpd %ymm13,%r14d
1127
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
1128
vmovmskpd %ymm13,%r13d
1129
shlb $4,%r13b
1130
orb %r13b,%r14b
1131
1132
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
1133
vmovmskpd %ymm13,%r13d
1134
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
1135
vmovmskpd %ymm13,%r12d
1136
shlb $4,%r12b
1137
orb %r12b,%r13b
1138
1139
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
1140
vmovmskpd %ymm13,%r12d
1141
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
1142
vmovmskpd %ymm13,%r11d
1143
shlb $4,%r11b
1144
orb %r11b,%r12b
1145
1146
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
1147
vmovmskpd %ymm13,%r11d
1148
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
1149
vmovmskpd %ymm13,%r10d
1150
shlb $4,%r10b
1151
orb %r10b,%r11b
1152
1153
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
1154
vmovmskpd %ymm13,%r10d
1155
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
1156
vmovmskpd %ymm13,%r9d
1157
shlb $4,%r9b
1158
orb %r9b,%r10b
1159
1160
addb %r14b,%r14b
1161
adcb %r13b,%r13b
1162
adcb %r12b,%r12b
1163
adcb %r11b,%r11b
1164
adcb %r10b,%r10b
1165
1166
1167
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
1168
vmovmskpd %ymm13,%r9d
1169
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
1170
vmovmskpd %ymm13,%r8d
1171
shlb $4,%r8b
1172
orb %r8b,%r9b
1173
1174
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
1175
vmovmskpd %ymm13,%r8d
1176
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
1177
vmovmskpd %ymm13,%edx
1178
shlb $4,%dl
1179
orb %dl,%r8b
1180
1181
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
1182
vmovmskpd %ymm13,%edx
1183
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
1184
vmovmskpd %ymm13,%ecx
1185
shlb $4,%cl
1186
orb %cl,%dl
1187
1188
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
1189
vmovmskpd %ymm13,%ecx
1190
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
1191
vmovmskpd %ymm13,%ebx
1192
shlb $4,%bl
1193
orb %bl,%cl
1194
1195
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
1196
vmovmskpd %ymm13,%ebx
1197
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
1198
vmovmskpd %ymm13,%eax
1199
shlb $4,%al
1200
orb %al,%bl
1201
1202
addb %r9b,%r14b
1203
adcb %r8b,%r13b
1204
adcb %dl,%r12b
1205
adcb %cl,%r11b
1206
adcb %bl,%r10b
1207
1208
xorb %r9b,%r14b
1209
xorb %r8b,%r13b
1210
xorb %dl,%r12b
1211
xorb %cl,%r11b
1212
xorb %bl,%r10b
1213
1214
pushq %r9
1215
pushq %r8
1216
1217
leaq .Lkmasklut(%rip),%r8
1218
1219
movb %r14b,%r9b
1220
andq $0xf,%r14
1221
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
1222
shlq $5,%r14
1223
vmovapd (%r8,%r14,1),%ymm14
1224
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
1225
1226
shrb $4,%r9b
1227
andq $0xf,%r9
1228
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
1229
shlq $5,%r9
1230
vmovapd (%r8,%r9,1),%ymm14
1231
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
1232
1233
movb %r13b,%r9b
1234
andq $0xf,%r13
1235
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
1236
shlq $5,%r13
1237
vmovapd (%r8,%r13,1),%ymm14
1238
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
1239
1240
shrb $4,%r9b
1241
andq $0xf,%r9
1242
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
1243
shlq $5,%r9
1244
vmovapd (%r8,%r9,1),%ymm14
1245
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
1246
1247
movb %r12b,%r9b
1248
andq $0xf,%r12
1249
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
1250
shlq $5,%r12
1251
vmovapd (%r8,%r12,1),%ymm14
1252
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
1253
1254
shrb $4,%r9b
1255
andq $0xf,%r9
1256
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
1257
shlq $5,%r9
1258
vmovapd (%r8,%r9,1),%ymm14
1259
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
1260
1261
movb %r11b,%r9b
1262
andq $0xf,%r11
1263
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
1264
shlq $5,%r11
1265
vmovapd (%r8,%r11,1),%ymm14
1266
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
1267
1268
shrb $4,%r9b
1269
andq $0xf,%r9
1270
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
1271
shlq $5,%r9
1272
vmovapd (%r8,%r9,1),%ymm14
1273
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
1274
1275
movb %r10b,%r9b
1276
andq $0xf,%r10
1277
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
1278
shlq $5,%r10
1279
vmovapd (%r8,%r10,1),%ymm14
1280
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
1281
1282
shrb $4,%r9b
1283
andq $0xf,%r9
1284
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
1285
shlq $5,%r9
1286
vmovapd (%r8,%r9,1),%ymm14
1287
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
1288
1289
popq %r8
1290
popq %r9
1291
1292
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1293
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1294
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1295
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1296
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1297
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1298
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1299
1300
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1301
vpand .Lmask52x4(%rip),%ymm11,%ymm11
1302
vpand .Lmask52x4(%rip),%ymm12,%ymm12
1303
1304
popq %r8
1305
popq %rcx
1306
popq %rsi
1307
popq %r11
1308
1309
vmovdqu %ymm3,0(%rdi)
1310
vmovdqu %ymm4,32(%rdi)
1311
vmovdqu %ymm5,64(%rdi)
1312
vmovdqu %ymm6,96(%rdi)
1313
vmovdqu %ymm7,128(%rdi)
1314
vmovdqu %ymm8,160(%rdi)
1315
vmovdqu %ymm9,192(%rdi)
1316
vmovdqu %ymm10,224(%rdi)
1317
vmovdqu %ymm11,256(%rdi)
1318
vmovdqu %ymm12,288(%rdi)
1319
1320
xorl %r15d,%r15d
1321
1322
movq $0xfffffffffffff,%rax
1323
1324
movl $40,%ebx
1325
1326
vpxor %ymm0,%ymm0,%ymm0
1327
vmovapd %ymm0,%ymm3
1328
vmovapd %ymm0,%ymm4
1329
vmovapd %ymm0,%ymm5
1330
vmovapd %ymm0,%ymm6
1331
vmovapd %ymm0,%ymm7
1332
vmovapd %ymm0,%ymm8
1333
vmovapd %ymm0,%ymm9
1334
vmovapd %ymm0,%ymm10
1335
vmovapd %ymm0,%ymm11
1336
vmovapd %ymm0,%ymm12
1337
.align 32
1338
.Lloop40_1:
1339
movq 0(%r11),%r13
1340
1341
vpbroadcastq 0(%r11),%ymm1
1342
movq 320(%rsi),%rdx
1343
mulxq %r13,%r13,%r12
1344
addq %r13,%r9
1345
movq %r12,%r10
1346
adcq $0,%r10
1347
1348
movq 8(%r8),%r13
1349
imulq %r9,%r13
1350
andq %rax,%r13
1351
1352
vmovq %r13,%xmm2
1353
vpbroadcastq %xmm2,%ymm2
1354
movq 320(%rcx),%rdx
1355
mulxq %r13,%r13,%r12
1356
addq %r13,%r9
1357
adcq %r12,%r10
1358
1359
shrq $52,%r9
1360
salq $12,%r10
1361
orq %r10,%r9
1362
1363
leaq -328(%rsp),%rsp
1364
1365
{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm3
1366
{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm4
1367
{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm5
1368
{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm6
1369
{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm7
1370
{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm8
1371
{vex} vpmadd52luq 512(%rsi),%ymm1,%ymm9
1372
{vex} vpmadd52luq 544(%rsi),%ymm1,%ymm10
1373
{vex} vpmadd52luq 576(%rsi),%ymm1,%ymm11
1374
{vex} vpmadd52luq 608(%rsi),%ymm1,%ymm12
1375
1376
{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm3
1377
{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm4
1378
{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm5
1379
{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm6
1380
{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm7
1381
{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm8
1382
{vex} vpmadd52luq 512(%rcx),%ymm2,%ymm9
1383
{vex} vpmadd52luq 544(%rcx),%ymm2,%ymm10
1384
{vex} vpmadd52luq 576(%rcx),%ymm2,%ymm11
1385
{vex} vpmadd52luq 608(%rcx),%ymm2,%ymm12
1386
vmovdqu %ymm3,0(%rsp)
1387
vmovdqu %ymm4,32(%rsp)
1388
vmovdqu %ymm5,64(%rsp)
1389
vmovdqu %ymm6,96(%rsp)
1390
vmovdqu %ymm7,128(%rsp)
1391
vmovdqu %ymm8,160(%rsp)
1392
vmovdqu %ymm9,192(%rsp)
1393
vmovdqu %ymm10,224(%rsp)
1394
vmovdqu %ymm11,256(%rsp)
1395
vmovdqu %ymm12,288(%rsp)
1396
movq $0,320(%rsp)
1397
1398
vmovdqu 8(%rsp),%ymm3
1399
vmovdqu 40(%rsp),%ymm4
1400
vmovdqu 72(%rsp),%ymm5
1401
vmovdqu 104(%rsp),%ymm6
1402
vmovdqu 136(%rsp),%ymm7
1403
vmovdqu 168(%rsp),%ymm8
1404
vmovdqu 200(%rsp),%ymm9
1405
vmovdqu 232(%rsp),%ymm10
1406
vmovdqu 264(%rsp),%ymm11
1407
vmovdqu 296(%rsp),%ymm12
1408
1409
addq 8(%rsp),%r9
1410
1411
{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm3
1412
{vex} vpmadd52huq 352(%rsi),%ymm1,%ymm4
1413
{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm5
1414
{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm6
1415
{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm7
1416
{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm8
1417
{vex} vpmadd52huq 512(%rsi),%ymm1,%ymm9
1418
{vex} vpmadd52huq 544(%rsi),%ymm1,%ymm10
1419
{vex} vpmadd52huq 576(%rsi),%ymm1,%ymm11
1420
{vex} vpmadd52huq 608(%rsi),%ymm1,%ymm12
1421
1422
{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm3
1423
{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm4
1424
{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm5
1425
{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm6
1426
{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm7
1427
{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm8
1428
{vex} vpmadd52huq 512(%rcx),%ymm2,%ymm9
1429
{vex} vpmadd52huq 544(%rcx),%ymm2,%ymm10
1430
{vex} vpmadd52huq 576(%rcx),%ymm2,%ymm11
1431
{vex} vpmadd52huq 608(%rcx),%ymm2,%ymm12
1432
leaq 328(%rsp),%rsp
1433
leaq 8(%r11),%r11
1434
decl %ebx
1435
jne .Lloop40_1
1436
1437
vmovq %r9,%xmm0
1438
vpbroadcastq %xmm0,%ymm0
1439
vpblendd $3,%ymm0,%ymm3,%ymm3
1440
1441
leaq -640(%rsp),%rsp
1442
vmovupd %ymm3,0(%rsp)
1443
vmovupd %ymm4,32(%rsp)
1444
vmovupd %ymm5,64(%rsp)
1445
vmovupd %ymm6,96(%rsp)
1446
vmovupd %ymm7,128(%rsp)
1447
vmovupd %ymm8,160(%rsp)
1448
vmovupd %ymm9,192(%rsp)
1449
vmovupd %ymm10,224(%rsp)
1450
vmovupd %ymm11,256(%rsp)
1451
vmovupd %ymm12,288(%rsp)
1452
1453
1454
1455
vpsrlq $52,%ymm3,%ymm3
1456
vpsrlq $52,%ymm4,%ymm4
1457
vpsrlq $52,%ymm5,%ymm5
1458
vpsrlq $52,%ymm6,%ymm6
1459
vpsrlq $52,%ymm7,%ymm7
1460
vpsrlq $52,%ymm8,%ymm8
1461
vpsrlq $52,%ymm9,%ymm9
1462
vpsrlq $52,%ymm10,%ymm10
1463
vpsrlq $52,%ymm11,%ymm11
1464
vpsrlq $52,%ymm12,%ymm12
1465
1466
1467
vpermq $144,%ymm12,%ymm12
1468
vpermq $3,%ymm11,%ymm13
1469
vblendpd $1,%ymm13,%ymm12,%ymm12
1470
1471
vpermq $144,%ymm11,%ymm11
1472
vpermq $3,%ymm10,%ymm13
1473
vblendpd $1,%ymm13,%ymm11,%ymm11
1474
1475
vpermq $144,%ymm10,%ymm10
1476
vpermq $3,%ymm9,%ymm13
1477
vblendpd $1,%ymm13,%ymm10,%ymm10
1478
1479
vpermq $144,%ymm9,%ymm9
1480
vpermq $3,%ymm8,%ymm13
1481
vblendpd $1,%ymm13,%ymm9,%ymm9
1482
1483
vpermq $144,%ymm8,%ymm8
1484
vpermq $3,%ymm7,%ymm13
1485
vblendpd $1,%ymm13,%ymm8,%ymm8
1486
1487
vpermq $144,%ymm7,%ymm7
1488
vpermq $3,%ymm6,%ymm13
1489
vblendpd $1,%ymm13,%ymm7,%ymm7
1490
1491
vpermq $144,%ymm6,%ymm6
1492
vpermq $3,%ymm5,%ymm13
1493
vblendpd $1,%ymm13,%ymm6,%ymm6
1494
1495
vpermq $144,%ymm5,%ymm5
1496
vpermq $3,%ymm4,%ymm13
1497
vblendpd $1,%ymm13,%ymm5,%ymm5
1498
1499
vpermq $144,%ymm4,%ymm4
1500
vpermq $3,%ymm3,%ymm13
1501
vblendpd $1,%ymm13,%ymm4,%ymm4
1502
1503
vpermq $144,%ymm3,%ymm3
1504
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
1505
1506
vmovupd %ymm3,320(%rsp)
1507
vmovupd %ymm4,352(%rsp)
1508
vmovupd %ymm5,384(%rsp)
1509
vmovupd %ymm6,416(%rsp)
1510
vmovupd %ymm7,448(%rsp)
1511
vmovupd %ymm8,480(%rsp)
1512
vmovupd %ymm9,512(%rsp)
1513
vmovupd %ymm10,544(%rsp)
1514
vmovupd %ymm11,576(%rsp)
1515
vmovupd %ymm12,608(%rsp)
1516
1517
vmovupd 0(%rsp),%ymm3
1518
vmovupd 32(%rsp),%ymm4
1519
vmovupd 64(%rsp),%ymm5
1520
vmovupd 96(%rsp),%ymm6
1521
vmovupd 128(%rsp),%ymm7
1522
vmovupd 160(%rsp),%ymm8
1523
vmovupd 192(%rsp),%ymm9
1524
vmovupd 224(%rsp),%ymm10
1525
vmovupd 256(%rsp),%ymm11
1526
vmovupd 288(%rsp),%ymm12
1527
1528
1529
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1530
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1531
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1532
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1533
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1534
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1535
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1536
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1537
vpand .Lmask52x4(%rip),%ymm11,%ymm11
1538
vpand .Lmask52x4(%rip),%ymm12,%ymm12
1539
1540
1541
vpaddq 320(%rsp),%ymm3,%ymm3
1542
vpaddq 352(%rsp),%ymm4,%ymm4
1543
vpaddq 384(%rsp),%ymm5,%ymm5
1544
vpaddq 416(%rsp),%ymm6,%ymm6
1545
vpaddq 448(%rsp),%ymm7,%ymm7
1546
vpaddq 480(%rsp),%ymm8,%ymm8
1547
vpaddq 512(%rsp),%ymm9,%ymm9
1548
vpaddq 544(%rsp),%ymm10,%ymm10
1549
vpaddq 576(%rsp),%ymm11,%ymm11
1550
vpaddq 608(%rsp),%ymm12,%ymm12
1551
1552
leaq 640(%rsp),%rsp
1553
1554
1555
1556
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
1557
vmovmskpd %ymm13,%r14d
1558
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
1559
vmovmskpd %ymm13,%r13d
1560
shlb $4,%r13b
1561
orb %r13b,%r14b
1562
1563
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
1564
vmovmskpd %ymm13,%r13d
1565
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
1566
vmovmskpd %ymm13,%r12d
1567
shlb $4,%r12b
1568
orb %r12b,%r13b
1569
1570
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
1571
vmovmskpd %ymm13,%r12d
1572
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
1573
vmovmskpd %ymm13,%r11d
1574
shlb $4,%r11b
1575
orb %r11b,%r12b
1576
1577
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
1578
vmovmskpd %ymm13,%r11d
1579
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
1580
vmovmskpd %ymm13,%r10d
1581
shlb $4,%r10b
1582
orb %r10b,%r11b
1583
1584
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
1585
vmovmskpd %ymm13,%r10d
1586
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
1587
vmovmskpd %ymm13,%r9d
1588
shlb $4,%r9b
1589
orb %r9b,%r10b
1590
1591
addb %r14b,%r14b
1592
adcb %r13b,%r13b
1593
adcb %r12b,%r12b
1594
adcb %r11b,%r11b
1595
adcb %r10b,%r10b
1596
1597
1598
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
1599
vmovmskpd %ymm13,%r9d
1600
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
1601
vmovmskpd %ymm13,%r8d
1602
shlb $4,%r8b
1603
orb %r8b,%r9b
1604
1605
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
1606
vmovmskpd %ymm13,%r8d
1607
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
1608
vmovmskpd %ymm13,%edx
1609
shlb $4,%dl
1610
orb %dl,%r8b
1611
1612
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
1613
vmovmskpd %ymm13,%edx
1614
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
1615
vmovmskpd %ymm13,%ecx
1616
shlb $4,%cl
1617
orb %cl,%dl
1618
1619
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
1620
vmovmskpd %ymm13,%ecx
1621
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
1622
vmovmskpd %ymm13,%ebx
1623
shlb $4,%bl
1624
orb %bl,%cl
1625
1626
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
1627
vmovmskpd %ymm13,%ebx
1628
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
1629
vmovmskpd %ymm13,%eax
1630
shlb $4,%al
1631
orb %al,%bl
1632
1633
addb %r9b,%r14b
1634
adcb %r8b,%r13b
1635
adcb %dl,%r12b
1636
adcb %cl,%r11b
1637
adcb %bl,%r10b
1638
1639
xorb %r9b,%r14b
1640
xorb %r8b,%r13b
1641
xorb %dl,%r12b
1642
xorb %cl,%r11b
1643
xorb %bl,%r10b
1644
1645
pushq %r9
1646
pushq %r8
1647
1648
leaq .Lkmasklut(%rip),%r8
1649
1650
movb %r14b,%r9b
1651
andq $0xf,%r14
1652
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
1653
shlq $5,%r14
1654
vmovapd (%r8,%r14,1),%ymm14
1655
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
1656
1657
shrb $4,%r9b
1658
andq $0xf,%r9
1659
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
1660
shlq $5,%r9
1661
vmovapd (%r8,%r9,1),%ymm14
1662
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
1663
1664
movb %r13b,%r9b
1665
andq $0xf,%r13
1666
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
1667
shlq $5,%r13
1668
vmovapd (%r8,%r13,1),%ymm14
1669
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
1670
1671
shrb $4,%r9b
1672
andq $0xf,%r9
1673
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
1674
shlq $5,%r9
1675
vmovapd (%r8,%r9,1),%ymm14
1676
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
1677
1678
movb %r12b,%r9b
1679
andq $0xf,%r12
1680
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
1681
shlq $5,%r12
1682
vmovapd (%r8,%r12,1),%ymm14
1683
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
1684
1685
shrb $4,%r9b
1686
andq $0xf,%r9
1687
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
1688
shlq $5,%r9
1689
vmovapd (%r8,%r9,1),%ymm14
1690
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
1691
1692
movb %r11b,%r9b
1693
andq $0xf,%r11
1694
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
1695
shlq $5,%r11
1696
vmovapd (%r8,%r11,1),%ymm14
1697
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
1698
1699
shrb $4,%r9b
1700
andq $0xf,%r9
1701
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
1702
shlq $5,%r9
1703
vmovapd (%r8,%r9,1),%ymm14
1704
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
1705
1706
movb %r10b,%r9b
1707
andq $0xf,%r10
1708
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
1709
shlq $5,%r10
1710
vmovapd (%r8,%r10,1),%ymm14
1711
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
1712
1713
shrb $4,%r9b
1714
andq $0xf,%r9
1715
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
1716
shlq $5,%r9
1717
vmovapd (%r8,%r9,1),%ymm14
1718
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
1719
1720
popq %r8
1721
popq %r9
1722
1723
vpand .Lmask52x4(%rip),%ymm3,%ymm3
1724
vpand .Lmask52x4(%rip),%ymm4,%ymm4
1725
vpand .Lmask52x4(%rip),%ymm5,%ymm5
1726
vpand .Lmask52x4(%rip),%ymm6,%ymm6
1727
vpand .Lmask52x4(%rip),%ymm7,%ymm7
1728
vpand .Lmask52x4(%rip),%ymm8,%ymm8
1729
vpand .Lmask52x4(%rip),%ymm9,%ymm9
1730
1731
vpand .Lmask52x4(%rip),%ymm10,%ymm10
1732
vpand .Lmask52x4(%rip),%ymm11,%ymm11
1733
vpand .Lmask52x4(%rip),%ymm12,%ymm12
1734
1735
vmovdqu %ymm3,320(%rdi)
1736
vmovdqu %ymm4,352(%rdi)
1737
vmovdqu %ymm5,384(%rdi)
1738
vmovdqu %ymm6,416(%rdi)
1739
vmovdqu %ymm7,448(%rdi)
1740
vmovdqu %ymm8,480(%rdi)
1741
vmovdqu %ymm9,512(%rdi)
1742
vmovdqu %ymm10,544(%rdi)
1743
vmovdqu %ymm11,576(%rdi)
1744
vmovdqu %ymm12,608(%rdi)
1745
1746
vzeroupper
1747
leaq (%rsp),%rax
1748
.cfi_def_cfa_register %rax
1749
movq 0(%rax),%r15
1750
.cfi_restore %r15
1751
movq 8(%rax),%r14
1752
.cfi_restore %r14
1753
movq 16(%rax),%r13
1754
.cfi_restore %r13
1755
movq 24(%rax),%r12
1756
.cfi_restore %r12
1757
movq 32(%rax),%rbp
1758
.cfi_restore %rbp
1759
movq 40(%rax),%rbx
1760
.cfi_restore %rbx
1761
leaq 48(%rax),%rsp
1762
.cfi_def_cfa %rsp,8
1763
.Lossl_rsaz_amm52x40_x2_avxifma256_epilogue:
1764
.byte 0xf3,0xc3
1765
.cfi_endproc
1766
.size ossl_rsaz_amm52x40_x2_avxifma256, .-ossl_rsaz_amm52x40_x2_avxifma256
1767
.text

/*
 * ossl_extract_multiplier_2x40_win5_avx
 *
 * Selects one entry from a precomputed table for each of two independent
 * indices and writes the two selections to the output buffer.
 *
 * ABI: System V AMD64 (ELF).
 *
 * In:
 *   %rdi  output buffer, 640 bytes are written:
 *           bytes   0..319  <- first half of the entry selected by %rdx
 *           bytes 320..639  <- second half of the entry selected by %rcx
 *   %rsi  table base; 20480 bytes are read, i.e. 32 entries of 640 bytes
 *   %rdx  index applied to bytes   0..319 of every entry
 *   %rcx  index applied to bytes 320..639 of every entry
 *
 * NOTE(review): presumably the C prototype is
 *   void f(BN_ULONG *out, const BN_ULONG *table, int idx1, int idx2);
 * confirm against the C caller. The full 64-bit %rdx / %rcx values are
 * compared, not just their low 32 bits.
 *
 * Out:    nothing in registers.
 * Clobb:  %rax, %rsi, %r10, %ymm0-%ymm15, flags.
 *
 * Every table entry is loaded on every call and the wanted one is kept
 * with a masked blend, so neither the branch pattern nor the addresses
 * touched depend on the index values.
 *
 * NOTE(review): %ymm0-%ymm9 are not re-zeroed before the second loop, so
 * if %rcx matches none of the entries 0..31 the second half of the output
 * receives the first-half selection instead of zeros.
 *
 * NOTE(review): the routine returns without vzeroupper, unlike the
 * amm52x40_x2 routine earlier in this file - confirm this is intended.
 */
.align 32
.globl ossl_extract_multiplier_2x40_win5_avx
.type ossl_extract_multiplier_2x40_win5_avx,@function
ossl_extract_multiplier_2x40_win5_avx:
.cfi_startproc
        .byte 243,15,30,250                     /* endbr64 (f3 0f 1e fa) */
        vmovapd .Lones(%rip),%ymm14             /* ymm14 = {1,1,1,1}: per-iteration counter step */
        vmovq %rdx,%xmm10
        vpbroadcastq %xmm10,%ymm12              /* ymm12 = first index in all four lanes */
        vmovq %rcx,%xmm10
        vpbroadcastq %xmm10,%ymm13              /* ymm13 = second index in all four lanes */
        leaq 20480(%rsi),%rax                   /* rax = end of table (32 * 640 bytes) */

        movq %rsi,%r10                          /* keep table base for the second pass */

        /* Zero the ten 256-bit accumulators ymm0..ymm9 (the VEX-encoded
           xmm write also clears the upper half of ymm0). */
        vpxor %xmm0,%xmm0,%xmm0
        vmovapd %ymm0,%ymm1
        vmovapd %ymm0,%ymm2
        vmovapd %ymm0,%ymm3
        vmovapd %ymm0,%ymm4
        vmovapd %ymm0,%ymm5
        vmovapd %ymm0,%ymm6
        vmovapd %ymm0,%ymm7
        vmovapd %ymm0,%ymm8
        vmovapd %ymm0,%ymm9
        vpxor %ymm11,%ymm11,%ymm11              /* ymm11 = running entry number, starts at 0 */

/* Pass 1: scan bytes 0..319 of every entry, keep the one whose number
   equals the first index. */
.align 32
.Lloop_0:
        vpcmpeqq %ymm11,%ymm12,%ymm15           /* ymm15 = all-ones lanes iff entry number == index */
        vmovdqu 0(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm0,%ymm0     /* take loaded data where mask set, else keep */
        vmovdqu 32(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm1,%ymm1
        vmovdqu 64(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm2,%ymm2
        vmovdqu 96(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm3,%ymm3
        vmovdqu 128(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm4,%ymm4
        vmovdqu 160(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm5,%ymm5
        vmovdqu 192(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm6,%ymm6
        vmovdqu 224(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm7,%ymm7
        vmovdqu 256(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm8,%ymm8
        vmovdqu 288(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm9,%ymm9
        vpaddq %ymm14,%ymm11,%ymm11             /* next entry number */
        addq $640,%rsi                          /* next entry (640-byte stride) */
        cmpq %rsi,%rax
        jne .Lloop_0

        /* Store the first selection to output bytes 0..319. */
        vmovdqu %ymm0,0(%rdi)
        vmovdqu %ymm1,32(%rdi)
        vmovdqu %ymm2,64(%rdi)
        vmovdqu %ymm3,96(%rdi)
        vmovdqu %ymm4,128(%rdi)
        vmovdqu %ymm5,160(%rdi)
        vmovdqu %ymm6,192(%rdi)
        vmovdqu %ymm7,224(%rdi)
        vmovdqu %ymm8,256(%rdi)
        vmovdqu %ymm9,288(%rdi)
        movq %r10,%rsi                          /* rewind to table base */
        vpxor %ymm11,%ymm11,%ymm11              /* restart entry numbering at 0 */

/* Pass 2: scan bytes 320..639 of every entry, keep the one whose number
   equals the second index. Accumulators still hold the pass-1 result. */
.align 32
.Lloop_320:
        vpcmpeqq %ymm11,%ymm13,%ymm15
        vmovdqu 320(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm0,%ymm0
        vmovdqu 352(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm1,%ymm1
        vmovdqu 384(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm2,%ymm2
        vmovdqu 416(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm3,%ymm3
        vmovdqu 448(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm4,%ymm4
        vmovdqu 480(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm5,%ymm5
        vmovdqu 512(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm6,%ymm6
        vmovdqu 544(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm7,%ymm7
        vmovdqu 576(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm8,%ymm8
        vmovdqu 608(%rsi),%ymm10

        vblendvpd %ymm15,%ymm10,%ymm9,%ymm9
        vpaddq %ymm14,%ymm11,%ymm11
        addq $640,%rsi
        cmpq %rsi,%rax
        jne .Lloop_320

        /* Store the second selection to output bytes 320..639. */
        vmovdqu %ymm0,320(%rdi)
        vmovdqu %ymm1,352(%rdi)
        vmovdqu %ymm2,384(%rdi)
        vmovdqu %ymm3,416(%rdi)
        vmovdqu %ymm4,448(%rdi)
        vmovdqu %ymm5,480(%rdi)
        vmovdqu %ymm6,512(%rdi)
        vmovdqu %ymm7,544(%rdi)
        vmovdqu %ymm8,576(%rdi)
        vmovdqu %ymm9,608(%rdi)

        .byte 0xf3,0xc3                         /* rep ret */
.cfi_endproc
.size ossl_extract_multiplier_2x40_win5_avx, .-ossl_extract_multiplier_2x40_win5_avx
1897
.section .rodata
/* 32-byte alignment is required: .Lones is loaded with vmovapd into a
   ymm register by ossl_extract_multiplier_2x40_win5_avx. */
.align 32
/* Four 64-bit lanes of 1: added to the per-lane entry counter on every
   iteration of the table-scan loops. */
.Lones:
        .quad 1,1,1,1
/* Four 64-bit lanes of 0. NOTE(review): no reference to .Lzeros is
   visible in this part of the file - confirm whether it is still used. */
.Lzeros:
        .quad 0,0,0,0
1903
/*
 * ELF note describing GNU program properties for this object.
 * Layout: note header (namesz, descsz, type), name, then one property
 * record (pr_type, pr_datasz, pr_data) padded to 8 bytes.
 */
.section ".note.gnu.property", "a"
.p2align 3
        .long 1f - 0f                   /* n_namesz: length of the name below (4) */
        .long 4f - 1f                   /* n_descsz: length of the descriptor */
        .long 5                         /* n_type: NT_GNU_PROPERTY_TYPE_0 */
0:
        /* Name "GNU" spelled out with .byte, since .asciz is not
           supported on Solaris. */
        .byte 0x47
        .byte 0x4e
        .byte 0x55
        .byte 0
1:
.p2align 3
        .long 0xc0000002                /* pr_type: GNU_PROPERTY_X86_FEATURE_1_AND */
        .long 3f - 2f                   /* pr_datasz: size of the value below (4) */
2:
        .long 3                         /* pr_data: bit 0 = IBT, bit 1 = SHSTK */
3:
.p2align 3
4:
1924
1925