Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/amd64/rsaz-3k-avx512.S
39482 views
1
/* Do not modify. This file is auto-generated from rsaz-3k-avx512.pl. */
2
.text
3
4
# ossl_rsaz_amm52x30_x1_ifma256
#
# Register roles as used by the code of this routine:
#   %rdi  output buffer; eight 32-byte vectors are stored to it at the end
#   %rsi  first operand, read as eight 32-byte vectors (qwords 0..31)
#   %rdx  second operand; copied to %r11 and consumed one qword per step
#   %rcx  third operand, read with the same layout as %rsi
#         (presumably the modulus -- confirm against the C prototype)
#   %r8   scalar that is multiplied into the running low word to form the
#         per-step factor (presumably the Montgomery constant -- confirm)
# Values are handled as 52-bit digits held one per 64-bit qword (see the
# 52-bit mask and the vpmadd52 instructions).  The routine performs
# 7*4 + 2 = 30 multiply-accumulate steps, then normalises the carries.
.globl ossl_rsaz_amm52x30_x1_ifma256
5
.type ossl_rsaz_amm52x30_x1_ifma256,@function
6
.align 32
7
ossl_rsaz_amm52x30_x1_ifma256:
8
.cfi_startproc
9
# Bytes F3 0F 1E FA encode endbr64.
.byte 243,15,30,250
10
# Save the six callee-saved integer registers; each push is mirrored by
# CFI directives so unwinders can find the saved values.
pushq %rbx
11
.cfi_adjust_cfa_offset 8
12
.cfi_offset %rbx,-16
13
pushq %rbp
14
.cfi_adjust_cfa_offset 8
15
.cfi_offset %rbp,-24
16
pushq %r12
17
.cfi_adjust_cfa_offset 8
18
.cfi_offset %r12,-32
19
pushq %r13
20
.cfi_adjust_cfa_offset 8
21
.cfi_offset %r13,-40
22
pushq %r14
23
.cfi_adjust_cfa_offset 8
24
.cfi_offset %r14,-48
25
pushq %r15
26
.cfi_adjust_cfa_offset 8
27
.cfi_offset %r15,-56
28
29
# %ymm0 = 0 and stays zero through all multiply steps; %ymm3..%ymm10 form
# the 32-qword vector accumulator, cleared here.
vpxord %ymm0,%ymm0,%ymm0
30
vmovdqa64 %ymm0,%ymm3
31
vmovdqa64 %ymm0,%ymm4
32
vmovdqa64 %ymm0,%ymm5
33
vmovdqa64 %ymm0,%ymm6
34
vmovdqa64 %ymm0,%ymm7
35
vmovdqa64 %ymm0,%ymm8
36
vmovdqa64 %ymm0,%ymm9
37
vmovdqa64 %ymm0,%ymm10
38
39
# %r9 = scalar accumulator for the lowest digit.
xorl %r9d,%r9d
40
41
# %rdx is needed as the implicit mulx source, so keep the second-operand
# pointer in %r11 instead.
movq %rdx,%r11
42
# %rax = 2^52 - 1, the 52-bit digit mask.
movq $0xfffffffffffff,%rax
43
44
45
# Loop counter: 7 iterations of the 4-step unrolled loop body.
movl $7,%ebx
46
47
.align 32
48
.Lloop7:
49
# Step for the qword at 0(%r11).
# Scalar part: %r9 += low64(q * [%rsi]), %r10 = high64 + carry, where q is
# the qword just loaded; q is also broadcast to all lanes of %ymm1.
movq 0(%r11),%r13
50
51
vpbroadcastq %r13,%ymm1
52
movq 0(%rsi),%rdx
53
# mulx: %r12:%r13 = %rdx * %r13, flags are not modified.
mulxq %r13,%r13,%r12
54
addq %r13,%r9
55
movq %r12,%r10
56
adcq $0,%r10
57
58
# Per-step factor y = (%r8 * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
59
imulq %r9,%r13
60
andq %rax,%r13
61
62
vpbroadcastq %r13,%ymm2
63
# %r10:%r9 += y * [%rcx].
movq 0(%rcx),%rdx
64
mulxq %r13,%r13,%r12
65
addq %r13,%r9
66
adcq %r12,%r10
67
68
# %r9 = (%r10:%r9) >> 52, truncated to 64 bits.
shrq $52,%r9
69
salq $12,%r10
70
orq %r10,%r9
71
72
# Vector part: add the low 52 bits of q * (%rsi)[j] to every accumulator lane.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
73
vpmadd52luq 32(%rsi),%ymm1,%ymm4
74
vpmadd52luq 64(%rsi),%ymm1,%ymm5
75
vpmadd52luq 96(%rsi),%ymm1,%ymm6
76
vpmadd52luq 128(%rsi),%ymm1,%ymm7
77
vpmadd52luq 160(%rsi),%ymm1,%ymm8
78
vpmadd52luq 192(%rsi),%ymm1,%ymm9
79
vpmadd52luq 224(%rsi),%ymm1,%ymm10
80
81
# Same for the low 52 bits of y * (%rcx)[j].
vpmadd52luq 0(%rcx),%ymm2,%ymm3
82
vpmadd52luq 32(%rcx),%ymm2,%ymm4
83
vpmadd52luq 64(%rcx),%ymm2,%ymm5
84
vpmadd52luq 96(%rcx),%ymm2,%ymm6
85
vpmadd52luq 128(%rcx),%ymm2,%ymm7
86
vpmadd52luq 160(%rcx),%ymm2,%ymm8
87
vpmadd52luq 192(%rcx),%ymm2,%ymm9
88
vpmadd52luq 224(%rcx),%ymm2,%ymm10
89
90
91
# Shift the 32-qword accumulator down by one qword; the zero register
# %ymm0 supplies the new top qword.
valignq $1,%ymm3,%ymm4,%ymm3
92
valignq $1,%ymm4,%ymm5,%ymm4
93
valignq $1,%ymm5,%ymm6,%ymm5
94
valignq $1,%ymm6,%ymm7,%ymm6
95
valignq $1,%ymm7,%ymm8,%ymm7
96
valignq $1,%ymm8,%ymm9,%ymm8
97
valignq $1,%ymm9,%ymm10,%ymm9
98
valignq $1,%ymm10,%ymm0,%ymm10
99
100
# Fold the new lowest accumulator qword into the scalar accumulator.
vmovq %xmm3,%r13
101
addq %r13,%r9
102
103
# Add the high 52 bits of the same products to the (already shifted)
# accumulator lanes.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
104
vpmadd52huq 32(%rsi),%ymm1,%ymm4
105
vpmadd52huq 64(%rsi),%ymm1,%ymm5
106
vpmadd52huq 96(%rsi),%ymm1,%ymm6
107
vpmadd52huq 128(%rsi),%ymm1,%ymm7
108
vpmadd52huq 160(%rsi),%ymm1,%ymm8
109
vpmadd52huq 192(%rsi),%ymm1,%ymm9
110
vpmadd52huq 224(%rsi),%ymm1,%ymm10
111
112
vpmadd52huq 0(%rcx),%ymm2,%ymm3
113
vpmadd52huq 32(%rcx),%ymm2,%ymm4
114
vpmadd52huq 64(%rcx),%ymm2,%ymm5
115
vpmadd52huq 96(%rcx),%ymm2,%ymm6
116
vpmadd52huq 128(%rcx),%ymm2,%ymm7
117
vpmadd52huq 160(%rcx),%ymm2,%ymm8
118
vpmadd52huq 192(%rcx),%ymm2,%ymm9
119
vpmadd52huq 224(%rcx),%ymm2,%ymm10
120
# Step for the qword at 8(%r11).
# Scalar part: %r10:%r9 = %r9 + q * [%rsi]; q is broadcast into %ymm1.
movq 8(%r11),%r13
121
122
vpbroadcastq %r13,%ymm1
123
movq 0(%rsi),%rdx
124
mulxq %r13,%r13,%r12
125
addq %r13,%r9
126
movq %r12,%r10
127
adcq $0,%r10
128
129
# y = (%r8 * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
130
imulq %r9,%r13
131
andq %rax,%r13
132
133
vpbroadcastq %r13,%ymm2
134
# %r10:%r9 += y * [%rcx], then %r9 = (%r10:%r9) >> 52.
movq 0(%rcx),%rdx
135
mulxq %r13,%r13,%r12
136
addq %r13,%r9
137
adcq %r12,%r10
138
139
shrq $52,%r9
140
salq $12,%r10
141
orq %r10,%r9
142
143
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
144
vpmadd52luq 32(%rsi),%ymm1,%ymm4
145
vpmadd52luq 64(%rsi),%ymm1,%ymm5
146
vpmadd52luq 96(%rsi),%ymm1,%ymm6
147
vpmadd52luq 128(%rsi),%ymm1,%ymm7
148
vpmadd52luq 160(%rsi),%ymm1,%ymm8
149
vpmadd52luq 192(%rsi),%ymm1,%ymm9
150
vpmadd52luq 224(%rsi),%ymm1,%ymm10
151
152
vpmadd52luq 0(%rcx),%ymm2,%ymm3
153
vpmadd52luq 32(%rcx),%ymm2,%ymm4
154
vpmadd52luq 64(%rcx),%ymm2,%ymm5
155
vpmadd52luq 96(%rcx),%ymm2,%ymm6
156
vpmadd52luq 128(%rcx),%ymm2,%ymm7
157
vpmadd52luq 160(%rcx),%ymm2,%ymm8
158
vpmadd52luq 192(%rcx),%ymm2,%ymm9
159
vpmadd52luq 224(%rcx),%ymm2,%ymm10
160
161
162
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm3,%ymm4,%ymm3
163
valignq $1,%ymm4,%ymm5,%ymm4
164
valignq $1,%ymm5,%ymm6,%ymm5
165
valignq $1,%ymm6,%ymm7,%ymm6
166
valignq $1,%ymm7,%ymm8,%ymm7
167
valignq $1,%ymm8,%ymm9,%ymm8
168
valignq $1,%ymm9,%ymm10,%ymm9
169
valignq $1,%ymm10,%ymm0,%ymm10
170
171
# Fold the new lowest qword into %r9.
vmovq %xmm3,%r13
172
addq %r13,%r9
173
174
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
175
vpmadd52huq 32(%rsi),%ymm1,%ymm4
176
vpmadd52huq 64(%rsi),%ymm1,%ymm5
177
vpmadd52huq 96(%rsi),%ymm1,%ymm6
178
vpmadd52huq 128(%rsi),%ymm1,%ymm7
179
vpmadd52huq 160(%rsi),%ymm1,%ymm8
180
vpmadd52huq 192(%rsi),%ymm1,%ymm9
181
vpmadd52huq 224(%rsi),%ymm1,%ymm10
182
183
vpmadd52huq 0(%rcx),%ymm2,%ymm3
184
vpmadd52huq 32(%rcx),%ymm2,%ymm4
185
vpmadd52huq 64(%rcx),%ymm2,%ymm5
186
vpmadd52huq 96(%rcx),%ymm2,%ymm6
187
vpmadd52huq 128(%rcx),%ymm2,%ymm7
188
vpmadd52huq 160(%rcx),%ymm2,%ymm8
189
vpmadd52huq 192(%rcx),%ymm2,%ymm9
190
vpmadd52huq 224(%rcx),%ymm2,%ymm10
191
# Step for the qword at 16(%r11).
# Scalar part: %r10:%r9 = %r9 + q * [%rsi]; q is broadcast into %ymm1.
movq 16(%r11),%r13
192
193
vpbroadcastq %r13,%ymm1
194
movq 0(%rsi),%rdx
195
mulxq %r13,%r13,%r12
196
addq %r13,%r9
197
movq %r12,%r10
198
adcq $0,%r10
199
200
# y = (%r8 * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
201
imulq %r9,%r13
202
andq %rax,%r13
203
204
vpbroadcastq %r13,%ymm2
205
# %r10:%r9 += y * [%rcx], then %r9 = (%r10:%r9) >> 52.
movq 0(%rcx),%rdx
206
mulxq %r13,%r13,%r12
207
addq %r13,%r9
208
adcq %r12,%r10
209
210
shrq $52,%r9
211
salq $12,%r10
212
orq %r10,%r9
213
214
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
215
vpmadd52luq 32(%rsi),%ymm1,%ymm4
216
vpmadd52luq 64(%rsi),%ymm1,%ymm5
217
vpmadd52luq 96(%rsi),%ymm1,%ymm6
218
vpmadd52luq 128(%rsi),%ymm1,%ymm7
219
vpmadd52luq 160(%rsi),%ymm1,%ymm8
220
vpmadd52luq 192(%rsi),%ymm1,%ymm9
221
vpmadd52luq 224(%rsi),%ymm1,%ymm10
222
223
vpmadd52luq 0(%rcx),%ymm2,%ymm3
224
vpmadd52luq 32(%rcx),%ymm2,%ymm4
225
vpmadd52luq 64(%rcx),%ymm2,%ymm5
226
vpmadd52luq 96(%rcx),%ymm2,%ymm6
227
vpmadd52luq 128(%rcx),%ymm2,%ymm7
228
vpmadd52luq 160(%rcx),%ymm2,%ymm8
229
vpmadd52luq 192(%rcx),%ymm2,%ymm9
230
vpmadd52luq 224(%rcx),%ymm2,%ymm10
231
232
233
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm3,%ymm4,%ymm3
234
valignq $1,%ymm4,%ymm5,%ymm4
235
valignq $1,%ymm5,%ymm6,%ymm5
236
valignq $1,%ymm6,%ymm7,%ymm6
237
valignq $1,%ymm7,%ymm8,%ymm7
238
valignq $1,%ymm8,%ymm9,%ymm8
239
valignq $1,%ymm9,%ymm10,%ymm9
240
valignq $1,%ymm10,%ymm0,%ymm10
241
242
# Fold the new lowest qword into %r9.
vmovq %xmm3,%r13
243
addq %r13,%r9
244
245
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
246
vpmadd52huq 32(%rsi),%ymm1,%ymm4
247
vpmadd52huq 64(%rsi),%ymm1,%ymm5
248
vpmadd52huq 96(%rsi),%ymm1,%ymm6
249
vpmadd52huq 128(%rsi),%ymm1,%ymm7
250
vpmadd52huq 160(%rsi),%ymm1,%ymm8
251
vpmadd52huq 192(%rsi),%ymm1,%ymm9
252
vpmadd52huq 224(%rsi),%ymm1,%ymm10
253
254
vpmadd52huq 0(%rcx),%ymm2,%ymm3
255
vpmadd52huq 32(%rcx),%ymm2,%ymm4
256
vpmadd52huq 64(%rcx),%ymm2,%ymm5
257
vpmadd52huq 96(%rcx),%ymm2,%ymm6
258
vpmadd52huq 128(%rcx),%ymm2,%ymm7
259
vpmadd52huq 160(%rcx),%ymm2,%ymm8
260
vpmadd52huq 192(%rcx),%ymm2,%ymm9
261
vpmadd52huq 224(%rcx),%ymm2,%ymm10
262
# Step for the qword at 24(%r11) -- last of the four unrolled steps.
# Scalar part: %r10:%r9 = %r9 + q * [%rsi]; q is broadcast into %ymm1.
movq 24(%r11),%r13
263
264
vpbroadcastq %r13,%ymm1
265
movq 0(%rsi),%rdx
266
mulxq %r13,%r13,%r12
267
addq %r13,%r9
268
movq %r12,%r10
269
adcq $0,%r10
270
271
# y = (%r8 * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
272
imulq %r9,%r13
273
andq %rax,%r13
274
275
vpbroadcastq %r13,%ymm2
276
# %r10:%r9 += y * [%rcx], then %r9 = (%r10:%r9) >> 52.
movq 0(%rcx),%rdx
277
mulxq %r13,%r13,%r12
278
addq %r13,%r9
279
adcq %r12,%r10
280
281
shrq $52,%r9
282
salq $12,%r10
283
orq %r10,%r9
284
285
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
286
vpmadd52luq 32(%rsi),%ymm1,%ymm4
287
vpmadd52luq 64(%rsi),%ymm1,%ymm5
288
vpmadd52luq 96(%rsi),%ymm1,%ymm6
289
vpmadd52luq 128(%rsi),%ymm1,%ymm7
290
vpmadd52luq 160(%rsi),%ymm1,%ymm8
291
vpmadd52luq 192(%rsi),%ymm1,%ymm9
292
vpmadd52luq 224(%rsi),%ymm1,%ymm10
293
294
vpmadd52luq 0(%rcx),%ymm2,%ymm3
295
vpmadd52luq 32(%rcx),%ymm2,%ymm4
296
vpmadd52luq 64(%rcx),%ymm2,%ymm5
297
vpmadd52luq 96(%rcx),%ymm2,%ymm6
298
vpmadd52luq 128(%rcx),%ymm2,%ymm7
299
vpmadd52luq 160(%rcx),%ymm2,%ymm8
300
vpmadd52luq 192(%rcx),%ymm2,%ymm9
301
vpmadd52luq 224(%rcx),%ymm2,%ymm10
302
303
304
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm3,%ymm4,%ymm3
305
valignq $1,%ymm4,%ymm5,%ymm4
306
valignq $1,%ymm5,%ymm6,%ymm5
307
valignq $1,%ymm6,%ymm7,%ymm6
308
valignq $1,%ymm7,%ymm8,%ymm7
309
valignq $1,%ymm8,%ymm9,%ymm8
310
valignq $1,%ymm9,%ymm10,%ymm9
311
valignq $1,%ymm10,%ymm0,%ymm10
312
313
# Fold the new lowest qword into %r9.
vmovq %xmm3,%r13
314
addq %r13,%r9
315
316
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
317
vpmadd52huq 32(%rsi),%ymm1,%ymm4
318
vpmadd52huq 64(%rsi),%ymm1,%ymm5
319
vpmadd52huq 96(%rsi),%ymm1,%ymm6
320
vpmadd52huq 128(%rsi),%ymm1,%ymm7
321
vpmadd52huq 160(%rsi),%ymm1,%ymm8
322
vpmadd52huq 192(%rsi),%ymm1,%ymm9
323
vpmadd52huq 224(%rsi),%ymm1,%ymm10
324
325
vpmadd52huq 0(%rcx),%ymm2,%ymm3
326
vpmadd52huq 32(%rcx),%ymm2,%ymm4
327
vpmadd52huq 64(%rcx),%ymm2,%ymm5
328
vpmadd52huq 96(%rcx),%ymm2,%ymm6
329
vpmadd52huq 128(%rcx),%ymm2,%ymm7
330
vpmadd52huq 160(%rcx),%ymm2,%ymm8
331
vpmadd52huq 192(%rcx),%ymm2,%ymm9
332
vpmadd52huq 224(%rcx),%ymm2,%ymm10
333
# Advance the second-operand pointer by the four qwords just consumed and
# repeat until the counter in %ebx reaches zero.
leaq 32(%r11),%r11
334
decl %ebx
335
jne .Lloop7
336
# First of two steps that follow the unrolled loop; it consumes the qword
# at 0(%r11) with the same instruction sequence as a loop step.
# Scalar part: %r10:%r9 = %r9 + q * [%rsi]; q is broadcast into %ymm1.
movq 0(%r11),%r13
337
338
vpbroadcastq %r13,%ymm1
339
movq 0(%rsi),%rdx
340
mulxq %r13,%r13,%r12
341
addq %r13,%r9
342
movq %r12,%r10
343
adcq $0,%r10
344
345
# y = (%r8 * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
346
imulq %r9,%r13
347
andq %rax,%r13
348
349
vpbroadcastq %r13,%ymm2
350
# %r10:%r9 += y * [%rcx], then %r9 = (%r10:%r9) >> 52.
movq 0(%rcx),%rdx
351
mulxq %r13,%r13,%r12
352
addq %r13,%r9
353
adcq %r12,%r10
354
355
shrq $52,%r9
356
salq $12,%r10
357
orq %r10,%r9
358
359
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
360
vpmadd52luq 32(%rsi),%ymm1,%ymm4
361
vpmadd52luq 64(%rsi),%ymm1,%ymm5
362
vpmadd52luq 96(%rsi),%ymm1,%ymm6
363
vpmadd52luq 128(%rsi),%ymm1,%ymm7
364
vpmadd52luq 160(%rsi),%ymm1,%ymm8
365
vpmadd52luq 192(%rsi),%ymm1,%ymm9
366
vpmadd52luq 224(%rsi),%ymm1,%ymm10
367
368
vpmadd52luq 0(%rcx),%ymm2,%ymm3
369
vpmadd52luq 32(%rcx),%ymm2,%ymm4
370
vpmadd52luq 64(%rcx),%ymm2,%ymm5
371
vpmadd52luq 96(%rcx),%ymm2,%ymm6
372
vpmadd52luq 128(%rcx),%ymm2,%ymm7
373
vpmadd52luq 160(%rcx),%ymm2,%ymm8
374
vpmadd52luq 192(%rcx),%ymm2,%ymm9
375
vpmadd52luq 224(%rcx),%ymm2,%ymm10
376
377
378
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm3,%ymm4,%ymm3
379
valignq $1,%ymm4,%ymm5,%ymm4
380
valignq $1,%ymm5,%ymm6,%ymm5
381
valignq $1,%ymm6,%ymm7,%ymm6
382
valignq $1,%ymm7,%ymm8,%ymm7
383
valignq $1,%ymm8,%ymm9,%ymm8
384
valignq $1,%ymm9,%ymm10,%ymm9
385
valignq $1,%ymm10,%ymm0,%ymm10
386
387
# Fold the new lowest qword into %r9.
vmovq %xmm3,%r13
388
addq %r13,%r9
389
390
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
391
vpmadd52huq 32(%rsi),%ymm1,%ymm4
392
vpmadd52huq 64(%rsi),%ymm1,%ymm5
393
vpmadd52huq 96(%rsi),%ymm1,%ymm6
394
vpmadd52huq 128(%rsi),%ymm1,%ymm7
395
vpmadd52huq 160(%rsi),%ymm1,%ymm8
396
vpmadd52huq 192(%rsi),%ymm1,%ymm9
397
vpmadd52huq 224(%rsi),%ymm1,%ymm10
398
399
vpmadd52huq 0(%rcx),%ymm2,%ymm3
400
vpmadd52huq 32(%rcx),%ymm2,%ymm4
401
vpmadd52huq 64(%rcx),%ymm2,%ymm5
402
vpmadd52huq 96(%rcx),%ymm2,%ymm6
403
vpmadd52huq 128(%rcx),%ymm2,%ymm7
404
vpmadd52huq 160(%rcx),%ymm2,%ymm8
405
vpmadd52huq 192(%rcx),%ymm2,%ymm9
406
vpmadd52huq 224(%rcx),%ymm2,%ymm10
407
# Final multiply-accumulate step; it consumes the qword at 8(%r11).
# Scalar part: %r10:%r9 = %r9 + q * [%rsi]; q is broadcast into %ymm1.
movq 8(%r11),%r13
408
409
vpbroadcastq %r13,%ymm1
410
movq 0(%rsi),%rdx
411
mulxq %r13,%r13,%r12
412
addq %r13,%r9
413
movq %r12,%r10
414
adcq $0,%r10
415
416
# y = (%r8 * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
417
imulq %r9,%r13
418
andq %rax,%r13
419
420
vpbroadcastq %r13,%ymm2
421
# %r10:%r9 += y * [%rcx], then %r9 = (%r10:%r9) >> 52.
movq 0(%rcx),%rdx
422
mulxq %r13,%r13,%r12
423
addq %r13,%r9
424
adcq %r12,%r10
425
426
shrq $52,%r9
427
salq $12,%r10
428
orq %r10,%r9
429
430
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
431
vpmadd52luq 32(%rsi),%ymm1,%ymm4
432
vpmadd52luq 64(%rsi),%ymm1,%ymm5
433
vpmadd52luq 96(%rsi),%ymm1,%ymm6
434
vpmadd52luq 128(%rsi),%ymm1,%ymm7
435
vpmadd52luq 160(%rsi),%ymm1,%ymm8
436
vpmadd52luq 192(%rsi),%ymm1,%ymm9
437
vpmadd52luq 224(%rsi),%ymm1,%ymm10
438
439
vpmadd52luq 0(%rcx),%ymm2,%ymm3
440
vpmadd52luq 32(%rcx),%ymm2,%ymm4
441
vpmadd52luq 64(%rcx),%ymm2,%ymm5
442
vpmadd52luq 96(%rcx),%ymm2,%ymm6
443
vpmadd52luq 128(%rcx),%ymm2,%ymm7
444
vpmadd52luq 160(%rcx),%ymm2,%ymm8
445
vpmadd52luq 192(%rcx),%ymm2,%ymm9
446
vpmadd52luq 224(%rcx),%ymm2,%ymm10
447
448
449
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm3,%ymm4,%ymm3
450
valignq $1,%ymm4,%ymm5,%ymm4
451
valignq $1,%ymm5,%ymm6,%ymm5
452
valignq $1,%ymm6,%ymm7,%ymm6
453
valignq $1,%ymm7,%ymm8,%ymm7
454
valignq $1,%ymm8,%ymm9,%ymm8
455
valignq $1,%ymm9,%ymm10,%ymm9
456
valignq $1,%ymm10,%ymm0,%ymm10
457
458
# Fold the new lowest qword into %r9.
vmovq %xmm3,%r13
459
addq %r13,%r9
460
461
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
462
vpmadd52huq 32(%rsi),%ymm1,%ymm4
463
vpmadd52huq 64(%rsi),%ymm1,%ymm5
464
vpmadd52huq 96(%rsi),%ymm1,%ymm6
465
vpmadd52huq 128(%rsi),%ymm1,%ymm7
466
vpmadd52huq 160(%rsi),%ymm1,%ymm8
467
vpmadd52huq 192(%rsi),%ymm1,%ymm9
468
vpmadd52huq 224(%rsi),%ymm1,%ymm10
469
470
vpmadd52huq 0(%rcx),%ymm2,%ymm3
471
vpmadd52huq 32(%rcx),%ymm2,%ymm4
472
vpmadd52huq 64(%rcx),%ymm2,%ymm5
473
vpmadd52huq 96(%rcx),%ymm2,%ymm6
474
vpmadd52huq 128(%rcx),%ymm2,%ymm7
475
vpmadd52huq 160(%rcx),%ymm2,%ymm8
476
vpmadd52huq 192(%rcx),%ymm2,%ymm9
477
vpmadd52huq 224(%rcx),%ymm2,%ymm10
478
479
# Carry normalisation of the 32-qword accumulator in %ymm3..%ymm10.
# From here on %ymm0..%ymm2 and several integer registers that held inputs
# (%r8, %r9, %rcx, %rdx, %rbx) are reused as scratch.
#
# Put the scalar accumulator %r9 into the lowest qword of %ymm3.
vpbroadcastq %r9,%ymm0
480
vpblendd $3,%ymm0,%ymm3,%ymm3
481
482
483
484
# Extract, per lane, the bits above bit 51 (the pending carry).
vpsrlq $52,%ymm3,%ymm0
485
vpsrlq $52,%ymm4,%ymm1
486
vpsrlq $52,%ymm5,%ymm2
487
vpsrlq $52,%ymm6,%ymm19
488
vpsrlq $52,%ymm7,%ymm20
489
vpsrlq $52,%ymm8,%ymm21
490
vpsrlq $52,%ymm9,%ymm22
491
vpsrlq $52,%ymm10,%ymm23
492
493
494
# Move every carry one lane up so that it lines up with the next-higher
# digit; the lowest lane receives zero from .Lzeros.
valignq $3,%ymm22,%ymm23,%ymm23
495
valignq $3,%ymm21,%ymm22,%ymm22
496
valignq $3,%ymm20,%ymm21,%ymm21
497
valignq $3,%ymm19,%ymm20,%ymm20
498
valignq $3,%ymm2,%ymm19,%ymm19
499
valignq $3,%ymm1,%ymm2,%ymm2
500
valignq $3,%ymm0,%ymm1,%ymm1
501
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
502
503
504
# Keep only the low 52 bits of every digit ...
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
505
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
506
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
507
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
508
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
509
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
510
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
511
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
512
513
514
# ... and add the carry coming from the lane below.
vpaddq %ymm0,%ymm3,%ymm3
515
vpaddq %ymm1,%ymm4,%ymm4
516
vpaddq %ymm2,%ymm5,%ymm5
517
vpaddq %ymm19,%ymm6,%ymm6
518
vpaddq %ymm20,%ymm7,%ymm7
519
vpaddq %ymm21,%ymm8,%ymm8
520
vpaddq %ymm22,%ymm9,%ymm9
521
vpaddq %ymm23,%ymm10,%ymm10
522
523
524
525
# Build a 32-bit "lane overflowed" mask in %r14b,%r13b,%r12b,%r11b
# (8 lanes per byte).  Predicate 6 is unsigned greater-than, so a bit is
# set when the lane exceeds 2^52 - 1.
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
526
vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
527
kmovb %k1,%r14d
528
kmovb %k2,%r13d
529
shlb $4,%r13b
530
orb %r13b,%r14b
531
532
vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
533
vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
534
kmovb %k1,%r13d
535
kmovb %k2,%r12d
536
shlb $4,%r12b
537
orb %r12b,%r13b
538
539
vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
540
vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
541
kmovb %k1,%r12d
542
kmovb %k2,%r11d
543
shlb $4,%r11b
544
orb %r11b,%r12b
545
546
vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
547
vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
548
kmovb %k1,%r11d
549
kmovb %k2,%r10d
550
shlb $4,%r10b
551
orb %r10b,%r11b
552
553
# Shift the 32-bit overflow mask left by one bit across the four bytes
# (add/adc chain), so each bit now marks the lane that receives the carry.
addb %r14b,%r14b
554
adcb %r13b,%r13b
555
adcb %r12b,%r12b
556
adcb %r11b,%r11b
557
558
559
# Build the matching "lane is exactly 2^52 - 1" mask (predicate 0 is
# equality) in %r9b,%r8b,%dl,%cl; such lanes pass an incoming carry on.
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
560
vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
561
kmovb %k1,%r9d
562
kmovb %k2,%r8d
563
shlb $4,%r8b
564
orb %r8b,%r9b
565
566
vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
567
vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
568
kmovb %k1,%r8d
569
kmovb %k2,%edx
570
shlb $4,%dl
571
orb %dl,%r8b
572
573
vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
574
vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
575
kmovb %k1,%edx
576
kmovb %k2,%ecx
577
shlb $4,%cl
578
orb %cl,%dl
579
580
vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
581
vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
582
kmovb %k1,%ecx
583
kmovb %k2,%ebx
584
shlb $4,%bl
585
orb %bl,%cl
586
587
# Adding the equality mask to the shifted overflow mask lets the integer
# adder ripple a carry through runs of saturated lanes ...
addb %r9b,%r14b
588
adcb %r8b,%r13b
589
adcb %dl,%r12b
590
adcb %cl,%r11b
591
592
# ... and xor-ing the equality mask back out leaves a 1 for every lane
# that has a carry coming in.
xorb %r9b,%r14b
593
xorb %r8b,%r13b
594
xorb %dl,%r12b
595
xorb %cl,%r11b
596
597
# Distribute the carry-in bits to mask registers, one nibble per vector
# (a 4-lane ymm operation only consumes the low four mask bits).
kmovb %r14d,%k1
598
shrb $4,%r14b
599
kmovb %r14d,%k2
600
kmovb %r13d,%k3
601
shrb $4,%r13b
602
kmovb %r13d,%k4
603
kmovb %r12d,%k5
604
shrb $4,%r12b
605
kmovb %r12d,%k6
606
kmovb %r11d,%k7
607
608
# In the selected lanes subtract 2^52 - 1; after the 52-bit masking below
# this equals adding 1 modulo 2^52.
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
609
vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
610
vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
611
vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
612
vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
613
vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
614
vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
615
616
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
617
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
618
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
619
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
620
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
621
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
622
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
623
624
# Only %k1..%k7 are used above, so the eighth vector reuses %k1 with the
# high nibble of %r11b.
shrb $4,%r11b
625
kmovb %r11d,%k1
626
627
vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
628
629
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
630
631
# Store the 32 normalised qwords to the output buffer at (%rdi).
vmovdqu64 %ymm3,0(%rdi)
632
vmovdqu64 %ymm4,32(%rdi)
633
vmovdqu64 %ymm5,64(%rdi)
634
vmovdqu64 %ymm6,96(%rdi)
635
vmovdqu64 %ymm7,128(%rdi)
636
vmovdqu64 %ymm8,160(%rdi)
637
vmovdqu64 %ymm9,192(%rdi)
638
vmovdqu64 %ymm10,224(%rdi)
639
640
# Clear the upper vector state before returning to the caller.
vzeroupper
641
# Epilogue: reload the six saved registers (in reverse push order) through
# %rax, keeping the CFI in sync, then release the 48 bytes of saves.
leaq (%rsp),%rax
642
.cfi_def_cfa_register %rax
643
movq 0(%rax),%r15
644
.cfi_restore %r15
645
movq 8(%rax),%r14
646
.cfi_restore %r14
647
movq 16(%rax),%r13
648
.cfi_restore %r13
649
movq 24(%rax),%r12
650
.cfi_restore %r12
651
movq 32(%rax),%rbp
652
.cfi_restore %rbp
653
movq 40(%rax),%rbx
654
.cfi_restore %rbx
655
leaq 48(%rax),%rsp
656
.cfi_def_cfa %rsp,8
657
.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
658
# Bytes F3 C3 encode "rep ret".
.byte 0xf3,0xc3
659
.cfi_endproc
660
.size ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
661
# Read-only constant: four qwords each equal to 2^52 - 1.  Used as the
# 52-bit digit mask and as the comparison/subtraction operand in the
# carry-normalisation code.
.section .rodata
662
.align 32
663
.Lmask52x4:
664
.quad 0xfffffffffffff
665
.quad 0xfffffffffffff
666
.quad 0xfffffffffffff
667
.quad 0xfffffffffffff
668
# Switch back to the code section for the routines that follow.
.text
669
670
# ossl_rsaz_amm52x30_x2_ifma256
#
# Two independent multiply-accumulate computations interleaved in one loop.
# The second operand set lives 256 bytes after the first in every buffer.
# Register roles as used by the code of this routine:
#   %rdi  output buffer; 2 x 256 bytes are stored at the end
#   %rsi  first operands, at offsets 0 and 256
#   %rdx  second operands, at offsets 0 and 256; copied to %r11 and
#         consumed one qword per loop iteration
#   %rcx  third operands, at offsets 0 and 256
#         (presumably the moduli -- confirm against the C prototype)
#   %r8   pointer to two qword scalars, at (%r8) and 8(%r8)
#         (presumably the Montgomery constants -- confirm)
# Values are handled as 52-bit digits held one per 64-bit qword.
.globl ossl_rsaz_amm52x30_x2_ifma256
671
.type ossl_rsaz_amm52x30_x2_ifma256,@function
672
.align 32
673
ossl_rsaz_amm52x30_x2_ifma256:
674
.cfi_startproc
675
# Bytes F3 0F 1E FA encode endbr64.
.byte 243,15,30,250
676
# Save the six callee-saved integer registers, mirrored by CFI directives.
pushq %rbx
677
.cfi_adjust_cfa_offset 8
678
.cfi_offset %rbx,-16
679
pushq %rbp
680
.cfi_adjust_cfa_offset 8
681
.cfi_offset %rbp,-24
682
pushq %r12
683
.cfi_adjust_cfa_offset 8
684
.cfi_offset %r12,-32
685
pushq %r13
686
.cfi_adjust_cfa_offset 8
687
.cfi_offset %r13,-40
688
pushq %r14
689
.cfi_adjust_cfa_offset 8
690
.cfi_offset %r14,-48
691
pushq %r15
692
.cfi_adjust_cfa_offset 8
693
.cfi_offset %r15,-56
694
695
# %ymm0 = 0 (stays zero during the loop); %ymm3..%ymm10 accumulate the
# first result, %ymm11..%ymm18 the second.
vpxord %ymm0,%ymm0,%ymm0
696
vmovdqa64 %ymm0,%ymm3
697
vmovdqa64 %ymm0,%ymm4
698
vmovdqa64 %ymm0,%ymm5
699
vmovdqa64 %ymm0,%ymm6
700
vmovdqa64 %ymm0,%ymm7
701
vmovdqa64 %ymm0,%ymm8
702
vmovdqa64 %ymm0,%ymm9
703
vmovdqa64 %ymm0,%ymm10
704
705
vmovdqa64 %ymm0,%ymm11
706
vmovdqa64 %ymm0,%ymm12
707
vmovdqa64 %ymm0,%ymm13
708
vmovdqa64 %ymm0,%ymm14
709
vmovdqa64 %ymm0,%ymm15
710
vmovdqa64 %ymm0,%ymm16
711
vmovdqa64 %ymm0,%ymm17
712
vmovdqa64 %ymm0,%ymm18
713
714
715
# Scalar low-digit accumulators: %r9 for the first set, %r15 for the second.
xorl %r9d,%r9d
716
xorl %r15d,%r15d
717
718
# %rdx is the implicit mulx source, so the operand pointer moves to %r11.
movq %rdx,%r11
719
# %rax = 2^52 - 1, the 52-bit digit mask.
movq $0xfffffffffffff,%rax
720
721
# Loop counter: 30 iterations, one qword of each second operand per pass.
movl $30,%ebx
722
723
.align 32
724
.Lloop30:
725
# First operand set (offset 0), accumulating into %r9 and %ymm3..%ymm10.
# Scalar part: %r10:%r9 = %r9 + q * [%rsi]; q is broadcast into %ymm1.
movq 0(%r11),%r13
726
727
vpbroadcastq %r13,%ymm1
728
movq 0(%rsi),%rdx
729
# mulx: %r12:%r13 = %rdx * %r13, flags are not modified.
mulxq %r13,%r13,%r12
730
addq %r13,%r9
731
movq %r12,%r10
732
adcq $0,%r10
733
734
# y = ([%r8] * %r9) mod 2^52, broadcast into %ymm2.
movq (%r8),%r13
735
imulq %r9,%r13
736
andq %rax,%r13
737
738
vpbroadcastq %r13,%ymm2
739
# %r10:%r9 += y * [%rcx], then %r9 = (%r10:%r9) >> 52.
movq 0(%rcx),%rdx
740
mulxq %r13,%r13,%r12
741
addq %r13,%r9
742
adcq %r12,%r10
743
744
shrq $52,%r9
745
salq $12,%r10
746
orq %r10,%r9
747
748
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 0(%rsi),%ymm1,%ymm3
749
vpmadd52luq 32(%rsi),%ymm1,%ymm4
750
vpmadd52luq 64(%rsi),%ymm1,%ymm5
751
vpmadd52luq 96(%rsi),%ymm1,%ymm6
752
vpmadd52luq 128(%rsi),%ymm1,%ymm7
753
vpmadd52luq 160(%rsi),%ymm1,%ymm8
754
vpmadd52luq 192(%rsi),%ymm1,%ymm9
755
vpmadd52luq 224(%rsi),%ymm1,%ymm10
756
757
vpmadd52luq 0(%rcx),%ymm2,%ymm3
758
vpmadd52luq 32(%rcx),%ymm2,%ymm4
759
vpmadd52luq 64(%rcx),%ymm2,%ymm5
760
vpmadd52luq 96(%rcx),%ymm2,%ymm6
761
vpmadd52luq 128(%rcx),%ymm2,%ymm7
762
vpmadd52luq 160(%rcx),%ymm2,%ymm8
763
vpmadd52luq 192(%rcx),%ymm2,%ymm9
764
vpmadd52luq 224(%rcx),%ymm2,%ymm10
765
766
767
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm3,%ymm4,%ymm3
768
valignq $1,%ymm4,%ymm5,%ymm4
769
valignq $1,%ymm5,%ymm6,%ymm5
770
valignq $1,%ymm6,%ymm7,%ymm6
771
valignq $1,%ymm7,%ymm8,%ymm7
772
valignq $1,%ymm8,%ymm9,%ymm8
773
valignq $1,%ymm9,%ymm10,%ymm9
774
valignq $1,%ymm10,%ymm0,%ymm10
775
776
# Fold the new lowest qword into %r9.
vmovq %xmm3,%r13
777
addq %r13,%r9
778
779
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 0(%rsi),%ymm1,%ymm3
780
vpmadd52huq 32(%rsi),%ymm1,%ymm4
781
vpmadd52huq 64(%rsi),%ymm1,%ymm5
782
vpmadd52huq 96(%rsi),%ymm1,%ymm6
783
vpmadd52huq 128(%rsi),%ymm1,%ymm7
784
vpmadd52huq 160(%rsi),%ymm1,%ymm8
785
vpmadd52huq 192(%rsi),%ymm1,%ymm9
786
vpmadd52huq 224(%rsi),%ymm1,%ymm10
787
788
vpmadd52huq 0(%rcx),%ymm2,%ymm3
789
vpmadd52huq 32(%rcx),%ymm2,%ymm4
790
vpmadd52huq 64(%rcx),%ymm2,%ymm5
791
vpmadd52huq 96(%rcx),%ymm2,%ymm6
792
vpmadd52huq 128(%rcx),%ymm2,%ymm7
793
vpmadd52huq 160(%rcx),%ymm2,%ymm8
794
vpmadd52huq 192(%rcx),%ymm2,%ymm9
795
vpmadd52huq 224(%rcx),%ymm2,%ymm10
796
# Second operand set (offset 256), accumulating into %r15 and
# %ymm11..%ymm18.
# Scalar part: %r10:%r15 = %r15 + q * [%rsi+256]; q is broadcast into %ymm1.
movq 256(%r11),%r13
797
798
vpbroadcastq %r13,%ymm1
799
movq 256(%rsi),%rdx
800
mulxq %r13,%r13,%r12
801
addq %r13,%r15
802
movq %r12,%r10
803
adcq $0,%r10
804
805
# y = ([%r8+8] * %r15) mod 2^52, broadcast into %ymm2.
movq 8(%r8),%r13
806
imulq %r15,%r13
807
andq %rax,%r13
808
809
vpbroadcastq %r13,%ymm2
810
# %r10:%r15 += y * [%rcx+256], then %r15 = (%r10:%r15) >> 52.
movq 256(%rcx),%rdx
811
mulxq %r13,%r13,%r12
812
addq %r13,%r15
813
adcq %r12,%r10
814
815
shrq $52,%r15
816
salq $12,%r10
817
orq %r10,%r15
818
819
# Accumulate low 52-bit product halves for both operands.
vpmadd52luq 256(%rsi),%ymm1,%ymm11
820
vpmadd52luq 288(%rsi),%ymm1,%ymm12
821
vpmadd52luq 320(%rsi),%ymm1,%ymm13
822
vpmadd52luq 352(%rsi),%ymm1,%ymm14
823
vpmadd52luq 384(%rsi),%ymm1,%ymm15
824
vpmadd52luq 416(%rsi),%ymm1,%ymm16
825
vpmadd52luq 448(%rsi),%ymm1,%ymm17
826
vpmadd52luq 480(%rsi),%ymm1,%ymm18
827
828
vpmadd52luq 256(%rcx),%ymm2,%ymm11
829
vpmadd52luq 288(%rcx),%ymm2,%ymm12
830
vpmadd52luq 320(%rcx),%ymm2,%ymm13
831
vpmadd52luq 352(%rcx),%ymm2,%ymm14
832
vpmadd52luq 384(%rcx),%ymm2,%ymm15
833
vpmadd52luq 416(%rcx),%ymm2,%ymm16
834
vpmadd52luq 448(%rcx),%ymm2,%ymm17
835
vpmadd52luq 480(%rcx),%ymm2,%ymm18
836
837
838
# Shift the accumulator down one qword (zero enters from %ymm0).
valignq $1,%ymm11,%ymm12,%ymm11
839
valignq $1,%ymm12,%ymm13,%ymm12
840
valignq $1,%ymm13,%ymm14,%ymm13
841
valignq $1,%ymm14,%ymm15,%ymm14
842
valignq $1,%ymm15,%ymm16,%ymm15
843
valignq $1,%ymm16,%ymm17,%ymm16
844
valignq $1,%ymm17,%ymm18,%ymm17
845
valignq $1,%ymm18,%ymm0,%ymm18
846
847
# Fold the new lowest qword into %r15.
vmovq %xmm11,%r13
848
addq %r13,%r15
849
850
# Accumulate high 52-bit product halves for both operands.
vpmadd52huq 256(%rsi),%ymm1,%ymm11
851
vpmadd52huq 288(%rsi),%ymm1,%ymm12
852
vpmadd52huq 320(%rsi),%ymm1,%ymm13
853
vpmadd52huq 352(%rsi),%ymm1,%ymm14
854
vpmadd52huq 384(%rsi),%ymm1,%ymm15
855
vpmadd52huq 416(%rsi),%ymm1,%ymm16
856
vpmadd52huq 448(%rsi),%ymm1,%ymm17
857
vpmadd52huq 480(%rsi),%ymm1,%ymm18
858
859
vpmadd52huq 256(%rcx),%ymm2,%ymm11
860
vpmadd52huq 288(%rcx),%ymm2,%ymm12
861
vpmadd52huq 320(%rcx),%ymm2,%ymm13
862
vpmadd52huq 352(%rcx),%ymm2,%ymm14
863
vpmadd52huq 384(%rcx),%ymm2,%ymm15
864
vpmadd52huq 416(%rcx),%ymm2,%ymm16
865
vpmadd52huq 448(%rcx),%ymm2,%ymm17
866
vpmadd52huq 480(%rcx),%ymm2,%ymm18
867
# Advance to the next qword of both second operands and loop until the
# counter in %ebx reaches zero.
leaq 8(%r11),%r11
868
decl %ebx
869
jne .Lloop30
870
871
# Carry normalisation of the first result (%ymm3..%ymm10, scalar %r9).
# %ymm0..%ymm2 and the integer registers that held inputs (%r8, %r9,
# %rcx, %rdx, %rbx) are reused as scratch from here on; %r15 is left
# untouched by this section.
#
# Put the scalar accumulator %r9 into the lowest qword of %ymm3.
vpbroadcastq %r9,%ymm0
872
vpblendd $3,%ymm0,%ymm3,%ymm3
873
874
875
876
# Extract, per lane, the bits above bit 51 (the pending carry).
vpsrlq $52,%ymm3,%ymm0
877
vpsrlq $52,%ymm4,%ymm1
878
vpsrlq $52,%ymm5,%ymm2
879
vpsrlq $52,%ymm6,%ymm19
880
vpsrlq $52,%ymm7,%ymm20
881
vpsrlq $52,%ymm8,%ymm21
882
vpsrlq $52,%ymm9,%ymm22
883
vpsrlq $52,%ymm10,%ymm23
884
885
886
# Move every carry one lane up; the lowest lane receives zero from .Lzeros.
valignq $3,%ymm22,%ymm23,%ymm23
887
valignq $3,%ymm21,%ymm22,%ymm22
888
valignq $3,%ymm20,%ymm21,%ymm21
889
valignq $3,%ymm19,%ymm20,%ymm20
890
valignq $3,%ymm2,%ymm19,%ymm19
891
valignq $3,%ymm1,%ymm2,%ymm2
892
valignq $3,%ymm0,%ymm1,%ymm1
893
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
894
895
896
# Keep only the low 52 bits of every digit ...
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
897
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
898
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
899
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
900
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
901
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
902
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
903
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
904
905
906
# ... and add the carry coming from the lane below.
vpaddq %ymm0,%ymm3,%ymm3
907
vpaddq %ymm1,%ymm4,%ymm4
908
vpaddq %ymm2,%ymm5,%ymm5
909
vpaddq %ymm19,%ymm6,%ymm6
910
vpaddq %ymm20,%ymm7,%ymm7
911
vpaddq %ymm21,%ymm8,%ymm8
912
vpaddq %ymm22,%ymm9,%ymm9
913
vpaddq %ymm23,%ymm10,%ymm10
914
915
916
917
# 32-bit "lane > 2^52 - 1" mask in %r14b,%r13b,%r12b,%r11b (predicate 6 is
# unsigned greater-than).
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
918
vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
919
kmovb %k1,%r14d
920
kmovb %k2,%r13d
921
shlb $4,%r13b
922
orb %r13b,%r14b
923
924
vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
925
vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
926
kmovb %k1,%r13d
927
kmovb %k2,%r12d
928
shlb $4,%r12b
929
orb %r12b,%r13b
930
931
vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
932
vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
933
kmovb %k1,%r12d
934
kmovb %k2,%r11d
935
shlb $4,%r11b
936
orb %r11b,%r12b
937
938
vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
939
vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
940
kmovb %k1,%r11d
941
kmovb %k2,%r10d
942
shlb $4,%r10b
943
orb %r10b,%r11b
944
945
# Shift the 32-bit overflow mask left by one bit across the four bytes.
addb %r14b,%r14b
946
adcb %r13b,%r13b
947
adcb %r12b,%r12b
948
adcb %r11b,%r11b
949
950
951
# 32-bit "lane == 2^52 - 1" mask in %r9b,%r8b,%dl,%cl (predicate 0 is
# equality); such lanes pass an incoming carry on.
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
952
vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
953
kmovb %k1,%r9d
954
kmovb %k2,%r8d
955
shlb $4,%r8b
956
orb %r8b,%r9b
957
958
vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
959
vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
960
kmovb %k1,%r8d
961
kmovb %k2,%edx
962
shlb $4,%dl
963
orb %dl,%r8b
964
965
vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
966
vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
967
kmovb %k1,%edx
968
kmovb %k2,%ecx
969
shlb $4,%cl
970
orb %cl,%dl
971
972
vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
973
vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
974
kmovb %k1,%ecx
975
kmovb %k2,%ebx
976
shlb $4,%bl
977
orb %bl,%cl
978
979
# Ripple the carries through runs of saturated lanes with an integer add,
# then xor the equality mask back out to get the per-lane carry-in bits.
addb %r9b,%r14b
980
adcb %r8b,%r13b
981
adcb %dl,%r12b
982
adcb %cl,%r11b
983
984
xorb %r9b,%r14b
985
xorb %r8b,%r13b
986
xorb %dl,%r12b
987
xorb %cl,%r11b
988
989
# One nibble of carry-in bits per vector, moved into mask registers.
kmovb %r14d,%k1
990
shrb $4,%r14b
991
kmovb %r14d,%k2
992
kmovb %r13d,%k3
993
shrb $4,%r13b
994
kmovb %r13d,%k4
995
kmovb %r12d,%k5
996
shrb $4,%r12b
997
kmovb %r12d,%k6
998
kmovb %r11d,%k7
999
1000
# In the selected lanes subtract 2^52 - 1; after the 52-bit masking below
# this equals adding 1 modulo 2^52.
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
1001
vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
1002
vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
1003
vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
1004
vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
1005
vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
1006
vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
1007
1008
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
1009
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
1010
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
1011
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
1012
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
1013
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
1014
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
1015
1016
# The eighth vector reuses %k1 with the high nibble of %r11b.
shrb $4,%r11b
1017
kmovb %r11d,%k1
1018
1019
vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
1020
1021
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
1022
1023
# Carry normalisation of the second result (%ymm11..%ymm18, scalar %r15),
# using the same instruction sequence and scratch registers as for the
# first result.
#
# Put the scalar accumulator %r15 into the lowest qword of %ymm11.
vpbroadcastq %r15,%ymm0
1024
vpblendd $3,%ymm0,%ymm11,%ymm11
1025
1026
1027
1028
# Extract, per lane, the bits above bit 51 (the pending carry).
vpsrlq $52,%ymm11,%ymm0
1029
vpsrlq $52,%ymm12,%ymm1
1030
vpsrlq $52,%ymm13,%ymm2
1031
vpsrlq $52,%ymm14,%ymm19
1032
vpsrlq $52,%ymm15,%ymm20
1033
vpsrlq $52,%ymm16,%ymm21
1034
vpsrlq $52,%ymm17,%ymm22
1035
vpsrlq $52,%ymm18,%ymm23
1036
1037
1038
# Move every carry one lane up; the lowest lane receives zero from .Lzeros.
valignq $3,%ymm22,%ymm23,%ymm23
1039
valignq $3,%ymm21,%ymm22,%ymm22
1040
valignq $3,%ymm20,%ymm21,%ymm21
1041
valignq $3,%ymm19,%ymm20,%ymm20
1042
valignq $3,%ymm2,%ymm19,%ymm19
1043
valignq $3,%ymm1,%ymm2,%ymm2
1044
valignq $3,%ymm0,%ymm1,%ymm1
1045
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
1046
1047
1048
# Keep only the low 52 bits of every digit ...
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
1049
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
1050
vpandq .Lmask52x4(%rip),%ymm13,%ymm13
1051
vpandq .Lmask52x4(%rip),%ymm14,%ymm14
1052
vpandq .Lmask52x4(%rip),%ymm15,%ymm15
1053
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
1054
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
1055
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
1056
1057
1058
# ... and add the carry coming from the lane below.
vpaddq %ymm0,%ymm11,%ymm11
1059
vpaddq %ymm1,%ymm12,%ymm12
1060
vpaddq %ymm2,%ymm13,%ymm13
1061
vpaddq %ymm19,%ymm14,%ymm14
1062
vpaddq %ymm20,%ymm15,%ymm15
1063
vpaddq %ymm21,%ymm16,%ymm16
1064
vpaddq %ymm22,%ymm17,%ymm17
1065
vpaddq %ymm23,%ymm18,%ymm18
1066
1067
1068
1069
# 32-bit "lane > 2^52 - 1" mask in %r14b,%r13b,%r12b,%r11b (predicate 6 is
# unsigned greater-than).
vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
1070
vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
1071
kmovb %k1,%r14d
1072
kmovb %k2,%r13d
1073
shlb $4,%r13b
1074
orb %r13b,%r14b
1075
1076
vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1
1077
vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2
1078
kmovb %k1,%r13d
1079
kmovb %k2,%r12d
1080
shlb $4,%r12b
1081
orb %r12b,%r13b
1082
1083
vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1
1084
vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
1085
kmovb %k1,%r12d
1086
kmovb %k2,%r11d
1087
shlb $4,%r11b
1088
orb %r11b,%r12b
1089
1090
vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1
1091
vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2
1092
kmovb %k1,%r11d
1093
kmovb %k2,%r10d
1094
shlb $4,%r10b
1095
orb %r10b,%r11b
1096
1097
# Shift the 32-bit overflow mask left by one bit across the four bytes.
addb %r14b,%r14b
1098
adcb %r13b,%r13b
1099
adcb %r12b,%r12b
1100
adcb %r11b,%r11b
1101
1102
1103
# 32-bit "lane == 2^52 - 1" mask in %r9b,%r8b,%dl,%cl (predicate 0 is
# equality); such lanes pass an incoming carry on.
vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
1104
vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
1105
kmovb %k1,%r9d
1106
kmovb %k2,%r8d
1107
shlb $4,%r8b
1108
orb %r8b,%r9b
1109
1110
vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1
1111
vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2
1112
kmovb %k1,%r8d
1113
kmovb %k2,%edx
1114
shlb $4,%dl
1115
orb %dl,%r8b
1116
1117
vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1
1118
vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
1119
kmovb %k1,%edx
1120
kmovb %k2,%ecx
1121
shlb $4,%cl
1122
orb %cl,%dl
1123
1124
vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1
1125
vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2
1126
kmovb %k1,%ecx
1127
kmovb %k2,%ebx
1128
shlb $4,%bl
1129
orb %bl,%cl
1130
1131
# Ripple the carries through runs of saturated lanes with an integer add,
# then xor the equality mask back out to get the per-lane carry-in bits.
addb %r9b,%r14b
1132
adcb %r8b,%r13b
1133
adcb %dl,%r12b
1134
adcb %cl,%r11b
1135
1136
xorb %r9b,%r14b
1137
xorb %r8b,%r13b
1138
xorb %dl,%r12b
1139
xorb %cl,%r11b
1140
1141
# One nibble of carry-in bits per vector, moved into mask registers.
kmovb %r14d,%k1
1142
shrb $4,%r14b
1143
kmovb %r14d,%k2
1144
kmovb %r13d,%k3
1145
shrb $4,%r13b
1146
kmovb %r13d,%k4
1147
kmovb %r12d,%k5
1148
shrb $4,%r12b
1149
kmovb %r12d,%k6
1150
kmovb %r11d,%k7
1151
1152
# In the selected lanes subtract 2^52 - 1; after the 52-bit masking below
# this equals adding 1 modulo 2^52.
vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k1}
1153
vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k2}
1154
vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k3}
1155
vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k4}
1156
vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k5}
1157
vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k6}
1158
vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k7}
1159
1160
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
1161
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
1162
vpandq .Lmask52x4(%rip),%ymm13,%ymm13
1163
vpandq .Lmask52x4(%rip),%ymm14,%ymm14
1164
vpandq .Lmask52x4(%rip),%ymm15,%ymm15
1165
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
1166
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
1167
1168
# The eighth vector reuses %k1 with the high nibble of %r11b.
shrb $4,%r11b
1169
kmovb %r11d,%k1
1170
1171
vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k1}
1172
1173
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
1174
1175
# Store the first result (32 qwords) at offset 0 of the output buffer ...
vmovdqu64 %ymm3,0(%rdi)
1176
vmovdqu64 %ymm4,32(%rdi)
1177
vmovdqu64 %ymm5,64(%rdi)
1178
vmovdqu64 %ymm6,96(%rdi)
1179
vmovdqu64 %ymm7,128(%rdi)
1180
vmovdqu64 %ymm8,160(%rdi)
1181
vmovdqu64 %ymm9,192(%rdi)
1182
vmovdqu64 %ymm10,224(%rdi)
1183
1184
# ... and the second result at offset 256.
vmovdqu64 %ymm11,256(%rdi)
1185
vmovdqu64 %ymm12,288(%rdi)
1186
vmovdqu64 %ymm13,320(%rdi)
1187
vmovdqu64 %ymm14,352(%rdi)
1188
vmovdqu64 %ymm15,384(%rdi)
1189
vmovdqu64 %ymm16,416(%rdi)
1190
vmovdqu64 %ymm17,448(%rdi)
1191
vmovdqu64 %ymm18,480(%rdi)
1192
1193
# Clear the upper vector state before returning to the caller.
vzeroupper
1194
# Epilogue: reload the six saved registers (in reverse push order) through
# %rax, keeping the CFI in sync, then release the 48 bytes of saves.
leaq (%rsp),%rax
1195
.cfi_def_cfa_register %rax
1196
movq 0(%rax),%r15
1197
.cfi_restore %r15
1198
movq 8(%rax),%r14
1199
.cfi_restore %r14
1200
movq 16(%rax),%r13
1201
.cfi_restore %r13
1202
movq 24(%rax),%r12
1203
.cfi_restore %r12
1204
movq 32(%rax),%rbp
1205
.cfi_restore %rbp
1206
movq 40(%rax),%rbx
1207
.cfi_restore %rbx
1208
leaq 48(%rax),%rsp
1209
.cfi_def_cfa %rsp,8
1210
.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
1211
# Bytes F3 C3 encode "rep ret".
.byte 0xf3,0xc3
1212
.cfi_endproc
1213
.size ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
1214
.text
1215
1216
# ossl_extract_multiplier_2x30_win5
#
# Selects two 256-byte halves out of a table of 512-byte entries.
#   %rdi  output buffer (512 bytes are written)
#   %rsi  table base; 16384 bytes = 32 entries of 512 bytes are scanned
#   %rdx  index of the entry whose first 256 bytes go to output[0..255]
#   %rcx  index of the entry whose second 256 bytes go to output[256..511]
# Every entry is loaded on every call and the selection is done with lane
# masks, so neither the addresses accessed nor the branches taken depend
# on the two indices.  If an index matches no entry (not in 0..31) the
# corresponding output half is all zero.
.align 32
1217
.globl ossl_extract_multiplier_2x30_win5
1218
.type ossl_extract_multiplier_2x30_win5,@function
1219
ossl_extract_multiplier_2x30_win5:
1220
.cfi_startproc
1221
# Bytes F3 0F 1E FA encode endbr64.
.byte 243,15,30,250
1222
# %ymm30 = {1,1,1,1}; %ymm28 / %ymm29 = the two indices broadcast to all
# lanes; %rax = end-of-table address.
vmovdqa64 .Lones(%rip),%ymm30
1223
vpbroadcastq %rdx,%ymm28
1224
vpbroadcastq %rcx,%ymm29
1225
leaq 16384(%rsi),%rax
1226
1227
1228
# Clear the running entry counter (%ymm27) and the sixteen result
# registers (%ymm0..%ymm5, %ymm16..%ymm25).
vpxor %xmm0,%xmm0,%xmm0
1229
vmovdqa64 %ymm0,%ymm27
1230
vmovdqa64 %ymm0,%ymm1
1231
vmovdqa64 %ymm0,%ymm2
1232
vmovdqa64 %ymm0,%ymm3
1233
vmovdqa64 %ymm0,%ymm4
1234
vmovdqa64 %ymm0,%ymm5
1235
vmovdqa64 %ymm0,%ymm16
1236
vmovdqa64 %ymm0,%ymm17
1237
vmovdqa64 %ymm0,%ymm18
1238
vmovdqa64 %ymm0,%ymm19
1239
vmovdqa64 %ymm0,%ymm20
1240
vmovdqa64 %ymm0,%ymm21
1241
vmovdqa64 %ymm0,%ymm22
1242
vmovdqa64 %ymm0,%ymm23
1243
vmovdqa64 %ymm0,%ymm24
1244
vmovdqa64 %ymm0,%ymm25
1245
1246
.align 32
1247
.Lloop:
1248
# %k1 / %k2 are all-ones when the current entry number equals the first /
# second index (predicate 0 is equality), otherwise all-zero.
vpcmpq $0,%ymm27,%ymm28,%k1
1249
vpcmpq $0,%ymm27,%ymm29,%k2
1250
# First half of the entry: take the loaded data where %k1 is set, keep the
# previous register contents otherwise.
vmovdqu64 0(%rsi),%ymm26
1251
vpblendmq %ymm26,%ymm0,%ymm0{%k1}
1252
vmovdqu64 32(%rsi),%ymm26
1253
vpblendmq %ymm26,%ymm1,%ymm1{%k1}
1254
vmovdqu64 64(%rsi),%ymm26
1255
vpblendmq %ymm26,%ymm2,%ymm2{%k1}
1256
vmovdqu64 96(%rsi),%ymm26
1257
vpblendmq %ymm26,%ymm3,%ymm3{%k1}
1258
vmovdqu64 128(%rsi),%ymm26
1259
vpblendmq %ymm26,%ymm4,%ymm4{%k1}
1260
vmovdqu64 160(%rsi),%ymm26
1261
vpblendmq %ymm26,%ymm5,%ymm5{%k1}
1262
vmovdqu64 192(%rsi),%ymm26
1263
vpblendmq %ymm26,%ymm16,%ymm16{%k1}
1264
vmovdqu64 224(%rsi),%ymm26
1265
vpblendmq %ymm26,%ymm17,%ymm17{%k1}
1266
# Second half of the entry, selected by %k2.
vmovdqu64 256(%rsi),%ymm26
1267
vpblendmq %ymm26,%ymm18,%ymm18{%k2}
1268
vmovdqu64 288(%rsi),%ymm26
1269
vpblendmq %ymm26,%ymm19,%ymm19{%k2}
1270
vmovdqu64 320(%rsi),%ymm26
1271
vpblendmq %ymm26,%ymm20,%ymm20{%k2}
1272
vmovdqu64 352(%rsi),%ymm26
1273
vpblendmq %ymm26,%ymm21,%ymm21{%k2}
1274
vmovdqu64 384(%rsi),%ymm26
1275
vpblendmq %ymm26,%ymm22,%ymm22{%k2}
1276
vmovdqu64 416(%rsi),%ymm26
1277
vpblendmq %ymm26,%ymm23,%ymm23{%k2}
1278
vmovdqu64 448(%rsi),%ymm26
1279
vpblendmq %ymm26,%ymm24,%ymm24{%k2}
1280
vmovdqu64 480(%rsi),%ymm26
1281
vpblendmq %ymm26,%ymm25,%ymm25{%k2}
1282
# Next entry: counter += 1, pointer += 512; stop at the end of the table.
vpaddq %ymm30,%ymm27,%ymm27
1283
addq $512,%rsi
1284
cmpq %rsi,%rax
1285
jne .Lloop
1286
# Write both selected halves to the output buffer.
vmovdqu64 %ymm0,0(%rdi)
1287
vmovdqu64 %ymm1,32(%rdi)
1288
vmovdqu64 %ymm2,64(%rdi)
1289
vmovdqu64 %ymm3,96(%rdi)
1290
vmovdqu64 %ymm4,128(%rdi)
1291
vmovdqu64 %ymm5,160(%rdi)
1292
vmovdqu64 %ymm16,192(%rdi)
1293
vmovdqu64 %ymm17,224(%rdi)
1294
vmovdqu64 %ymm18,256(%rdi)
1295
vmovdqu64 %ymm19,288(%rdi)
1296
vmovdqu64 %ymm20,320(%rdi)
1297
vmovdqu64 %ymm21,352(%rdi)
1298
vmovdqu64 %ymm22,384(%rdi)
1299
vmovdqu64 %ymm23,416(%rdi)
1300
vmovdqu64 %ymm24,448(%rdi)
1301
vmovdqu64 %ymm25,480(%rdi)
1302
1303
# Bytes F3 C3 encode "rep ret".
.byte 0xf3,0xc3
1304
.cfi_endproc
1305
.size ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
1306
# Read-only vector constants.
.section .rodata
1307
.align 32
1308
# Four qwords of 1: the per-iteration increment of the entry counter in
# ossl_extract_multiplier_2x30_win5.
.Lones:
1309
.quad 1,1,1,1
1310
# Four qwords of 0: zero source for the lowest lane when the carry vectors
# are shifted up during normalisation.
.Lzeros:
1311
.quad 0,0,0,0
1312
# ELF note describing x86 feature properties of this object.
# Layout: name size, descriptor size, note type, name, descriptor.
.section ".note.gnu.property", "a"
1313
.p2align 3
1314
# Name size (bytes between labels 0 and 1).
.long 1f - 0f
1315
# Descriptor size (bytes between labels 1 and 4).
.long 4f - 1f
1316
# Note type 5 (NT_GNU_PROPERTY_TYPE_0 in the GNU property note scheme).
.long 5
1317
0:
1318
# "GNU" encoded with .byte, since .asciz isn't supported
1319
# on Solaris.
1320
.byte 0x47
1321
.byte 0x4e
1322
.byte 0x55
1323
.byte 0
1324
1:
1325
.p2align 3
1326
# Property type 0xc0000002 (GNU_PROPERTY_X86_FEATURE_1_AND).
.long 0xc0000002
1327
# Property data size (bytes between labels 2 and 3).
.long 3f - 2f
1328
2:
1329
# Property value 3: bit 0 (IBT) and bit 1 (SHSTK) set.
.long 3
1330
3:
1331
.p2align 3
1332
4:
1333
1334