/*
 * Source: GitHub repository freebsd/freebsd-src
 * Path: blob/main/sys/crypto/openssl/amd64/rsaz-4k-avx512.S
 */
1
/* Do not modify. This file is auto-generated from rsaz-4k-avx512.pl. */
2
.text
3
4
/*
 * ossl_rsaz_amm52x40_x1_ifma256
 *
 * Register roles as used by the code below (System V AMD64 argument order):
 *   %rdi - output; ten 32-byte vectors (40 qwords) are stored here
 *   %rsi - first operand; read as ten 32-byte vectors, qword 0 also as scalar
 *   %rdx - second operand; 40 qwords, one consumed per multiplication step
 *   %rcx - third operand, same layout as %rsi; scaled by the per-step factor
 *   %r8  - 64-bit constant passed by value, used to derive the per-step factor
 *
 * NOTE(review): the symbol name suggests an "almost Montgomery multiplication"
 * on 40 digits of 52 bits (%rcx = modulus, %r8 = k0); confirm against the
 * generator script named in the file header.
 *
 * Clobbers %rax, %rcx, %rdx, %r8-%r11, %k1-%k7, %ymm0-%ymm12, %ymm23-%ymm29
 * and flags. %rbx, %rbp, %r12-%r15 are saved on entry and restored on exit.
 */
.globl ossl_rsaz_amm52x40_x1_ifma256
5
.type ossl_rsaz_amm52x40_x1_ifma256,@function
6
.align 32
7
ossl_rsaz_amm52x40_x1_ifma256:
8
.cfi_startproc
9
/* Bytes f3 0f 1e fa encode endbr64. */
.byte 243,15,30,250
10
/* Save callee-saved registers; each push is mirrored in the CFI. */
pushq %rbx
11
.cfi_adjust_cfa_offset 8
12
.cfi_offset %rbx,-16
13
pushq %rbp
14
.cfi_adjust_cfa_offset 8
15
.cfi_offset %rbp,-24
16
pushq %r12
17
.cfi_adjust_cfa_offset 8
18
.cfi_offset %r12,-32
19
pushq %r13
20
.cfi_adjust_cfa_offset 8
21
.cfi_offset %r13,-40
22
pushq %r14
23
.cfi_adjust_cfa_offset 8
24
.cfi_offset %r14,-48
25
pushq %r15
26
.cfi_adjust_cfa_offset 8
27
.cfi_offset %r15,-56
28
29
/*
 * Zero ymm0 and the ten accumulator registers ymm3..ymm12 (40 qword lanes).
 * ymm0 is not written inside the loop and supplies the zero top lane for
 * the valignq shifts.
 */
vpxord %ymm0,%ymm0,%ymm0
30
vmovdqa64 %ymm0,%ymm3
31
vmovdqa64 %ymm0,%ymm4
32
vmovdqa64 %ymm0,%ymm5
33
vmovdqa64 %ymm0,%ymm6
34
vmovdqa64 %ymm0,%ymm7
35
vmovdqa64 %ymm0,%ymm8
36
vmovdqa64 %ymm0,%ymm9
37
vmovdqa64 %ymm0,%ymm10
38
vmovdqa64 %ymm0,%ymm11
39
vmovdqa64 %ymm0,%ymm12
40
41
/* r9 = scalar accumulator for the lowest digit. */
xorl %r9d,%r9d
42
43
/* mulx takes %rdx as implicit source, so the %rdx operand pointer moves to r11. */
movq %rdx,%r11
44
/* rax = 2^52 - 1, the 52-bit digit mask. */
movq $0xfffffffffffff,%rax
45
46
47
/* 10 iterations x 4 unrolled steps = 40 qwords read through r11. */
movl $10,%ebx
48
49
.align 32
50
.Lloop10:
51
/* ---- Step 0: multiplier qword at 0(%r11) ---- */
movq 0(%r11),%r13
52
53
/* ymm1 = multiplier replicated in every lane. */
vpbroadcastq %r13,%ymm1
54
movq 0(%rsi),%rdx
55
/* r12:r13 = [%rsi][0] * multiplier; add into r9 with the high part/carry in r10. */
mulxq %r13,%r13,%r12
56
addq %r13,%r9
57
movq %r12,%r10
58
adcq $0,%r10
59
60
/* r13 = (r8 * r9) masked to 52 bits: the per-step factor applied to [%rcx]. */
movq %r8,%r13
61
imulq %r9,%r13
62
andq %rax,%r13
63
64
/* ymm2 = per-step factor replicated in every lane. */
vpbroadcastq %r13,%ymm2
65
movq 0(%rcx),%rdx
66
/* r10:r9 += [%rcx][0] * factor. */
mulxq %r13,%r13,%r12
67
addq %r13,%r9
68
adcq %r12,%r10
69
70
/* r9 = (r10:r9) >> 52, i.e. drop the lowest 52-bit digit of the scalar sum. */
shrq $52,%r9
71
salq $12,%r10
72
orq %r10,%r9
73
74
/* Accumulate the low 52 bits of each lane product [%rsi][j] * multiplier. */
vpmadd52luq 0(%rsi),%ymm1,%ymm3
75
vpmadd52luq 32(%rsi),%ymm1,%ymm4
76
vpmadd52luq 64(%rsi),%ymm1,%ymm5
77
vpmadd52luq 96(%rsi),%ymm1,%ymm6
78
vpmadd52luq 128(%rsi),%ymm1,%ymm7
79
vpmadd52luq 160(%rsi),%ymm1,%ymm8
80
vpmadd52luq 192(%rsi),%ymm1,%ymm9
81
vpmadd52luq 224(%rsi),%ymm1,%ymm10
82
vpmadd52luq 256(%rsi),%ymm1,%ymm11
83
vpmadd52luq 288(%rsi),%ymm1,%ymm12
84
85
/* Accumulate the low 52 bits of each lane product [%rcx][j] * factor. */
vpmadd52luq 0(%rcx),%ymm2,%ymm3
86
vpmadd52luq 32(%rcx),%ymm2,%ymm4
87
vpmadd52luq 64(%rcx),%ymm2,%ymm5
88
vpmadd52luq 96(%rcx),%ymm2,%ymm6
89
vpmadd52luq 128(%rcx),%ymm2,%ymm7
90
vpmadd52luq 160(%rcx),%ymm2,%ymm8
91
vpmadd52luq 192(%rcx),%ymm2,%ymm9
92
vpmadd52luq 224(%rcx),%ymm2,%ymm10
93
vpmadd52luq 256(%rcx),%ymm2,%ymm11
94
vpmadd52luq 288(%rcx),%ymm2,%ymm12
95
96
97
/*
 * Shift the 40-lane accumulator down by one qword lane: each register takes
 * its own lanes 1..3 plus lane 0 of the next register; ymm12 gets a zero
 * top lane from ymm0.
 */
valignq $1,%ymm3,%ymm4,%ymm3
98
valignq $1,%ymm4,%ymm5,%ymm4
99
valignq $1,%ymm5,%ymm6,%ymm5
100
valignq $1,%ymm6,%ymm7,%ymm6
101
valignq $1,%ymm7,%ymm8,%ymm7
102
valignq $1,%ymm8,%ymm9,%ymm8
103
valignq $1,%ymm9,%ymm10,%ymm9
104
valignq $1,%ymm10,%ymm11,%ymm10
105
valignq $1,%ymm11,%ymm12,%ymm11
106
valignq $1,%ymm12,%ymm0,%ymm12
107
108
/* Fold the new lowest lane into the scalar accumulator r9. */
vmovq %xmm3,%r13
109
addq %r13,%r9
110
111
/* Accumulate the high 52 bits of the same products, after the lane shift. */
vpmadd52huq 0(%rsi),%ymm1,%ymm3
112
vpmadd52huq 32(%rsi),%ymm1,%ymm4
113
vpmadd52huq 64(%rsi),%ymm1,%ymm5
114
vpmadd52huq 96(%rsi),%ymm1,%ymm6
115
vpmadd52huq 128(%rsi),%ymm1,%ymm7
116
vpmadd52huq 160(%rsi),%ymm1,%ymm8
117
vpmadd52huq 192(%rsi),%ymm1,%ymm9
118
vpmadd52huq 224(%rsi),%ymm1,%ymm10
119
vpmadd52huq 256(%rsi),%ymm1,%ymm11
120
vpmadd52huq 288(%rsi),%ymm1,%ymm12
121
122
vpmadd52huq 0(%rcx),%ymm2,%ymm3
123
vpmadd52huq 32(%rcx),%ymm2,%ymm4
124
vpmadd52huq 64(%rcx),%ymm2,%ymm5
125
vpmadd52huq 96(%rcx),%ymm2,%ymm6
126
vpmadd52huq 128(%rcx),%ymm2,%ymm7
127
vpmadd52huq 160(%rcx),%ymm2,%ymm8
128
vpmadd52huq 192(%rcx),%ymm2,%ymm9
129
vpmadd52huq 224(%rcx),%ymm2,%ymm10
130
vpmadd52huq 256(%rcx),%ymm2,%ymm11
131
vpmadd52huq 288(%rcx),%ymm2,%ymm12
132
/* ---- Step 1: multiplier qword at 8(%r11); same sequence as step 0 ---- */
movq 8(%r11),%r13
133
134
vpbroadcastq %r13,%ymm1
135
movq 0(%rsi),%rdx
136
mulxq %r13,%r13,%r12
137
addq %r13,%r9
138
movq %r12,%r10
139
adcq $0,%r10
140
141
movq %r8,%r13
142
imulq %r9,%r13
143
andq %rax,%r13
144
145
vpbroadcastq %r13,%ymm2
146
movq 0(%rcx),%rdx
147
mulxq %r13,%r13,%r12
148
addq %r13,%r9
149
adcq %r12,%r10
150
151
shrq $52,%r9
152
salq $12,%r10
153
orq %r10,%r9
154
155
vpmadd52luq 0(%rsi),%ymm1,%ymm3
156
vpmadd52luq 32(%rsi),%ymm1,%ymm4
157
vpmadd52luq 64(%rsi),%ymm1,%ymm5
158
vpmadd52luq 96(%rsi),%ymm1,%ymm6
159
vpmadd52luq 128(%rsi),%ymm1,%ymm7
160
vpmadd52luq 160(%rsi),%ymm1,%ymm8
161
vpmadd52luq 192(%rsi),%ymm1,%ymm9
162
vpmadd52luq 224(%rsi),%ymm1,%ymm10
163
vpmadd52luq 256(%rsi),%ymm1,%ymm11
164
vpmadd52luq 288(%rsi),%ymm1,%ymm12
165
166
vpmadd52luq 0(%rcx),%ymm2,%ymm3
167
vpmadd52luq 32(%rcx),%ymm2,%ymm4
168
vpmadd52luq 64(%rcx),%ymm2,%ymm5
169
vpmadd52luq 96(%rcx),%ymm2,%ymm6
170
vpmadd52luq 128(%rcx),%ymm2,%ymm7
171
vpmadd52luq 160(%rcx),%ymm2,%ymm8
172
vpmadd52luq 192(%rcx),%ymm2,%ymm9
173
vpmadd52luq 224(%rcx),%ymm2,%ymm10
174
vpmadd52luq 256(%rcx),%ymm2,%ymm11
175
vpmadd52luq 288(%rcx),%ymm2,%ymm12
176
177
178
valignq $1,%ymm3,%ymm4,%ymm3
179
valignq $1,%ymm4,%ymm5,%ymm4
180
valignq $1,%ymm5,%ymm6,%ymm5
181
valignq $1,%ymm6,%ymm7,%ymm6
182
valignq $1,%ymm7,%ymm8,%ymm7
183
valignq $1,%ymm8,%ymm9,%ymm8
184
valignq $1,%ymm9,%ymm10,%ymm9
185
valignq $1,%ymm10,%ymm11,%ymm10
186
valignq $1,%ymm11,%ymm12,%ymm11
187
valignq $1,%ymm12,%ymm0,%ymm12
188
189
vmovq %xmm3,%r13
190
addq %r13,%r9
191
192
vpmadd52huq 0(%rsi),%ymm1,%ymm3
193
vpmadd52huq 32(%rsi),%ymm1,%ymm4
194
vpmadd52huq 64(%rsi),%ymm1,%ymm5
195
vpmadd52huq 96(%rsi),%ymm1,%ymm6
196
vpmadd52huq 128(%rsi),%ymm1,%ymm7
197
vpmadd52huq 160(%rsi),%ymm1,%ymm8
198
vpmadd52huq 192(%rsi),%ymm1,%ymm9
199
vpmadd52huq 224(%rsi),%ymm1,%ymm10
200
vpmadd52huq 256(%rsi),%ymm1,%ymm11
201
vpmadd52huq 288(%rsi),%ymm1,%ymm12
202
203
vpmadd52huq 0(%rcx),%ymm2,%ymm3
204
vpmadd52huq 32(%rcx),%ymm2,%ymm4
205
vpmadd52huq 64(%rcx),%ymm2,%ymm5
206
vpmadd52huq 96(%rcx),%ymm2,%ymm6
207
vpmadd52huq 128(%rcx),%ymm2,%ymm7
208
vpmadd52huq 160(%rcx),%ymm2,%ymm8
209
vpmadd52huq 192(%rcx),%ymm2,%ymm9
210
vpmadd52huq 224(%rcx),%ymm2,%ymm10
211
vpmadd52huq 256(%rcx),%ymm2,%ymm11
212
vpmadd52huq 288(%rcx),%ymm2,%ymm12
213
/* ---- Step 2: multiplier qword at 16(%r11); same sequence as step 0 ---- */
movq 16(%r11),%r13
214
215
vpbroadcastq %r13,%ymm1
216
movq 0(%rsi),%rdx
217
mulxq %r13,%r13,%r12
218
addq %r13,%r9
219
movq %r12,%r10
220
adcq $0,%r10
221
222
movq %r8,%r13
223
imulq %r9,%r13
224
andq %rax,%r13
225
226
vpbroadcastq %r13,%ymm2
227
movq 0(%rcx),%rdx
228
mulxq %r13,%r13,%r12
229
addq %r13,%r9
230
adcq %r12,%r10
231
232
shrq $52,%r9
233
salq $12,%r10
234
orq %r10,%r9
235
236
vpmadd52luq 0(%rsi),%ymm1,%ymm3
237
vpmadd52luq 32(%rsi),%ymm1,%ymm4
238
vpmadd52luq 64(%rsi),%ymm1,%ymm5
239
vpmadd52luq 96(%rsi),%ymm1,%ymm6
240
vpmadd52luq 128(%rsi),%ymm1,%ymm7
241
vpmadd52luq 160(%rsi),%ymm1,%ymm8
242
vpmadd52luq 192(%rsi),%ymm1,%ymm9
243
vpmadd52luq 224(%rsi),%ymm1,%ymm10
244
vpmadd52luq 256(%rsi),%ymm1,%ymm11
245
vpmadd52luq 288(%rsi),%ymm1,%ymm12
246
247
vpmadd52luq 0(%rcx),%ymm2,%ymm3
248
vpmadd52luq 32(%rcx),%ymm2,%ymm4
249
vpmadd52luq 64(%rcx),%ymm2,%ymm5
250
vpmadd52luq 96(%rcx),%ymm2,%ymm6
251
vpmadd52luq 128(%rcx),%ymm2,%ymm7
252
vpmadd52luq 160(%rcx),%ymm2,%ymm8
253
vpmadd52luq 192(%rcx),%ymm2,%ymm9
254
vpmadd52luq 224(%rcx),%ymm2,%ymm10
255
vpmadd52luq 256(%rcx),%ymm2,%ymm11
256
vpmadd52luq 288(%rcx),%ymm2,%ymm12
257
258
259
valignq $1,%ymm3,%ymm4,%ymm3
260
valignq $1,%ymm4,%ymm5,%ymm4
261
valignq $1,%ymm5,%ymm6,%ymm5
262
valignq $1,%ymm6,%ymm7,%ymm6
263
valignq $1,%ymm7,%ymm8,%ymm7
264
valignq $1,%ymm8,%ymm9,%ymm8
265
valignq $1,%ymm9,%ymm10,%ymm9
266
valignq $1,%ymm10,%ymm11,%ymm10
267
valignq $1,%ymm11,%ymm12,%ymm11
268
valignq $1,%ymm12,%ymm0,%ymm12
269
270
vmovq %xmm3,%r13
271
addq %r13,%r9
272
273
vpmadd52huq 0(%rsi),%ymm1,%ymm3
274
vpmadd52huq 32(%rsi),%ymm1,%ymm4
275
vpmadd52huq 64(%rsi),%ymm1,%ymm5
276
vpmadd52huq 96(%rsi),%ymm1,%ymm6
277
vpmadd52huq 128(%rsi),%ymm1,%ymm7
278
vpmadd52huq 160(%rsi),%ymm1,%ymm8
279
vpmadd52huq 192(%rsi),%ymm1,%ymm9
280
vpmadd52huq 224(%rsi),%ymm1,%ymm10
281
vpmadd52huq 256(%rsi),%ymm1,%ymm11
282
vpmadd52huq 288(%rsi),%ymm1,%ymm12
283
284
vpmadd52huq 0(%rcx),%ymm2,%ymm3
285
vpmadd52huq 32(%rcx),%ymm2,%ymm4
286
vpmadd52huq 64(%rcx),%ymm2,%ymm5
287
vpmadd52huq 96(%rcx),%ymm2,%ymm6
288
vpmadd52huq 128(%rcx),%ymm2,%ymm7
289
vpmadd52huq 160(%rcx),%ymm2,%ymm8
290
vpmadd52huq 192(%rcx),%ymm2,%ymm9
291
vpmadd52huq 224(%rcx),%ymm2,%ymm10
292
vpmadd52huq 256(%rcx),%ymm2,%ymm11
293
vpmadd52huq 288(%rcx),%ymm2,%ymm12
294
/* ---- Step 3: multiplier qword at 24(%r11); same sequence as step 0 ---- */
movq 24(%r11),%r13
295
296
vpbroadcastq %r13,%ymm1
297
movq 0(%rsi),%rdx
298
mulxq %r13,%r13,%r12
299
addq %r13,%r9
300
movq %r12,%r10
301
adcq $0,%r10
302
303
movq %r8,%r13
304
imulq %r9,%r13
305
andq %rax,%r13
306
307
vpbroadcastq %r13,%ymm2
308
movq 0(%rcx),%rdx
309
mulxq %r13,%r13,%r12
310
addq %r13,%r9
311
adcq %r12,%r10
312
313
shrq $52,%r9
314
salq $12,%r10
315
orq %r10,%r9
316
317
vpmadd52luq 0(%rsi),%ymm1,%ymm3
318
vpmadd52luq 32(%rsi),%ymm1,%ymm4
319
vpmadd52luq 64(%rsi),%ymm1,%ymm5
320
vpmadd52luq 96(%rsi),%ymm1,%ymm6
321
vpmadd52luq 128(%rsi),%ymm1,%ymm7
322
vpmadd52luq 160(%rsi),%ymm1,%ymm8
323
vpmadd52luq 192(%rsi),%ymm1,%ymm9
324
vpmadd52luq 224(%rsi),%ymm1,%ymm10
325
vpmadd52luq 256(%rsi),%ymm1,%ymm11
326
vpmadd52luq 288(%rsi),%ymm1,%ymm12
327
328
vpmadd52luq 0(%rcx),%ymm2,%ymm3
329
vpmadd52luq 32(%rcx),%ymm2,%ymm4
330
vpmadd52luq 64(%rcx),%ymm2,%ymm5
331
vpmadd52luq 96(%rcx),%ymm2,%ymm6
332
vpmadd52luq 128(%rcx),%ymm2,%ymm7
333
vpmadd52luq 160(%rcx),%ymm2,%ymm8
334
vpmadd52luq 192(%rcx),%ymm2,%ymm9
335
vpmadd52luq 224(%rcx),%ymm2,%ymm10
336
vpmadd52luq 256(%rcx),%ymm2,%ymm11
337
vpmadd52luq 288(%rcx),%ymm2,%ymm12
338
339
340
valignq $1,%ymm3,%ymm4,%ymm3
341
valignq $1,%ymm4,%ymm5,%ymm4
342
valignq $1,%ymm5,%ymm6,%ymm5
343
valignq $1,%ymm6,%ymm7,%ymm6
344
valignq $1,%ymm7,%ymm8,%ymm7
345
valignq $1,%ymm8,%ymm9,%ymm8
346
valignq $1,%ymm9,%ymm10,%ymm9
347
valignq $1,%ymm10,%ymm11,%ymm10
348
valignq $1,%ymm11,%ymm12,%ymm11
349
valignq $1,%ymm12,%ymm0,%ymm12
350
351
vmovq %xmm3,%r13
352
addq %r13,%r9
353
354
vpmadd52huq 0(%rsi),%ymm1,%ymm3
355
vpmadd52huq 32(%rsi),%ymm1,%ymm4
356
vpmadd52huq 64(%rsi),%ymm1,%ymm5
357
vpmadd52huq 96(%rsi),%ymm1,%ymm6
358
vpmadd52huq 128(%rsi),%ymm1,%ymm7
359
vpmadd52huq 160(%rsi),%ymm1,%ymm8
360
vpmadd52huq 192(%rsi),%ymm1,%ymm9
361
vpmadd52huq 224(%rsi),%ymm1,%ymm10
362
vpmadd52huq 256(%rsi),%ymm1,%ymm11
363
vpmadd52huq 288(%rsi),%ymm1,%ymm12
364
365
vpmadd52huq 0(%rcx),%ymm2,%ymm3
366
vpmadd52huq 32(%rcx),%ymm2,%ymm4
367
vpmadd52huq 64(%rcx),%ymm2,%ymm5
368
vpmadd52huq 96(%rcx),%ymm2,%ymm6
369
vpmadd52huq 128(%rcx),%ymm2,%ymm7
370
vpmadd52huq 160(%rcx),%ymm2,%ymm8
371
vpmadd52huq 192(%rcx),%ymm2,%ymm9
372
vpmadd52huq 224(%rcx),%ymm2,%ymm10
373
vpmadd52huq 256(%rcx),%ymm2,%ymm11
374
vpmadd52huq 288(%rcx),%ymm2,%ymm12
375
/* Advance the multiplier pointer by four qwords and loop. */
leaq 32(%r11),%r11
376
decl %ebx
377
jne .Lloop10
378
379
/* Put the scalar accumulator r9 back into lane 0 of ymm3 (blend mask 3 = low two dwords). */
vpbroadcastq %r9,%ymm0
380
vpblendd $3,%ymm0,%ymm3,%ymm3
381
382
383
384
/* ---- Normalization: extract the bits above bit 51 of every lane ---- */
vpsrlq $52,%ymm3,%ymm0
385
vpsrlq $52,%ymm4,%ymm1
386
vpsrlq $52,%ymm5,%ymm2
387
vpsrlq $52,%ymm6,%ymm23
388
vpsrlq $52,%ymm7,%ymm24
389
vpsrlq $52,%ymm8,%ymm25
390
vpsrlq $52,%ymm9,%ymm26
391
vpsrlq $52,%ymm10,%ymm27
392
vpsrlq $52,%ymm11,%ymm28
393
vpsrlq $52,%ymm12,%ymm29
394
395
396
/*
 * Move the extracted carries up by one lane: each register takes lane 3 of
 * the register below it followed by its own lanes 0..2. The lowest lane is
 * filled from .Lzeros (label defined outside this function).
 */
valignq $3,%ymm28,%ymm29,%ymm29
397
valignq $3,%ymm27,%ymm28,%ymm28
398
valignq $3,%ymm26,%ymm27,%ymm27
399
valignq $3,%ymm25,%ymm26,%ymm26
400
valignq $3,%ymm24,%ymm25,%ymm25
401
valignq $3,%ymm23,%ymm24,%ymm24
402
valignq $3,%ymm2,%ymm23,%ymm23
403
valignq $3,%ymm1,%ymm2,%ymm2
404
valignq $3,%ymm0,%ymm1,%ymm1
405
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
406
407
408
/* Keep only the low 52 bits of every accumulator lane. */
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
409
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
410
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
411
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
412
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
413
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
414
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
415
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
416
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
417
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
418
419
420
/* Add the shifted carries into the next-higher lanes. */
vpaddq %ymm0,%ymm3,%ymm3
421
vpaddq %ymm1,%ymm4,%ymm4
422
vpaddq %ymm2,%ymm5,%ymm5
423
vpaddq %ymm23,%ymm6,%ymm6
424
vpaddq %ymm24,%ymm7,%ymm7
425
vpaddq %ymm25,%ymm8,%ymm8
426
vpaddq %ymm26,%ymm9,%ymm9
427
vpaddq %ymm27,%ymm10,%ymm10
428
vpaddq %ymm28,%ymm11,%ymm11
429
vpaddq %ymm29,%ymm12,%ymm12
430
431
432
433
/*
 * Build a 40-bit "lane > 2^52-1" mask (predicate 6 = unsigned not-less-or-
 * equal). Two 4-bit compare results are packed per byte; the bytes live in
 * r14b (lanes 0-7), r13b, r12b, r11b and r10b (lanes 32-39).
 */
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
434
vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
435
kmovb %k1,%r14d
436
kmovb %k2,%r13d
437
shlb $4,%r13b
438
orb %r13b,%r14b
439
440
vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
441
vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
442
kmovb %k1,%r13d
443
kmovb %k2,%r12d
444
shlb $4,%r12b
445
orb %r12b,%r13b
446
447
vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
448
vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
449
kmovb %k1,%r12d
450
kmovb %k2,%r11d
451
shlb $4,%r11b
452
orb %r11b,%r12b
453
454
vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
455
vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
456
kmovb %k1,%r11d
457
kmovb %k2,%r10d
458
shlb $4,%r10b
459
orb %r10b,%r11b
460
461
vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
462
vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
463
kmovb %k1,%r10d
464
kmovb %k2,%r9d
465
shlb $4,%r9b
466
orb %r9b,%r10b
467
468
/* Double the 40-bit mask through the carry flag: shifts it left by one lane. */
addb %r14b,%r14b
469
adcb %r13b,%r13b
470
adcb %r12b,%r12b
471
adcb %r11b,%r11b
472
adcb %r10b,%r10b
473
474
475
/*
 * Build the matching 40-bit "lane == 2^52-1" mask (predicate 0 = equal) in
 * r9b, r8b, dl, cl and bl. The pointer arguments in those registers are no
 * longer needed at this point.
 */
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
476
vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
477
kmovb %k1,%r9d
478
kmovb %k2,%r8d
479
shlb $4,%r8b
480
orb %r8b,%r9b
481
482
vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
483
vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
484
kmovb %k1,%r8d
485
kmovb %k2,%edx
486
shlb $4,%dl
487
orb %dl,%r8b
488
489
vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
490
vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
491
kmovb %k1,%edx
492
kmovb %k2,%ecx
493
shlb $4,%cl
494
orb %cl,%dl
495
496
vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
497
vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
498
kmovb %k1,%ecx
499
kmovb %k2,%ebx
500
shlb $4,%bl
501
orb %bl,%cl
502
503
vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
504
vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
505
kmovb %k1,%ebx
506
kmovb %k2,%eax
507
shlb $4,%al
508
orb %al,%bl
509
510
/*
 * Add the "equal" mask to the shifted "greater" mask so a carry ripples
 * through runs of lanes equal to 2^52-1, then XOR with the "equal" mask.
 * The result marks the lanes that must be incremented.
 */
addb %r9b,%r14b
511
adcb %r8b,%r13b
512
adcb %dl,%r12b
513
adcb %cl,%r11b
514
adcb %bl,%r10b
515
516
xorb %r9b,%r14b
517
xorb %r8b,%r13b
518
xorb %dl,%r12b
519
xorb %cl,%r11b
520
xorb %bl,%r10b
521
522
/* Distribute the mask, four lanes at a time, to k1..k7 for ymm3..ymm9. */
kmovb %r14d,%k1
523
shrb $4,%r14b
524
kmovb %r14d,%k2
525
kmovb %r13d,%k3
526
shrb $4,%r13b
527
kmovb %r13d,%k4
528
kmovb %r12d,%k5
529
shrb $4,%r12b
530
kmovb %r12d,%k6
531
kmovb %r11d,%k7
532
533
/*
 * Masked subtract of 2^52-1 followed by AND with 2^52-1 increments the
 * selected lanes modulo 2^52; the AND also truncates every lane to 52 bits.
 */
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
534
vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
535
vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
536
vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
537
vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
538
vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
539
vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
540
541
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
542
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
543
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
544
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
545
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
546
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
547
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
548
549
/* Same correction for the remaining registers ymm10..ymm12, reusing k1..k3. */
shrb $4,%r11b
550
kmovb %r11d,%k1
551
kmovb %r10d,%k2
552
shrb $4,%r10b
553
kmovb %r10d,%k3
554
555
vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
556
vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2}
557
vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3}
558
559
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
560
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
561
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
562
563
/* Store the 40 result qwords to the output buffer (unaligned stores). */
vmovdqu64 %ymm3,0(%rdi)
564
vmovdqu64 %ymm4,32(%rdi)
565
vmovdqu64 %ymm5,64(%rdi)
566
vmovdqu64 %ymm6,96(%rdi)
567
vmovdqu64 %ymm7,128(%rdi)
568
vmovdqu64 %ymm8,160(%rdi)
569
vmovdqu64 %ymm9,192(%rdi)
570
vmovdqu64 %ymm10,224(%rdi)
571
vmovdqu64 %ymm11,256(%rdi)
572
vmovdqu64 %ymm12,288(%rdi)
573
574
vzeroupper
575
/* Epilogue: reload the six saved registers through rax, then release 48 bytes of stack. */
leaq (%rsp),%rax
576
.cfi_def_cfa_register %rax
577
movq 0(%rax),%r15
578
.cfi_restore %r15
579
movq 8(%rax),%r14
580
.cfi_restore %r14
581
movq 16(%rax),%r13
582
.cfi_restore %r13
583
movq 24(%rax),%r12
584
.cfi_restore %r12
585
movq 32(%rax),%rbp
586
.cfi_restore %rbp
587
movq 40(%rax),%rbx
588
.cfi_restore %rbx
589
leaq 48(%rax),%rsp
590
.cfi_def_cfa %rsp,8
591
.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:
592
593
/* Bytes f3 c3 encode "rep ret". */
.byte 0xf3,0xc3
594
.cfi_endproc
595
.size ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
596
/*
 * .Lmask52x4: four qwords, each 2^52 - 1 (low 52 bits set), aligned to 32
 * bytes. Used as a 256-bit memory operand by the vpandq, vpcmpuq and vpsubq
 * instructions in this file.
 */
.section .rodata
597
.align 32
598
.Lmask52x4:
599
.quad 0xfffffffffffff
600
.quad 0xfffffffffffff
601
.quad 0xfffffffffffff
602
.quad 0xfffffffffffff
603
/* Switch back to the code section for the functions that follow. */
.text
604
605
/*
 * ossl_rsaz_amm52x40_x2_ifma256
 *
 * Runs two independent 40-qword multiply-accumulate chains in one loop. The
 * second chain uses the same pointers at byte offset 320.
 *
 * Register roles as used by the code below (System V AMD64 argument order):
 *   %rdi - output; 2 x 40 qwords are stored (offsets 0..319 and 320..639)
 *   %rsi - first operand, 2 x 40 qwords
 *   %rdx - second operand, 2 x 40 qwords; one qword of each half per iteration
 *   %rcx - third operand, 2 x 40 qwords; scaled by the per-step factor
 *   %r8  - pointer to two qwords; 0(%r8) serves chain 0 and 8(%r8) chain 1
 *
 * NOTE(review): the symbol name suggests two parallel "almost Montgomery
 * multiplications" on 40 digits of 52 bits (%rcx = moduli, %r8 = k0 pair);
 * confirm against the generator script named in the file header.
 *
 * Clobbers %rax, %rcx, %rdx, %r8-%r11, %k1-%k7, %ymm0-%ymm29 and flags.
 * %rbx, %rbp, %r12-%r15 are saved on entry and restored on exit.
 */
.globl ossl_rsaz_amm52x40_x2_ifma256
606
.type ossl_rsaz_amm52x40_x2_ifma256,@function
607
.align 32
608
ossl_rsaz_amm52x40_x2_ifma256:
609
.cfi_startproc
610
/* Bytes f3 0f 1e fa encode endbr64. */
.byte 243,15,30,250
611
/* Save callee-saved registers; each push is mirrored in the CFI. */
pushq %rbx
612
.cfi_adjust_cfa_offset 8
613
.cfi_offset %rbx,-16
614
pushq %rbp
615
.cfi_adjust_cfa_offset 8
616
.cfi_offset %rbp,-24
617
pushq %r12
618
.cfi_adjust_cfa_offset 8
619
.cfi_offset %r12,-32
620
pushq %r13
621
.cfi_adjust_cfa_offset 8
622
.cfi_offset %r13,-40
623
pushq %r14
624
.cfi_adjust_cfa_offset 8
625
.cfi_offset %r14,-48
626
pushq %r15
627
.cfi_adjust_cfa_offset 8
628
.cfi_offset %r15,-56
629
630
/* Zero ymm0 and the chain-0 accumulator ymm3..ymm12; ymm0 stays zero during the loop. */
vpxord %ymm0,%ymm0,%ymm0
631
vmovdqa64 %ymm0,%ymm3
632
vmovdqa64 %ymm0,%ymm4
633
vmovdqa64 %ymm0,%ymm5
634
vmovdqa64 %ymm0,%ymm6
635
vmovdqa64 %ymm0,%ymm7
636
vmovdqa64 %ymm0,%ymm8
637
vmovdqa64 %ymm0,%ymm9
638
vmovdqa64 %ymm0,%ymm10
639
vmovdqa64 %ymm0,%ymm11
640
vmovdqa64 %ymm0,%ymm12
641
642
/* Zero the chain-1 accumulator ymm13..ymm22. */
vmovdqa64 %ymm0,%ymm13
643
vmovdqa64 %ymm0,%ymm14
644
vmovdqa64 %ymm0,%ymm15
645
vmovdqa64 %ymm0,%ymm16
646
vmovdqa64 %ymm0,%ymm17
647
vmovdqa64 %ymm0,%ymm18
648
vmovdqa64 %ymm0,%ymm19
649
vmovdqa64 %ymm0,%ymm20
650
vmovdqa64 %ymm0,%ymm21
651
vmovdqa64 %ymm0,%ymm22
652
653
654
/* r9 / r15 = scalar accumulators for the lowest digit of chain 0 / chain 1. */
xorl %r9d,%r9d
655
xorl %r15d,%r15d
656
657
/* mulx takes %rdx as implicit source, so the %rdx operand pointer moves to r11. */
movq %rdx,%r11
658
/* rax = 2^52 - 1, the 52-bit digit mask. */
movq $0xfffffffffffff,%rax
659
660
/* 40 iterations, one multiplier qword per chain per iteration. */
movl $40,%ebx
661
662
.align 32
663
.Lloop40:
664
/* ---- Chain 0: multiplier qword at 0(%r11) ---- */
movq 0(%r11),%r13
665
666
/* ymm1 = multiplier replicated in every lane. */
vpbroadcastq %r13,%ymm1
667
movq 0(%rsi),%rdx
668
/* r12:r13 = [%rsi][0] * multiplier; add into r9 with the high part/carry in r10. */
mulxq %r13,%r13,%r12
669
addq %r13,%r9
670
movq %r12,%r10
671
adcq $0,%r10
672
673
/* r13 = (qword at 0(%r8) * r9) masked to 52 bits: the per-step factor. */
movq (%r8),%r13
674
imulq %r9,%r13
675
andq %rax,%r13
676
677
/* ymm2 = per-step factor replicated in every lane. */
vpbroadcastq %r13,%ymm2
678
movq 0(%rcx),%rdx
679
/* r10:r9 += [%rcx][0] * factor. */
mulxq %r13,%r13,%r12
680
addq %r13,%r9
681
adcq %r12,%r10
682
683
/* r9 = (r10:r9) >> 52. */
shrq $52,%r9
684
salq $12,%r10
685
orq %r10,%r9
686
687
/* Accumulate the low 52 bits of the lane products. */
vpmadd52luq 0(%rsi),%ymm1,%ymm3
688
vpmadd52luq 32(%rsi),%ymm1,%ymm4
689
vpmadd52luq 64(%rsi),%ymm1,%ymm5
690
vpmadd52luq 96(%rsi),%ymm1,%ymm6
691
vpmadd52luq 128(%rsi),%ymm1,%ymm7
692
vpmadd52luq 160(%rsi),%ymm1,%ymm8
693
vpmadd52luq 192(%rsi),%ymm1,%ymm9
694
vpmadd52luq 224(%rsi),%ymm1,%ymm10
695
vpmadd52luq 256(%rsi),%ymm1,%ymm11
696
vpmadd52luq 288(%rsi),%ymm1,%ymm12
697
698
vpmadd52luq 0(%rcx),%ymm2,%ymm3
699
vpmadd52luq 32(%rcx),%ymm2,%ymm4
700
vpmadd52luq 64(%rcx),%ymm2,%ymm5
701
vpmadd52luq 96(%rcx),%ymm2,%ymm6
702
vpmadd52luq 128(%rcx),%ymm2,%ymm7
703
vpmadd52luq 160(%rcx),%ymm2,%ymm8
704
vpmadd52luq 192(%rcx),%ymm2,%ymm9
705
vpmadd52luq 224(%rcx),%ymm2,%ymm10
706
vpmadd52luq 256(%rcx),%ymm2,%ymm11
707
vpmadd52luq 288(%rcx),%ymm2,%ymm12
708
709
710
/* Shift the chain-0 accumulator down by one qword lane; zero enters at the top from ymm0. */
valignq $1,%ymm3,%ymm4,%ymm3
711
valignq $1,%ymm4,%ymm5,%ymm4
712
valignq $1,%ymm5,%ymm6,%ymm5
713
valignq $1,%ymm6,%ymm7,%ymm6
714
valignq $1,%ymm7,%ymm8,%ymm7
715
valignq $1,%ymm8,%ymm9,%ymm8
716
valignq $1,%ymm9,%ymm10,%ymm9
717
valignq $1,%ymm10,%ymm11,%ymm10
718
valignq $1,%ymm11,%ymm12,%ymm11
719
valignq $1,%ymm12,%ymm0,%ymm12
720
721
/* Fold the new lowest lane into the scalar accumulator r9. */
vmovq %xmm3,%r13
722
addq %r13,%r9
723
724
/* Accumulate the high 52 bits of the same products, after the lane shift. */
vpmadd52huq 0(%rsi),%ymm1,%ymm3
725
vpmadd52huq 32(%rsi),%ymm1,%ymm4
726
vpmadd52huq 64(%rsi),%ymm1,%ymm5
727
vpmadd52huq 96(%rsi),%ymm1,%ymm6
728
vpmadd52huq 128(%rsi),%ymm1,%ymm7
729
vpmadd52huq 160(%rsi),%ymm1,%ymm8
730
vpmadd52huq 192(%rsi),%ymm1,%ymm9
731
vpmadd52huq 224(%rsi),%ymm1,%ymm10
732
vpmadd52huq 256(%rsi),%ymm1,%ymm11
733
vpmadd52huq 288(%rsi),%ymm1,%ymm12
734
735
vpmadd52huq 0(%rcx),%ymm2,%ymm3
736
vpmadd52huq 32(%rcx),%ymm2,%ymm4
737
vpmadd52huq 64(%rcx),%ymm2,%ymm5
738
vpmadd52huq 96(%rcx),%ymm2,%ymm6
739
vpmadd52huq 128(%rcx),%ymm2,%ymm7
740
vpmadd52huq 160(%rcx),%ymm2,%ymm8
741
vpmadd52huq 192(%rcx),%ymm2,%ymm9
742
vpmadd52huq 224(%rcx),%ymm2,%ymm10
743
vpmadd52huq 256(%rcx),%ymm2,%ymm11
744
vpmadd52huq 288(%rcx),%ymm2,%ymm12
745
/*
 * ---- Chain 1: same sequence on the second halves (byte offset 320), with
 * r15 as scalar accumulator, the qword at 8(%r8) as constant and
 * ymm13..ymm22 as vector accumulator ----
 */
movq 320(%r11),%r13
746
747
vpbroadcastq %r13,%ymm1
748
movq 320(%rsi),%rdx
749
mulxq %r13,%r13,%r12
750
addq %r13,%r15
751
movq %r12,%r10
752
adcq $0,%r10
753
754
movq 8(%r8),%r13
755
imulq %r15,%r13
756
andq %rax,%r13
757
758
vpbroadcastq %r13,%ymm2
759
movq 320(%rcx),%rdx
760
mulxq %r13,%r13,%r12
761
addq %r13,%r15
762
adcq %r12,%r10
763
764
shrq $52,%r15
765
salq $12,%r10
766
orq %r10,%r15
767
768
vpmadd52luq 320(%rsi),%ymm1,%ymm13
769
vpmadd52luq 352(%rsi),%ymm1,%ymm14
770
vpmadd52luq 384(%rsi),%ymm1,%ymm15
771
vpmadd52luq 416(%rsi),%ymm1,%ymm16
772
vpmadd52luq 448(%rsi),%ymm1,%ymm17
773
vpmadd52luq 480(%rsi),%ymm1,%ymm18
774
vpmadd52luq 512(%rsi),%ymm1,%ymm19
775
vpmadd52luq 544(%rsi),%ymm1,%ymm20
776
vpmadd52luq 576(%rsi),%ymm1,%ymm21
777
vpmadd52luq 608(%rsi),%ymm1,%ymm22
778
779
vpmadd52luq 320(%rcx),%ymm2,%ymm13
780
vpmadd52luq 352(%rcx),%ymm2,%ymm14
781
vpmadd52luq 384(%rcx),%ymm2,%ymm15
782
vpmadd52luq 416(%rcx),%ymm2,%ymm16
783
vpmadd52luq 448(%rcx),%ymm2,%ymm17
784
vpmadd52luq 480(%rcx),%ymm2,%ymm18
785
vpmadd52luq 512(%rcx),%ymm2,%ymm19
786
vpmadd52luq 544(%rcx),%ymm2,%ymm20
787
vpmadd52luq 576(%rcx),%ymm2,%ymm21
788
vpmadd52luq 608(%rcx),%ymm2,%ymm22
789
790
791
valignq $1,%ymm13,%ymm14,%ymm13
792
valignq $1,%ymm14,%ymm15,%ymm14
793
valignq $1,%ymm15,%ymm16,%ymm15
794
valignq $1,%ymm16,%ymm17,%ymm16
795
valignq $1,%ymm17,%ymm18,%ymm17
796
valignq $1,%ymm18,%ymm19,%ymm18
797
valignq $1,%ymm19,%ymm20,%ymm19
798
valignq $1,%ymm20,%ymm21,%ymm20
799
valignq $1,%ymm21,%ymm22,%ymm21
800
valignq $1,%ymm22,%ymm0,%ymm22
801
802
vmovq %xmm13,%r13
803
addq %r13,%r15
804
805
vpmadd52huq 320(%rsi),%ymm1,%ymm13
806
vpmadd52huq 352(%rsi),%ymm1,%ymm14
807
vpmadd52huq 384(%rsi),%ymm1,%ymm15
808
vpmadd52huq 416(%rsi),%ymm1,%ymm16
809
vpmadd52huq 448(%rsi),%ymm1,%ymm17
810
vpmadd52huq 480(%rsi),%ymm1,%ymm18
811
vpmadd52huq 512(%rsi),%ymm1,%ymm19
812
vpmadd52huq 544(%rsi),%ymm1,%ymm20
813
vpmadd52huq 576(%rsi),%ymm1,%ymm21
814
vpmadd52huq 608(%rsi),%ymm1,%ymm22
815
816
vpmadd52huq 320(%rcx),%ymm2,%ymm13
817
vpmadd52huq 352(%rcx),%ymm2,%ymm14
818
vpmadd52huq 384(%rcx),%ymm2,%ymm15
819
vpmadd52huq 416(%rcx),%ymm2,%ymm16
820
vpmadd52huq 448(%rcx),%ymm2,%ymm17
821
vpmadd52huq 480(%rcx),%ymm2,%ymm18
822
vpmadd52huq 512(%rcx),%ymm2,%ymm19
823
vpmadd52huq 544(%rcx),%ymm2,%ymm20
824
vpmadd52huq 576(%rcx),%ymm2,%ymm21
825
vpmadd52huq 608(%rcx),%ymm2,%ymm22
826
/* Advance the multiplier pointer by one qword and loop. */
leaq 8(%r11),%r11
827
decl %ebx
828
jne .Lloop40
829
830
/* ==== Normalize chain 0 (ymm3..ymm12) ==== */
/* Put the scalar accumulator r9 back into lane 0 of ymm3 (blend mask 3 = low two dwords). */
vpbroadcastq %r9,%ymm0
831
vpblendd $3,%ymm0,%ymm3,%ymm3
832
833
834
835
/* Extract the bits above bit 51 of every lane. */
vpsrlq $52,%ymm3,%ymm0
836
vpsrlq $52,%ymm4,%ymm1
837
vpsrlq $52,%ymm5,%ymm2
838
vpsrlq $52,%ymm6,%ymm23
839
vpsrlq $52,%ymm7,%ymm24
840
vpsrlq $52,%ymm8,%ymm25
841
vpsrlq $52,%ymm9,%ymm26
842
vpsrlq $52,%ymm10,%ymm27
843
vpsrlq $52,%ymm11,%ymm28
844
vpsrlq $52,%ymm12,%ymm29
845
846
847
/* Move the carries up by one lane; the lowest lane is filled from .Lzeros (defined outside this function). */
valignq $3,%ymm28,%ymm29,%ymm29
848
valignq $3,%ymm27,%ymm28,%ymm28
849
valignq $3,%ymm26,%ymm27,%ymm27
850
valignq $3,%ymm25,%ymm26,%ymm26
851
valignq $3,%ymm24,%ymm25,%ymm25
852
valignq $3,%ymm23,%ymm24,%ymm24
853
valignq $3,%ymm2,%ymm23,%ymm23
854
valignq $3,%ymm1,%ymm2,%ymm2
855
valignq $3,%ymm0,%ymm1,%ymm1
856
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
857
858
859
/* Keep only the low 52 bits of every accumulator lane. */
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
860
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
861
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
862
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
863
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
864
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
865
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
866
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
867
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
868
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
869
870
871
/* Add the shifted carries into the next-higher lanes. */
vpaddq %ymm0,%ymm3,%ymm3
872
vpaddq %ymm1,%ymm4,%ymm4
873
vpaddq %ymm2,%ymm5,%ymm5
874
vpaddq %ymm23,%ymm6,%ymm6
875
vpaddq %ymm24,%ymm7,%ymm7
876
vpaddq %ymm25,%ymm8,%ymm8
877
vpaddq %ymm26,%ymm9,%ymm9
878
vpaddq %ymm27,%ymm10,%ymm10
879
vpaddq %ymm28,%ymm11,%ymm11
880
vpaddq %ymm29,%ymm12,%ymm12
881
882
883
884
/*
 * 40-bit "lane > 2^52-1" mask (predicate 6 = unsigned not-less-or-equal),
 * two 4-bit compare results per byte, in r14b, r13b, r12b, r11b, r10b.
 */
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
885
vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
886
kmovb %k1,%r14d
887
kmovb %k2,%r13d
888
shlb $4,%r13b
889
orb %r13b,%r14b
890
891
vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
892
vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
893
kmovb %k1,%r13d
894
kmovb %k2,%r12d
895
shlb $4,%r12b
896
orb %r12b,%r13b
897
898
vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
899
vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
900
kmovb %k1,%r12d
901
kmovb %k2,%r11d
902
shlb $4,%r11b
903
orb %r11b,%r12b
904
905
vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
906
vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
907
kmovb %k1,%r11d
908
kmovb %k2,%r10d
909
shlb $4,%r10b
910
orb %r10b,%r11b
911
912
vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
913
vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
914
kmovb %k1,%r10d
915
kmovb %k2,%r9d
916
shlb $4,%r9b
917
orb %r9b,%r10b
918
919
/* Double the 40-bit mask through the carry flag: shifts it left by one lane. */
addb %r14b,%r14b
920
adcb %r13b,%r13b
921
adcb %r12b,%r12b
922
adcb %r11b,%r11b
923
adcb %r10b,%r10b
924
925
926
/* Matching 40-bit "lane == 2^52-1" mask (predicate 0 = equal) in r9b, r8b, dl, cl, bl. */
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
927
vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
928
kmovb %k1,%r9d
929
kmovb %k2,%r8d
930
shlb $4,%r8b
931
orb %r8b,%r9b
932
933
vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
934
vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
935
kmovb %k1,%r8d
936
kmovb %k2,%edx
937
shlb $4,%dl
938
orb %dl,%r8b
939
940
vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
941
vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
942
kmovb %k1,%edx
943
kmovb %k2,%ecx
944
shlb $4,%cl
945
orb %cl,%dl
946
947
vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
948
vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
949
kmovb %k1,%ecx
950
kmovb %k2,%ebx
951
shlb $4,%bl
952
orb %bl,%cl
953
954
vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
955
vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
956
kmovb %k1,%ebx
957
kmovb %k2,%eax
958
shlb $4,%al
959
orb %al,%bl
960
961
/*
 * Add the "equal" mask so a carry ripples through runs of lanes equal to
 * 2^52-1, then XOR with it; the result marks the lanes to increment.
 */
addb %r9b,%r14b
962
adcb %r8b,%r13b
963
adcb %dl,%r12b
964
adcb %cl,%r11b
965
adcb %bl,%r10b
966
967
xorb %r9b,%r14b
968
xorb %r8b,%r13b
969
xorb %dl,%r12b
970
xorb %cl,%r11b
971
xorb %bl,%r10b
972
973
/* Distribute the mask, four lanes at a time, to k1..k7 for ymm3..ymm9. */
kmovb %r14d,%k1
974
shrb $4,%r14b
975
kmovb %r14d,%k2
976
kmovb %r13d,%k3
977
shrb $4,%r13b
978
kmovb %r13d,%k4
979
kmovb %r12d,%k5
980
shrb $4,%r12b
981
kmovb %r12d,%k6
982
kmovb %r11d,%k7
983
984
/* Masked subtract of 2^52-1 then AND with 2^52-1: increment selected lanes modulo 2^52. */
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
985
vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
986
vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
987
vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
988
vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
989
vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
990
vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
991
992
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
993
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
994
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
995
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
996
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
997
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
998
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
999
1000
/* Same correction for ymm10..ymm12, reusing k1..k3. */
shrb $4,%r11b
1001
kmovb %r11d,%k1
1002
kmovb %r10d,%k2
1003
shrb $4,%r10b
1004
kmovb %r10d,%k3
1005
1006
vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
1007
vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2}
1008
vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3}
1009
1010
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
1011
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
1012
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
1013
1014
/* ==== Normalize chain 1 (ymm13..ymm22); same sequence as for chain 0 ==== */
/* Put the scalar accumulator r15 back into lane 0 of ymm13. */
vpbroadcastq %r15,%ymm0
1015
vpblendd $3,%ymm0,%ymm13,%ymm13
1016
1017
1018
1019
vpsrlq $52,%ymm13,%ymm0
1020
vpsrlq $52,%ymm14,%ymm1
1021
vpsrlq $52,%ymm15,%ymm2
1022
vpsrlq $52,%ymm16,%ymm23
1023
vpsrlq $52,%ymm17,%ymm24
1024
vpsrlq $52,%ymm18,%ymm25
1025
vpsrlq $52,%ymm19,%ymm26
1026
vpsrlq $52,%ymm20,%ymm27
1027
vpsrlq $52,%ymm21,%ymm28
1028
vpsrlq $52,%ymm22,%ymm29
1029
1030
1031
valignq $3,%ymm28,%ymm29,%ymm29
1032
valignq $3,%ymm27,%ymm28,%ymm28
1033
valignq $3,%ymm26,%ymm27,%ymm27
1034
valignq $3,%ymm25,%ymm26,%ymm26
1035
valignq $3,%ymm24,%ymm25,%ymm25
1036
valignq $3,%ymm23,%ymm24,%ymm24
1037
valignq $3,%ymm2,%ymm23,%ymm23
1038
valignq $3,%ymm1,%ymm2,%ymm2
1039
valignq $3,%ymm0,%ymm1,%ymm1
1040
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
1041
1042
1043
vpandq .Lmask52x4(%rip),%ymm13,%ymm13
1044
vpandq .Lmask52x4(%rip),%ymm14,%ymm14
1045
vpandq .Lmask52x4(%rip),%ymm15,%ymm15
1046
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
1047
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
1048
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
1049
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
1050
vpandq .Lmask52x4(%rip),%ymm20,%ymm20
1051
vpandq .Lmask52x4(%rip),%ymm21,%ymm21
1052
vpandq .Lmask52x4(%rip),%ymm22,%ymm22
1053
1054
1055
vpaddq %ymm0,%ymm13,%ymm13
1056
vpaddq %ymm1,%ymm14,%ymm14
1057
vpaddq %ymm2,%ymm15,%ymm15
1058
vpaddq %ymm23,%ymm16,%ymm16
1059
vpaddq %ymm24,%ymm17,%ymm17
1060
vpaddq %ymm25,%ymm18,%ymm18
1061
vpaddq %ymm26,%ymm19,%ymm19
1062
vpaddq %ymm27,%ymm20,%ymm20
1063
vpaddq %ymm28,%ymm21,%ymm21
1064
vpaddq %ymm29,%ymm22,%ymm22
1065
1066
1067
1068
/* "Greater than 2^52-1" lane mask for chain 1. */
vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1
1069
vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2
1070
kmovb %k1,%r14d
1071
kmovb %k2,%r13d
1072
shlb $4,%r13b
1073
orb %r13b,%r14b
1074
1075
vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1
1076
vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
1077
kmovb %k1,%r13d
1078
kmovb %k2,%r12d
1079
shlb $4,%r12b
1080
orb %r12b,%r13b
1081
1082
vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1
1083
vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2
1084
kmovb %k1,%r12d
1085
kmovb %k2,%r11d
1086
shlb $4,%r11b
1087
orb %r11b,%r12b
1088
1089
vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k1
1090
vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2
1091
kmovb %k1,%r11d
1092
kmovb %k2,%r10d
1093
shlb $4,%r10b
1094
orb %r10b,%r11b
1095
1096
vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k1
1097
vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k2
1098
kmovb %k1,%r10d
1099
kmovb %k2,%r9d
1100
shlb $4,%r9b
1101
orb %r9b,%r10b
1102
1103
addb %r14b,%r14b
1104
adcb %r13b,%r13b
1105
adcb %r12b,%r12b
1106
adcb %r11b,%r11b
1107
adcb %r10b,%r10b
1108
1109
1110
/* "Equal to 2^52-1" lane mask for chain 1. */
vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1
1111
vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2
1112
kmovb %k1,%r9d
1113
kmovb %k2,%r8d
1114
shlb $4,%r8b
1115
orb %r8b,%r9b
1116
1117
vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1
1118
vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
1119
kmovb %k1,%r8d
1120
kmovb %k2,%edx
1121
shlb $4,%dl
1122
orb %dl,%r8b
1123
1124
vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1
1125
vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2
1126
kmovb %k1,%edx
1127
kmovb %k2,%ecx
1128
shlb $4,%cl
1129
orb %cl,%dl
1130
1131
vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k1
1132
vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2
1133
kmovb %k1,%ecx
1134
kmovb %k2,%ebx
1135
shlb $4,%bl
1136
orb %bl,%cl
1137
1138
vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k1
1139
vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k2
1140
kmovb %k1,%ebx
1141
kmovb %k2,%eax
1142
shlb $4,%al
1143
orb %al,%bl
1144
1145
addb %r9b,%r14b
1146
adcb %r8b,%r13b
1147
adcb %dl,%r12b
1148
adcb %cl,%r11b
1149
adcb %bl,%r10b
1150
1151
xorb %r9b,%r14b
1152
xorb %r8b,%r13b
1153
xorb %dl,%r12b
1154
xorb %cl,%r11b
1155
xorb %bl,%r10b
1156
1157
kmovb %r14d,%k1
1158
shrb $4,%r14b
1159
kmovb %r14d,%k2
1160
kmovb %r13d,%k3
1161
shrb $4,%r13b
1162
kmovb %r13d,%k4
1163
kmovb %r12d,%k5
1164
shrb $4,%r12b
1165
kmovb %r12d,%k6
1166
kmovb %r11d,%k7
1167
1168
vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k1}
1169
vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k2}
1170
vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k3}
1171
vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k4}
1172
vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k5}
1173
vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k6}
1174
vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k7}
1175
1176
vpandq .Lmask52x4(%rip),%ymm13,%ymm13
1177
vpandq .Lmask52x4(%rip),%ymm14,%ymm14
1178
vpandq .Lmask52x4(%rip),%ymm15,%ymm15
1179
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
1180
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
1181
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
1182
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
1183
1184
shrb $4,%r11b
1185
kmovb %r11d,%k1
1186
kmovb %r10d,%k2
1187
shrb $4,%r10b
1188
kmovb %r10d,%k3
1189
1190
vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k1}
1191
vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k2}
1192
vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k3}
1193
1194
vpandq .Lmask52x4(%rip),%ymm20,%ymm20
1195
vpandq .Lmask52x4(%rip),%ymm21,%ymm21
1196
vpandq .Lmask52x4(%rip),%ymm22,%ymm22
1197
1198
/* Store chain 0 to bytes 0..319 of the output buffer. */
vmovdqu64 %ymm3,0(%rdi)
1199
vmovdqu64 %ymm4,32(%rdi)
1200
vmovdqu64 %ymm5,64(%rdi)
1201
vmovdqu64 %ymm6,96(%rdi)
1202
vmovdqu64 %ymm7,128(%rdi)
1203
vmovdqu64 %ymm8,160(%rdi)
1204
vmovdqu64 %ymm9,192(%rdi)
1205
vmovdqu64 %ymm10,224(%rdi)
1206
vmovdqu64 %ymm11,256(%rdi)
1207
vmovdqu64 %ymm12,288(%rdi)
1208
1209
/* Store chain 1 to bytes 320..639 of the output buffer. */
vmovdqu64 %ymm13,320(%rdi)
1210
vmovdqu64 %ymm14,352(%rdi)
1211
vmovdqu64 %ymm15,384(%rdi)
1212
vmovdqu64 %ymm16,416(%rdi)
1213
vmovdqu64 %ymm17,448(%rdi)
1214
vmovdqu64 %ymm18,480(%rdi)
1215
vmovdqu64 %ymm19,512(%rdi)
1216
vmovdqu64 %ymm20,544(%rdi)
1217
vmovdqu64 %ymm21,576(%rdi)
1218
vmovdqu64 %ymm22,608(%rdi)
1219
1220
vzeroupper
1221
/* Epilogue: reload the six saved registers through rax, then release 48 bytes of stack. */
leaq (%rsp),%rax
1222
.cfi_def_cfa_register %rax
1223
movq 0(%rax),%r15
1224
.cfi_restore %r15
1225
movq 8(%rax),%r14
1226
.cfi_restore %r14
1227
movq 16(%rax),%r13
1228
.cfi_restore %r13
1229
movq 24(%rax),%r12
1230
.cfi_restore %r12
1231
movq 32(%rax),%rbp
1232
.cfi_restore %rbp
1233
movq 40(%rax),%rbx
1234
.cfi_restore %rbx
1235
leaq 48(%rax),%rsp
1236
.cfi_def_cfa %rsp,8
1237
.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
1238
/* Bytes f3 c3 encode "rep ret". */
.byte 0xf3,0xc3
1239
.cfi_endproc
1240
.size ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
1241
.text

.align 32
.globl ossl_extract_multiplier_2x40_win5
.type ossl_extract_multiplier_2x40_win5,@function
/*
 * ossl_extract_multiplier_2x40_win5
 *
 * Selects two 320-byte halves out of a table of 640-byte entries and
 * writes them to a 640-byte output buffer.
 *
 * In:
 *   %rdi  output buffer; bytes 0..639 are written
 *   %rsi  table base; 20480 bytes (32 entries of 640 bytes) are read
 *   %rdx  index of the entry whose bytes   0..319 go to output   0..319
 *   %rcx  index of the entry whose bytes 320..639 go to output 320..639
 *
 * Clobbers: %rax, %rsi, %r10, %ymm0-%ymm5, %ymm16-%ymm24, %k1, flags.
 *
 * Both passes load every one of the 32 entries and pick the wanted one
 * with a masked blend, so the sequence of loads does not depend on the
 * index values.
 *
 * The accumulators are zeroed only once, before the first pass. If %rdx
 * matches none of 0..31 the first half of the output is zero; if %rcx
 * matches none of 0..31 the second half repeats the first-pass result.
 *
 * NOTE(review): the function returns without vzeroupper although it
 * writes %ymm0-%ymm5 -- confirm this is intended.
 */
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
    .byte 243,15,30,250                     /* endbr64 */
    vmovdqa64 .Lones(%rip),%ymm24           /* step added to the running index */
    vpbroadcastq %rdx,%ymm22                /* wanted index, first half, all lanes */
    vpbroadcastq %rcx,%ymm23                /* wanted index, second half, all lanes */
    leaq 20480(%rsi),%rax                   /* table end = base + 32 * 640 */

    movq %rsi,%r10                          /* keep table base for the second pass */

    /* Zero the ten 256-bit accumulators (VEX xmm write clears all of ymm0). */
    vpxor %xmm0,%xmm0,%xmm0
    vmovdqa64 %ymm0,%ymm1
    vmovdqa64 %ymm0,%ymm2
    vmovdqa64 %ymm0,%ymm3
    vmovdqa64 %ymm0,%ymm4
    vmovdqa64 %ymm0,%ymm5
    vmovdqa64 %ymm0,%ymm16
    vmovdqa64 %ymm0,%ymm17
    vmovdqa64 %ymm0,%ymm18
    vmovdqa64 %ymm0,%ymm19

    /* Pass 1: bytes 0..319 of each entry, selected by the %rdx index. */
    vpxorq %ymm21,%ymm21,%ymm21             /* running index = 0 */
.align 32
.Lloop_0:
    vpcmpq $0,%ymm21,%ymm22,%k1             /* k1 = (wanted == running), per lane */
    vmovdqu64 0(%rsi),%ymm20
    vpblendmq %ymm20,%ymm0,%ymm0{%k1}       /* take the loaded data where k1 is set */
    vmovdqu64 32(%rsi),%ymm20
    vpblendmq %ymm20,%ymm1,%ymm1{%k1}
    vmovdqu64 64(%rsi),%ymm20
    vpblendmq %ymm20,%ymm2,%ymm2{%k1}
    vmovdqu64 96(%rsi),%ymm20
    vpblendmq %ymm20,%ymm3,%ymm3{%k1}
    vmovdqu64 128(%rsi),%ymm20
    vpblendmq %ymm20,%ymm4,%ymm4{%k1}
    vmovdqu64 160(%rsi),%ymm20
    vpblendmq %ymm20,%ymm5,%ymm5{%k1}
    vmovdqu64 192(%rsi),%ymm20
    vpblendmq %ymm20,%ymm16,%ymm16{%k1}
    vmovdqu64 224(%rsi),%ymm20
    vpblendmq %ymm20,%ymm17,%ymm17{%k1}
    vmovdqu64 256(%rsi),%ymm20
    vpblendmq %ymm20,%ymm18,%ymm18{%k1}
    vmovdqu64 288(%rsi),%ymm20
    vpblendmq %ymm20,%ymm19,%ymm19{%k1}
    vpaddq %ymm24,%ymm21,%ymm21             /* running index += 1 */
    addq $640,%rsi                          /* next table entry */
    cmpq %rsi,%rax
    jne .Lloop_0

    /* Store the first selected half to output bytes 0..319. */
    vmovdqu64 %ymm0,0(%rdi)
    vmovdqu64 %ymm1,32(%rdi)
    vmovdqu64 %ymm2,64(%rdi)
    vmovdqu64 %ymm3,96(%rdi)
    vmovdqu64 %ymm4,128(%rdi)
    vmovdqu64 %ymm5,160(%rdi)
    vmovdqu64 %ymm16,192(%rdi)
    vmovdqu64 %ymm17,224(%rdi)
    vmovdqu64 %ymm18,256(%rdi)
    vmovdqu64 %ymm19,288(%rdi)

    /* Pass 2: bytes 320..639 of each entry, selected by the %rcx index. */
    movq %r10,%rsi                          /* rewind to the table base */
    vpxorq %ymm21,%ymm21,%ymm21             /* running index = 0 */
.align 32
.Lloop_320:
    vpcmpq $0,%ymm21,%ymm23,%k1             /* k1 = (wanted == running), per lane */
    vmovdqu64 320(%rsi),%ymm20
    vpblendmq %ymm20,%ymm0,%ymm0{%k1}
    vmovdqu64 352(%rsi),%ymm20
    vpblendmq %ymm20,%ymm1,%ymm1{%k1}
    vmovdqu64 384(%rsi),%ymm20
    vpblendmq %ymm20,%ymm2,%ymm2{%k1}
    vmovdqu64 416(%rsi),%ymm20
    vpblendmq %ymm20,%ymm3,%ymm3{%k1}
    vmovdqu64 448(%rsi),%ymm20
    vpblendmq %ymm20,%ymm4,%ymm4{%k1}
    vmovdqu64 480(%rsi),%ymm20
    vpblendmq %ymm20,%ymm5,%ymm5{%k1}
    vmovdqu64 512(%rsi),%ymm20
    vpblendmq %ymm20,%ymm16,%ymm16{%k1}
    vmovdqu64 544(%rsi),%ymm20
    vpblendmq %ymm20,%ymm17,%ymm17{%k1}
    vmovdqu64 576(%rsi),%ymm20
    vpblendmq %ymm20,%ymm18,%ymm18{%k1}
    vmovdqu64 608(%rsi),%ymm20
    vpblendmq %ymm20,%ymm19,%ymm19{%k1}
    vpaddq %ymm24,%ymm21,%ymm21             /* running index += 1 */
    addq $640,%rsi                          /* next table entry */
    cmpq %rsi,%rax
    jne .Lloop_320

    /* Store the second selected half to output bytes 320..639. */
    vmovdqu64 %ymm0,320(%rdi)
    vmovdqu64 %ymm1,352(%rdi)
    vmovdqu64 %ymm2,384(%rdi)
    vmovdqu64 %ymm3,416(%rdi)
    vmovdqu64 %ymm4,448(%rdi)
    vmovdqu64 %ymm5,480(%rdi)
    vmovdqu64 %ymm16,512(%rdi)
    vmovdqu64 %ymm17,544(%rdi)
    vmovdqu64 %ymm18,576(%rdi)
    vmovdqu64 %ymm19,608(%rdi)

    .byte 0xf3,0xc3                         /* rep ret */
.cfi_endproc
.size ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
1349
.section .rodata
.align 32
/*
 * Four 64-bit lanes holding 1. ossl_extract_multiplier_2x40_win5 loads
 * this as one 256-bit vector and adds it to its per-lane running index
 * on every loop iteration.
 */
.Lones:
.quad 1,1,1,1
/* Four 64-bit lanes holding 0 (32 bytes). */
.Lzeros:
.quad 0,0,0,0
1355
/*
 * ELF note: header of three 32-bit words (name size, descriptor size,
 * note type), then the name, then the descriptor, each padded to an
 * 8-byte boundary by the .p2align 3 directives.
 *
 * Per the x86-64 psABI / Linux gABI extensions, note type 5 is
 * NT_GNU_PROPERTY_TYPE_0, property 0xc0000002 is
 * GNU_PROPERTY_X86_FEATURE_1_AND, and the value 3 sets the IBT and
 * SHSTK feature bits.
 */
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f                   /* name size: bytes between labels 0 and 1 */
.long 4f - 1f                   /* descriptor size: bytes between labels 1 and 4 */
.long 5                         /* note type */
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002                /* property type */
.long 3f - 2f                   /* property data size: bytes between labels 2 and 3 */
2:
.long 3                         /* property data */
3:
.p2align 3
4:
1376
1377