/*
 * Copyright 2011-2012, 2014 [email protected]
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <cpuminer-config.h>

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__i386__)

/* Copy a 64-byte block from \so(\src) to \do(\dest), permuting the words
 * so that each (wrapping) diagonal of the 4x4 Salsa20 state lands in one
 * contiguous 16-byte lane, the layout the SSE2 code below operates on. */
.macro scrypt_shuffle src, so, dest, do
	movl	\so+60(\src), %eax
	movl	\so+44(\src), %ebx
	movl	\so+28(\src), %ecx
	movl	\so+12(\src), %edx
	movl	%eax, \do+12(\dest)
	movl	%ebx, \do+28(\dest)
	movl	%ecx, \do+44(\dest)
	movl	%edx, \do+60(\dest)
	movl	\so+40(\src), %eax
	movl	\so+8(\src), %ebx
	movl	\so+48(\src), %ecx
	movl	\so+16(\src), %edx
	movl	%eax, \do+8(\dest)
	movl	%ebx, \do+40(\dest)
	movl	%ecx, \do+16(\dest)
	movl	%edx, \do+48(\dest)
	movl	\so+20(\src), %eax
	movl	\so+4(\src), %ebx
	movl	\so+52(\src), %ecx
	movl	\so+36(\src), %edx
	movl	%eax, \do+4(\dest)
	movl	%ebx, \do+20(\dest)
	movl	%ecx, \do+36(\dest)
	movl	%edx, \do+52(\dest)
	movl	\so+0(\src), %eax
	movl	\so+24(\src), %ebx
	movl	\so+32(\src), %ecx
	movl	\so+56(\src), %edx
	movl	%eax, \do+0(\dest)
	movl	%ebx, \do+24(\dest)
	movl	%ecx, \do+32(\dest)
	movl	%edx, \do+56(\dest)
.endm
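
/* For reference, the same permutation in C (a sketch with a hypothetical
 * helper name, not part of this file). The permutation is its own inverse,
 * which is why the one macro converts in both directions:
 *
 *	#include <stdint.h>
 *
 *	static void scrypt_shuffle_ref(uint32_t d[16], const uint32_t s[16])
 *	{
 *		static const int p[16] = {
 *			0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
 *		};
 *		int i;
 *		for (i = 0; i < 16; i++)
 *			d[i] = s[p[i]];	// d must not alias s
 *	}
 */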

/* Four rounds of Salsa20 (two column/row double rounds), fully unrolled,
 * over the 16-word state the caller spilled at 4(%esp)..64(%esp); the
 * offsets allow for the return address pushed by the call. Two uses of
 * this macro give the eight rounds of the Salsa20/8 core. */
.macro salsa8_core_gen_quadround
	movl	52(%esp), %ecx
	movl	4(%esp), %edx
	movl	20(%esp), %ebx
	movl	8(%esp), %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 4(%esp)
	movl	36(%esp), %edi
	leal	(%edx, %ebx), %ebp
	roll	$9, %ebp
	xorl	%ebp, %edi
	movl	24(%esp), %ebp
	movl	%edi, 8(%esp)
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	40(%esp), %ebx
	movl	%ecx, 20(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 24(%esp)
	movl	56(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 36(%esp)
	movl	28(%esp), %ecx
	movl	%edx, 28(%esp)
	movl	44(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	60(%esp), %ebx
	movl	%esi, 40(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 44(%esp)
	movl	12(%esp), %edi
	xorl	%esi, %ebp
	leal	(%edx, %ebx), %esi
	roll	$9, %esi
	xorl	%esi, %edi
	movl	%edi, 12(%esp)
	movl	48(%esp), %esi
	movl	%ebp, 48(%esp)
	movl	64(%esp), %ebp
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	32(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 32(%esp)
	movl	%ebx, %ecx
	movl	%edx, 52(%esp)
	movl	28(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	40(%esp), %ebx
	movl	%esi, 28(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 40(%esp)
	movl	12(%esp), %edi
	xorl	%esi, %ebp
	leal	(%edx, %ebx), %esi
	roll	$9, %esi
	xorl	%esi, %edi
	movl	%edi, 12(%esp)
	movl	4(%esp), %esi
	movl	%ebp, 4(%esp)
	movl	48(%esp), %ebp
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 48(%esp)
	movl	32(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 32(%esp)
	movl	24(%esp), %ecx
	movl	%edx, 24(%esp)
	movl	52(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	28(%esp), %ebx
	movl	%esi, 28(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 52(%esp)
	movl	8(%esp), %edi
	xorl	%esi, %ebp
	leal	(%edx, %ebx), %esi
	roll	$9, %esi
	xorl	%esi, %edi
	movl	%edi, 8(%esp)
	movl	44(%esp), %esi
	movl	%ebp, 44(%esp)
	movl	4(%esp), %ebp
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	20(%esp), %ebx
	movl	%ecx, 4(%esp)
	addl	%edi, %ecx
	roll	$18, %ecx
	leal	(%esi, %ebp), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	36(%esp), %edi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %edi
	movl	%edi, 20(%esp)
	movl	%ebx, %ecx
	movl	%edx, 36(%esp)
	movl	24(%esp), %edx
	addl	%edi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %esi
	movl	28(%esp), %ebx
	movl	%esi, 24(%esp)
	addl	%edi, %esi
	roll	$18, %esi
	leal	(%ecx, %edx), %edi
	roll	$7, %edi
	xorl	%edi, %ebx
	movl	%ebx, 28(%esp)
	xorl	%esi, %ebp
	movl	8(%esp), %esi
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	40(%esp), %edi
	movl	%ebp, 8(%esp)
	movl	44(%esp), %ebp
	movl	%esi, 40(%esp)
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	4(%esp), %ebx
	movl	%ecx, 44(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 4(%esp)
	movl	20(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 56(%esp)
	movl	48(%esp), %ecx
	movl	%edx, 20(%esp)
	movl	36(%esp), %edx
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	24(%esp), %ebx
	movl	%edi, 24(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	leal	(%ecx, %edx), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 60(%esp)
	movl	12(%esp), %esi
	xorl	%edi, %ebp
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	%esi, 12(%esp)
	movl	52(%esp), %edi
	movl	%ebp, 36(%esp)
	movl	8(%esp), %ebp
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	32(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 32(%esp)
	movl	%ebx, %ecx
	movl	%edx, 48(%esp)
	movl	20(%esp), %edx
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	24(%esp), %ebx
	movl	%edi, 20(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	leal	(%ecx, %edx), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 8(%esp)
	movl	12(%esp), %esi
	xorl	%edi, %ebp
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	%esi, 12(%esp)
	movl	28(%esp), %edi
	movl	%ebp, 52(%esp)
	movl	36(%esp), %ebp
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	16(%esp), %ebx
	movl	%ecx, 16(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 28(%esp)
	movl	32(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 32(%esp)
	movl	4(%esp), %ecx
	movl	%edx, 4(%esp)
	movl	48(%esp), %edx
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	20(%esp), %ebx
	movl	%edi, 20(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	leal	(%ecx, %edx), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 48(%esp)
	movl	40(%esp), %esi
	xorl	%edi, %ebp
	leal	(%edx, %ebx), %edi
	roll	$9, %edi
	xorl	%edi, %esi
	movl	%esi, 36(%esp)
	movl	60(%esp), %edi
	movl	%ebp, 24(%esp)
	movl	52(%esp), %ebp
	addl	%esi, %ebx
	roll	$13, %ebx
	xorl	%ebx, %ecx
	movl	44(%esp), %ebx
	movl	%ecx, 40(%esp)
	addl	%esi, %ecx
	roll	$18, %ecx
	leal	(%edi, %ebp), %esi
	roll	$7, %esi
	xorl	%esi, %ebx
	movl	%ebx, 52(%esp)
	movl	56(%esp), %esi
	xorl	%ecx, %edx
	leal	(%ebp, %ebx), %ecx
	roll	$9, %ecx
	xorl	%ecx, %esi
	movl	%esi, 56(%esp)
	addl	%esi, %ebx
	movl	%edx, 44(%esp)
	roll	$13, %ebx
	xorl	%ebx, %edi
	movl	%edi, 60(%esp)
	addl	%esi, %edi
	roll	$18, %edi
	xorl	%edi, %ebp
	movl	%ebp, 64(%esp)
.endm
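
/* The unrolled rotate/add/xor stream above is the standard Salsa20
 * operation. For reference, the same eight-round core in C (a sketch; note
 * that in this file the closing feed-forward additions are performed by the
 * caller via scrypt_core_macro2/macro3 below, not by salsa8_core_gen):
 *
 *	#include <stdint.h>
 *	#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
 *
 *	static void salsa20_8_ref(uint32_t B[16])
 *	{
 *		uint32_t x[16];
 *		int i;
 *		for (i = 0; i < 16; i++)
 *			x[i] = B[i];
 *		for (i = 0; i < 8; i += 2) {
 *			// column round
 *			x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
 *			x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
 *			x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
 *			x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
 *			x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
 *			x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
 *			x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
 *			x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
 *			// row round
 *			x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
 *			x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
 *			x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
 *			x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
 *			x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
 *			x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
 *			x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
 *			x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
 *		}
 *		for (i = 0; i < 16; i++)
 *			B[i] += x[i];	// feed-forward
 *	}
 */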

	.text
	.p2align 5
/* Salsa20/8 core over the state spilled by the caller; the feed-forward
 * additions are left to the caller (scrypt_core_macro2/macro3 below). */
salsa8_core_gen:
	salsa8_core_gen_quadround
	salsa8_core_gen_quadround
	ret


	.text
	.p2align 5
	.globl scrypt_core
	.globl _scrypt_core
scrypt_core:
_scrypt_core:
	pushl	%ebx
	pushl	%ebp
	pushl	%edi
	pushl	%esi

	/* Check for SSE2 availability (CPUID leaf 1, EDX bit 26) */
	movl	$1, %eax
	cpuid
	andl	$0x04000000, %edx
	jnz	scrypt_core_sse2
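
/* For reference, the equivalent check in C (a sketch assuming a GCC/Clang
 * build environment; have_sse2 is a hypothetical helper, the asm above does
 * the test inline):
 *
 *	#include <cpuid.h>
 *
 *	static int have_sse2(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *		// CPUID leaf 1: feature flags; SSE2 is EDX bit 26
 *		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
 *			return 0;
 *		return (edx >> 26) & 1;
 *	}
 */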

scrypt_core_gen:
	movl	20(%esp), %edi		/* X */
	movl	24(%esp), %esi		/* V */
	movl	28(%esp), %ecx		/* N */
	subl	$72, %esp

/* Save both 64-byte halves of X to the scratchpad, leaving X[p] ^ X[q]
 * in X and on the stack as input for salsa8_core_gen. */
.macro scrypt_core_macro1a p, q
	movl	\p(%edi), %eax
	movl	\q(%edi), %edx
	movl	%eax, \p(%esi)
	movl	%edx, \q(%esi)
	xorl	%edx, %eax
	movl	%eax, \p(%edi)
	movl	%eax, \p(%esp)
.endm

/* Like macro1a, but first xor in the scratchpad block selected by %edx
 * (loop 2's data-dependent read); nothing is written back to V. */
.macro scrypt_core_macro1b p, q
	movl	\p(%edi), %eax
	xorl	\p(%esi, %edx), %eax
	movl	\q(%edi), %ebx
	xorl	\q(%esi, %edx), %ebx
	movl	%ebx, \q(%edi)
	xorl	%ebx, %eax
	movl	%eax, \p(%edi)
	movl	%eax, \p(%esp)
.endm

/* Feed-forward the Salsa20/8 output into the first half, then xor the
 * result into the second half to form the next salsa8_core_gen input. */
.macro scrypt_core_macro2 p, q
	movl	\p(%esp), %eax
	addl	\p(%edi), %eax
	movl	%eax, \p(%edi)
	xorl	\q(%edi), %eax
	movl	%eax, \q(%edi)
	movl	%eax, \p(%esp)
.endm

/* Feed-forward the second Salsa20/8 output into the second half. */
.macro scrypt_core_macro3 p, q
	movl	\p(%esp), %eax
	addl	\q(%edi), %eax
	movl	%eax, \q(%edi)
.endm
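
/* One application of these macros around a salsa8_core_gen call computes
 * scrypt's xor_salsa8; in C (a sketch built on salsa20_8_ref from the
 * comment above):
 *
 *	static void xor_salsa8_ref(uint32_t B[16], const uint32_t Bx[16])
 *	{
 *		int i;
 *		for (i = 0; i < 16; i++)
 *			B[i] ^= Bx[i];	// macro1a/macro1b xor step
 *		salsa20_8_ref(B);	// rounds plus feed-forward (macro2/macro3)
 *	}
 */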

	shll	$7, %ecx
	addl	%esi, %ecx		/* %ecx = V + 128*N, end of scratchpad */
/* Loop 1: store X into V[i], then mix X with two xor-Salsa20/8 passes */
scrypt_core_gen_loop1:
	movl	%esi, 64(%esp)
	movl	%ecx, 68(%esp)

	scrypt_core_macro1a 0, 64
	scrypt_core_macro1a 4, 68
	scrypt_core_macro1a 8, 72
	scrypt_core_macro1a 12, 76
	scrypt_core_macro1a 16, 80
	scrypt_core_macro1a 20, 84
	scrypt_core_macro1a 24, 88
	scrypt_core_macro1a 28, 92
	scrypt_core_macro1a 32, 96
	scrypt_core_macro1a 36, 100
	scrypt_core_macro1a 40, 104
	scrypt_core_macro1a 44, 108
	scrypt_core_macro1a 48, 112
	scrypt_core_macro1a 52, 116
	scrypt_core_macro1a 56, 120
	scrypt_core_macro1a 60, 124

	call	salsa8_core_gen

	movl	92(%esp), %edi		/* reload X, clobbered by salsa8_core_gen */
	scrypt_core_macro2 0, 64
	scrypt_core_macro2 4, 68
	scrypt_core_macro2 8, 72
	scrypt_core_macro2 12, 76
	scrypt_core_macro2 16, 80
	scrypt_core_macro2 20, 84
	scrypt_core_macro2 24, 88
	scrypt_core_macro2 28, 92
	scrypt_core_macro2 32, 96
	scrypt_core_macro2 36, 100
	scrypt_core_macro2 40, 104
	scrypt_core_macro2 44, 108
	scrypt_core_macro2 48, 112
	scrypt_core_macro2 52, 116
	scrypt_core_macro2 56, 120
	scrypt_core_macro2 60, 124

	call	salsa8_core_gen

	movl	92(%esp), %edi
	scrypt_core_macro3 0, 64
	scrypt_core_macro3 4, 68
	scrypt_core_macro3 8, 72
	scrypt_core_macro3 12, 76
	scrypt_core_macro3 16, 80
	scrypt_core_macro3 20, 84
	scrypt_core_macro3 24, 88
	scrypt_core_macro3 28, 92
	scrypt_core_macro3 32, 96
	scrypt_core_macro3 36, 100
	scrypt_core_macro3 40, 104
	scrypt_core_macro3 44, 108
	scrypt_core_macro3 48, 112
	scrypt_core_macro3 52, 116
	scrypt_core_macro3 56, 120
	scrypt_core_macro3 60, 124

	movl	64(%esp), %esi
	movl	68(%esp), %ecx
	addl	$128, %esi
	cmpl	%ecx, %esi
	jne	scrypt_core_gen_loop1

	movl	96(%esp), %esi
	movl	100(%esp), %ecx
	movl	%ecx, %eax
	subl	$1, %eax
	movl	%eax, 100(%esp)		/* keep N-1 as the index mask */
/* Loop 2: N iterations of data-dependent scratchpad reads */
scrypt_core_gen_loop2:
	movl	%ecx, 68(%esp)

	movl	64(%edi), %edx
	andl	100(%esp), %edx		/* j = X[16] & (N-1) */
	shll	$7, %edx		/* byte offset of V[j] */

	scrypt_core_macro1b 0, 64
	scrypt_core_macro1b 4, 68
	scrypt_core_macro1b 8, 72
	scrypt_core_macro1b 12, 76
	scrypt_core_macro1b 16, 80
	scrypt_core_macro1b 20, 84
	scrypt_core_macro1b 24, 88
	scrypt_core_macro1b 28, 92
	scrypt_core_macro1b 32, 96
	scrypt_core_macro1b 36, 100
	scrypt_core_macro1b 40, 104
	scrypt_core_macro1b 44, 108
	scrypt_core_macro1b 48, 112
	scrypt_core_macro1b 52, 116
	scrypt_core_macro1b 56, 120
	scrypt_core_macro1b 60, 124

	call	salsa8_core_gen

	movl	92(%esp), %edi
	scrypt_core_macro2 0, 64
	scrypt_core_macro2 4, 68
	scrypt_core_macro2 8, 72
	scrypt_core_macro2 12, 76
	scrypt_core_macro2 16, 80
	scrypt_core_macro2 20, 84
	scrypt_core_macro2 24, 88
	scrypt_core_macro2 28, 92
	scrypt_core_macro2 32, 96
	scrypt_core_macro2 36, 100
	scrypt_core_macro2 40, 104
	scrypt_core_macro2 44, 108
	scrypt_core_macro2 48, 112
	scrypt_core_macro2 52, 116
	scrypt_core_macro2 56, 120
	scrypt_core_macro2 60, 124

	call	salsa8_core_gen

	movl	92(%esp), %edi
	movl	96(%esp), %esi
	scrypt_core_macro3 0, 64
	scrypt_core_macro3 4, 68
	scrypt_core_macro3 8, 72
	scrypt_core_macro3 12, 76
	scrypt_core_macro3 16, 80
	scrypt_core_macro3 20, 84
	scrypt_core_macro3 24, 88
	scrypt_core_macro3 28, 92
	scrypt_core_macro3 32, 96
	scrypt_core_macro3 36, 100
	scrypt_core_macro3 40, 104
	scrypt_core_macro3 44, 108
	scrypt_core_macro3 48, 112
	scrypt_core_macro3 52, 116
	scrypt_core_macro3 56, 120
	scrypt_core_macro3 60, 124

	movl	68(%esp), %ecx
	subl	$1, %ecx
	ja	scrypt_core_gen_loop2

	addl	$72, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%ebx
	ret
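
/* The generic path above computes the standard scrypt core; restated in C
 * (a sketch using xor_salsa8_ref from the previous comment; X is 32 words,
 * V is 32*N words, and N must be a power of two for the mask to work):
 *
 *	static void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
 *	{
 *		uint32_t i, j, k;
 *		for (i = 0; i < N; i++) {
 *			for (k = 0; k < 32; k++)
 *				V[i * 32 + k] = X[k];
 *			xor_salsa8_ref(&X[0], &X[16]);
 *			xor_salsa8_ref(&X[16], &X[0]);
 *		}
 *		for (i = 0; i < N; i++) {
 *			j = X[16] & (N - 1);	// 64(%edi) above
 *			for (k = 0; k < 32; k++)
 *				X[k] ^= V[j * 32 + k];
 *			xor_salsa8_ref(&X[0], &X[16]);
 *			xor_salsa8_ref(&X[16], &X[0]);
 *		}
 *	}
 */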


/* One Salsa20 double round on the shuffled (diagonal) layout produced by
 * scrypt_shuffle: %xmm0 = {x0,x5,x10,x15}, %xmm1 = {x12,x1,x6,x11},
 * %xmm2 = {x8,x13,x2,x7}, %xmm3 = {x4,x9,x14,x3}. Rotations are emulated
 * with shift pairs, and pshufd realigns the diagonals between half-rounds. */
.macro salsa8_core_sse2_doubleround
	movdqa	%xmm1, %xmm4
	paddd	%xmm0, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm0, %xmm4
	pxor	%xmm5, %xmm3

	paddd	%xmm3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm3, %xmm4
	pxor	%xmm5, %xmm2
	pshufd	$0x93, %xmm3, %xmm3

	paddd	%xmm2, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm2, %xmm4
	pxor	%xmm5, %xmm1
	pshufd	$0x4e, %xmm2, %xmm2

	paddd	%xmm1, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	movdqa	%xmm3, %xmm4
	pxor	%xmm5, %xmm0
	pshufd	$0x39, %xmm1, %xmm1

	paddd	%xmm0, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$7, %xmm4
	psrld	$25, %xmm5
	pxor	%xmm4, %xmm1
	movdqa	%xmm0, %xmm4
	pxor	%xmm5, %xmm1

	paddd	%xmm1, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$9, %xmm4
	psrld	$23, %xmm5
	pxor	%xmm4, %xmm2
	movdqa	%xmm1, %xmm4
	pxor	%xmm5, %xmm2
	pshufd	$0x93, %xmm1, %xmm1

	paddd	%xmm2, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$13, %xmm4
	psrld	$19, %xmm5
	pxor	%xmm4, %xmm3
	movdqa	%xmm2, %xmm4
	pxor	%xmm5, %xmm3
	pshufd	$0x4e, %xmm2, %xmm2

	paddd	%xmm3, %xmm4
	movdqa	%xmm4, %xmm5
	pslld	$18, %xmm4
	psrld	$14, %xmm5
	pxor	%xmm4, %xmm0
	pshufd	$0x39, %xmm3, %xmm3
	pxor	%xmm5, %xmm0
.endm
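
/* The same double round with SSE2 intrinsics (a sketch; ROTL128 and
 * salsa_doubleround_sse2 are hypothetical names). SSE2 has no vector
 * rotate, hence the shift/shift/or pair, and the shuffles keep each
 * diagonal in a fixed register between half-rounds:
 *
 *	#include <emmintrin.h>
 *
 *	#define ROTL128(x, c) \
 *		_mm_or_si128(_mm_slli_epi32((x), (c)), \
 *		             _mm_srli_epi32((x), 32 - (c)))
 *
 *	// x0 = {x0,x5,x10,x15}, x1 = {x12,x1,x6,x11},
 *	// x2 = {x8,x13,x2,x7},  x3 = {x4,x9,x14,x3}
 *	static void salsa_doubleround_sse2(__m128i *x0, __m128i *x1,
 *	                                   __m128i *x2, __m128i *x3)
 *	{
 *		__m128i t;
 *		t = _mm_add_epi32(*x0, *x1); *x3 = _mm_xor_si128(*x3, ROTL128(t, 7));
 *		t = _mm_add_epi32(*x3, *x0); *x2 = _mm_xor_si128(*x2, ROTL128(t, 9));
 *		t = _mm_add_epi32(*x2, *x3); *x1 = _mm_xor_si128(*x1, ROTL128(t, 13));
 *		t = _mm_add_epi32(*x1, *x2); *x0 = _mm_xor_si128(*x0, ROTL128(t, 18));
 *		*x3 = _mm_shuffle_epi32(*x3, 0x93);	// realign diagonals
 *		*x2 = _mm_shuffle_epi32(*x2, 0x4e);
 *		*x1 = _mm_shuffle_epi32(*x1, 0x39);
 *		t = _mm_add_epi32(*x0, *x3); *x1 = _mm_xor_si128(*x1, ROTL128(t, 7));
 *		t = _mm_add_epi32(*x1, *x0); *x2 = _mm_xor_si128(*x2, ROTL128(t, 9));
 *		t = _mm_add_epi32(*x2, *x1); *x3 = _mm_xor_si128(*x3, ROTL128(t, 13));
 *		t = _mm_add_epi32(*x3, *x2); *x0 = _mm_xor_si128(*x0, ROTL128(t, 18));
 *		*x1 = _mm_shuffle_epi32(*x1, 0x93);	// restore layout
 *		*x2 = _mm_shuffle_epi32(*x2, 0x4e);
 *		*x3 = _mm_shuffle_epi32(*x3, 0x39);
 *	}
 */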

.macro salsa8_core_sse2
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
.endm
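
/* Composed with the feed-forward that the loops below perform with paddd,
 * this macro corresponds to the following sketch (salsa20_8_sse2 is a
 * hypothetical name; B is one 16-byte-aligned, already-shuffled block):
 *
 *	static void salsa20_8_sse2(__m128i B[4])
 *	{
 *		__m128i x0 = B[0], x1 = B[1], x2 = B[2], x3 = B[3];
 *		int i;
 *		for (i = 0; i < 4; i++)	// four double rounds = eight rounds
 *			salsa_doubleround_sse2(&x0, &x1, &x2, &x3);
 *		B[0] = _mm_add_epi32(B[0], x0);	// feed-forward
 *		B[1] = _mm_add_epi32(B[1], x1);
 *		B[2] = _mm_add_epi32(B[2], x2);
 *		B[3] = _mm_add_epi32(B[3], x3);
 *	}
 */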

	.p2align 5
scrypt_core_sse2:
	movl	20(%esp), %edi		/* X */
	movl	24(%esp), %esi		/* V */
	movl	%esp, %ebp
	subl	$128, %esp
	andl	$-16, %esp		/* 16-byte aligned scratch for the shuffled X */

	scrypt_shuffle %edi, 0, %esp, 0
	scrypt_shuffle %edi, 64, %esp, 64

	/* keep the upper two lanes of the second half resident in %xmm6/%xmm7 */
	movdqa	96(%esp), %xmm6
	movdqa	112(%esp), %xmm7

	movl	%esi, %edx
	movl	28(%ebp), %ecx		/* N */
	shll	$7, %ecx
	addl	%esi, %ecx		/* %ecx = V + 128*N */
scrypt_core_sse2_loop1:
	movdqa	0(%esp), %xmm0
	movdqa	16(%esp), %xmm1
	movdqa	32(%esp), %xmm2
	movdqa	48(%esp), %xmm3
	movdqa	64(%esp), %xmm4
	movdqa	80(%esp), %xmm5
	pxor	%xmm4, %xmm0
	pxor	%xmm5, %xmm1
	movdqa	%xmm0, 0(%edx)
	movdqa	%xmm1, 16(%edx)
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm2, 32(%edx)
	movdqa	%xmm3, 48(%edx)
	movdqa	%xmm4, 64(%edx)
	movdqa	%xmm5, 80(%edx)
	movdqa	%xmm6, 96(%edx)
	movdqa	%xmm7, 112(%edx)

	salsa8_core_sse2
	paddd	0(%edx), %xmm0
	paddd	16(%edx), %xmm1
	paddd	32(%edx), %xmm2
	paddd	48(%edx), %xmm3
	movdqa	%xmm0, 0(%esp)
	movdqa	%xmm1, 16(%esp)
	movdqa	%xmm2, 32(%esp)
	movdqa	%xmm3, 48(%esp)

	pxor	64(%esp), %xmm0
	pxor	80(%esp), %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
	salsa8_core_sse2
	paddd	64(%esp), %xmm0
	paddd	80(%esp), %xmm1
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)

	addl	$128, %edx
	cmpl	%ecx, %edx
	jne	scrypt_core_sse2_loop1

	movdqa	64(%esp), %xmm4
	movdqa	80(%esp), %xmm5

	movl	28(%ebp), %ecx
	movl	%ecx, %eax
	subl	$1, %eax		/* %eax = N-1 index mask */
scrypt_core_sse2_loop2:
	movd	%xmm4, %edx		/* j from the first word of the second half */
	movdqa	0(%esp), %xmm0
	movdqa	16(%esp), %xmm1
	movdqa	32(%esp), %xmm2
	movdqa	48(%esp), %xmm3
	andl	%eax, %edx
	shll	$7, %edx
	pxor	0(%esi, %edx), %xmm0
	pxor	16(%esi, %edx), %xmm1
	pxor	32(%esi, %edx), %xmm2
	pxor	48(%esi, %edx), %xmm3

	pxor	%xmm4, %xmm0
	pxor	%xmm5, %xmm1
	movdqa	%xmm0, 0(%esp)
	movdqa	%xmm1, 16(%esp)
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm2, 32(%esp)
	movdqa	%xmm3, 48(%esp)
	salsa8_core_sse2
	paddd	0(%esp), %xmm0
	paddd	16(%esp), %xmm1
	paddd	32(%esp), %xmm2
	paddd	48(%esp), %xmm3
	movdqa	%xmm0, 0(%esp)
	movdqa	%xmm1, 16(%esp)
	movdqa	%xmm2, 32(%esp)
	movdqa	%xmm3, 48(%esp)

	pxor	64(%esi, %edx), %xmm0
	pxor	80(%esi, %edx), %xmm1
	pxor	96(%esi, %edx), %xmm2
	pxor	112(%esi, %edx), %xmm3
	pxor	64(%esp), %xmm0
	pxor	80(%esp), %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
	salsa8_core_sse2
	paddd	64(%esp), %xmm0
	paddd	80(%esp), %xmm1
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm0, %xmm4
	movdqa	%xmm1, %xmm5
	movdqa	%xmm0, 64(%esp)
	movdqa	%xmm1, 80(%esp)

	subl	$1, %ecx
	ja	scrypt_core_sse2_loop2

	movdqa	%xmm6, 96(%esp)
	movdqa	%xmm7, 112(%esp)

	/* shuffle back to linear order (the permutation is an involution) */
	scrypt_shuffle %esp, 0, %edi, 0
	scrypt_shuffle %esp, 64, %edi, 64

	movl	%ebp, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%ebx
	ret

#endif