GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/asm/neoscrypt_asm.S
/*
 * Copyright (c) 2014 John Doering <[email protected]>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef _MSC_VER
/* arch defines */
#include "miner.h"
#endif

#if defined(__GNUC__) && !defined(__arm__)
#define ASM 1
/* #define WIN64 0 */
#endif

#if (ASM) && (__x86_64__)

/* neoscrypt_blkcpy(dst, src, len) = SSE2 based block memcpy();
 * len must be a multiple of 64 bytes aligned properly */
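/* Reference only (not part of the build): a minimal C sketch of what the
 * routine below computes, assuming 16-byte aligned pointers and len a
 * multiple of 64; blkcpy_ref is an illustrative name.
 *
 *   static void blkcpy_ref(void *dst, const void *src, unsigned int len) {
 *       unsigned long long *d = dst;
 *       const unsigned long long *s = src;
 *       unsigned int i;
 *       for(i = 0; i < (len >> 3); i++)
 *           d[i] = s[i];
 *   }
 *
 * The assembly below moves 64 bytes per iteration through four XMM registers. */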
.globl neoscrypt_blkcpy
.globl _neoscrypt_blkcpy
neoscrypt_blkcpy:
_neoscrypt_blkcpy:
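/* Win64 passes the first integer arguments in rcx, rdx and r8, whereas the
 * System V AMD64 ABI used by the code below expects them in rdi, rsi and
 * rdx, and Win64 also treats rdi/rsi as callee saved. The #if (WIN64)
 * blocks therefore stash rdi/rsi in the volatile r10/r11, move the incoming
 * arguments into the System V registers, and restore rdi/rsi before
 * returning. The same shim is repeated in every routine of this section. */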
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkcpy:
    movdqa 0(%rsi), %xmm0
    movdqa 16(%rsi), %xmm1
    movdqa 32(%rsi), %xmm2
    movdqa 48(%rsi), %xmm3
    movdqa %xmm0, 0(%rdi)
    movdqa %xmm1, 16(%rdi)
    movdqa %xmm2, 32(%rdi)
    movdqa %xmm3, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkcpy
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_blkswp(blkA, blkB, len) = SSE2 based block swapper;
 * len must be a multiple of 64 bytes aligned properly */
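/* Reference only (not part of the build): the routine below swaps two
 * equally sized blocks in place, 64 bytes per iteration; blkswp_ref is an
 * illustrative name.
 *
 *   static void blkswp_ref(void *blkA, void *blkB, unsigned int len) {
 *       unsigned long long *a = blkA, *b = blkB, t;
 *       unsigned int i;
 *       for(i = 0; i < (len >> 3); i++) {
 *           t = a[i];
 *           a[i] = b[i];
 *           b[i] = t;
 *       }
 *   }
 */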
.globl neoscrypt_blkswp
.globl _neoscrypt_blkswp
neoscrypt_blkswp:
_neoscrypt_blkswp:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkswp:
    movdqa 0(%rdi), %xmm0
    movdqa 16(%rdi), %xmm1
    movdqa 32(%rdi), %xmm2
    movdqa 48(%rdi), %xmm3
    movdqa 0(%rsi), %xmm4
    movdqa 16(%rsi), %xmm5
    movdqa 32(%rsi), %xmm8
    movdqa 48(%rsi), %xmm9
    movdqa %xmm0, 0(%rsi)
    movdqa %xmm1, 16(%rsi)
    movdqa %xmm2, 32(%rsi)
    movdqa %xmm3, 48(%rsi)
    movdqa %xmm4, 0(%rdi)
    movdqa %xmm5, 16(%rdi)
    movdqa %xmm8, 32(%rdi)
    movdqa %xmm9, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkswp
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_blkxor(dst, src, len) = SSE2 based block XOR engine;
 * len must be a multiple of 64 bytes aligned properly */
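/* Reference only (not part of the build): the routine below XORs the source
 * block into the destination block; blkxor_ref is an illustrative name.
 *
 *   static void blkxor_ref(void *dst, const void *src, unsigned int len) {
 *       unsigned long long *d = dst;
 *       const unsigned long long *s = src;
 *       unsigned int i;
 *       for(i = 0; i < (len >> 3); i++)
 *           d[i] ^= s[i];
 *   }
 */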
.globl neoscrypt_blkxor
.globl _neoscrypt_blkxor
neoscrypt_blkxor:
_neoscrypt_blkxor:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkxor:
    movdqa 0(%rdi), %xmm0
    movdqa 16(%rdi), %xmm1
    movdqa 32(%rdi), %xmm2
    movdqa 48(%rdi), %xmm3
    movdqa 0(%rsi), %xmm4
    movdqa 16(%rsi), %xmm5
    movdqa 32(%rsi), %xmm8
    movdqa 48(%rsi), %xmm9
    pxor %xmm4, %xmm0
    pxor %xmm5, %xmm1
    pxor %xmm8, %xmm2
    pxor %xmm9, %xmm3
    movdqa %xmm0, 0(%rdi)
    movdqa %xmm1, 16(%rdi)
    movdqa %xmm2, 32(%rdi)
    movdqa %xmm3, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkxor
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_salsa(mem, rounds) = SSE2 based Salsa20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
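/* The SSE2 code keeps the 16-word state in four XMM registers using the
 * permuted "SSE2 map" order (see neoscrypt_salsa_tangle below), so every
 * shift/XOR step updates four state words at once and pshufd realigns the
 * rows between steps; the saved xmm12-xmm15 provide the final feed-forward
 * addition. Reference only (not part of the build): a scalar C sketch of
 * the Salsa20 core on the untangled layout, with illustrative helper names.
 *
 *   static unsigned int rotl32(unsigned int x, unsigned int c) {
 *       return((x << c) | (x >> (32 - c)));
 *   }
 *
 *   static void salsa_qr(unsigned int *x, int a, int b, int c, int d) {
 *       x[b] ^= rotl32(x[a] + x[d], 7);
 *       x[c] ^= rotl32(x[b] + x[a], 9);
 *       x[d] ^= rotl32(x[c] + x[b], 13);
 *       x[a] ^= rotl32(x[d] + x[c], 18);
 *   }
 *
 *   static void salsa_ref(unsigned int X[16], unsigned int rounds) {
 *       unsigned int t[16], i;
 *       for(i = 0; i < 16; i++) t[i] = X[i];
 *       for(i = 0; i < rounds; i += 2) {
 *           salsa_qr(t, 0, 4, 8, 12);   salsa_qr(t, 5, 9, 13, 1);
 *           salsa_qr(t, 10, 14, 2, 6);  salsa_qr(t, 15, 3, 7, 11);
 *           salsa_qr(t, 0, 1, 2, 3);    salsa_qr(t, 5, 6, 7, 4);
 *           salsa_qr(t, 10, 11, 8, 9);  salsa_qr(t, 15, 12, 13, 14);
 *       }
 *       for(i = 0; i < 16; i++) X[i] += t[i];
 *   }
 */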
.globl neoscrypt_salsa
.globl _neoscrypt_salsa
neoscrypt_salsa:
_neoscrypt_salsa:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    shrl $1, %ecx
    movdqa 0(%rdi), %xmm0
    movdqa %xmm0, %xmm12
    movdqa 16(%rdi), %xmm1
    movdqa %xmm1, %xmm13
    movdqa 32(%rdi), %xmm2
    movdqa %xmm2, %xmm14
    movdqa 48(%rdi), %xmm3
    movdqa %xmm3, %xmm15
.salsa:
    movdqa %xmm1, %xmm4
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm3
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm3, %xmm3
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm1
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm0
    pshufd $0x39, %xmm1, %xmm1
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm1
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm1, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm1, %xmm1
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm3
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    pshufd $0x39, %xmm3, %xmm3
    pxor %xmm5, %xmm0
    decl %ecx
    jnz .salsa

    paddd %xmm12, %xmm0
    movdqa %xmm0, 0(%rdi)
    paddd %xmm13, %xmm1
    movdqa %xmm1, 16(%rdi)
    paddd %xmm14, %xmm2
    movdqa %xmm2, 32(%rdi)
    paddd %xmm15, %xmm3
    movdqa %xmm3, 48(%rdi)
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_salsa_tangle(mem, count) = Salsa20 SSE2 map switcher;
 * correct map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 * SSE2 map: 0 5 10 15 12 1 6 11 8 13 2 7 4 9 14 3 */
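/* Converting between the two layouts is an involution, so switching either
 * way is the same six word swaps per 64-byte block. Reference only (not
 * part of the build); salsa_tangle_ref is an illustrative name.
 *
 *   static void salsa_tangle_ref(unsigned int *X, unsigned int count) {
 *       static const int p[6][2] =
 *           { {1, 5}, {2, 10}, {3, 15}, {4, 12}, {7, 11}, {9, 13} };
 *       unsigned int i, t;
 *       for(; count; count--, X += 16)
 *           for(i = 0; i < 6; i++) {
 *               t = X[p[i][0]];
 *               X[p[i][0]] = X[p[i][1]];
 *               X[p[i][1]] = t;
 *           }
 *   }
 */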
.globl neoscrypt_salsa_tangle
.globl _neoscrypt_salsa_tangle
neoscrypt_salsa_tangle:
_neoscrypt_salsa_tangle:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    movq $64, %r8
.salsa_tangle:
    movl 4(%rdi), %eax
    movl 20(%rdi), %edx
    movl %eax, 20(%rdi)
    movl %edx, 4(%rdi)
    movl 8(%rdi), %eax
    movl 40(%rdi), %edx
    movl %eax, 40(%rdi)
    movl %edx, 8(%rdi)
    movl 12(%rdi), %eax
    movl 60(%rdi), %edx
    movl %eax, 60(%rdi)
    movl %edx, 12(%rdi)
    movl 16(%rdi), %eax
    movl 48(%rdi), %edx
    movl %eax, 48(%rdi)
    movl %edx, 16(%rdi)
    movl 28(%rdi), %eax
    movl 44(%rdi), %edx
    movl %eax, 44(%rdi)
    movl %edx, 28(%rdi)
    movl 36(%rdi), %eax
    movl 52(%rdi), %edx
    movl %eax, 52(%rdi)
    movl %edx, 36(%rdi)
    addq %r8, %rdi
    decl %ecx
    jnz .salsa_tangle
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_chacha(mem, rounds) = SSE2 based ChaCha20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
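/* Each XMM register holds one row of the 4x4 ChaCha state, so the four
 * column quarter-rounds (and, after the pshufd re-diagonalisation, the four
 * diagonal quarter-rounds) run in parallel; the pshuflw/pshufhw pair is the
 * 16-bit rotate. Unlike Salsa20, no map switch is needed. Reference only
 * (not part of the build): a scalar C sketch with illustrative names,
 * using rotl32 as defined in the Salsa20 sketch above.
 *
 *   static void chacha_qr(unsigned int *a, unsigned int *b,
 *                         unsigned int *c, unsigned int *d) {
 *       *a += *b; *d = rotl32(*d ^ *a, 16);
 *       *c += *d; *b = rotl32(*b ^ *c, 12);
 *       *a += *b; *d = rotl32(*d ^ *a, 8);
 *       *c += *d; *b = rotl32(*b ^ *c, 7);
 *   }
 *
 *   static void chacha_ref(unsigned int X[16], unsigned int rounds) {
 *       unsigned int t[16], i;
 *       for(i = 0; i < 16; i++) t[i] = X[i];
 *       for(i = 0; i < rounds; i += 2) {
 *           chacha_qr(&t[0], &t[4], &t[8],  &t[12]);
 *           chacha_qr(&t[1], &t[5], &t[9],  &t[13]);
 *           chacha_qr(&t[2], &t[6], &t[10], &t[14]);
 *           chacha_qr(&t[3], &t[7], &t[11], &t[15]);
 *           chacha_qr(&t[0], &t[5], &t[10], &t[15]);
 *           chacha_qr(&t[1], &t[6], &t[11], &t[12]);
 *           chacha_qr(&t[2], &t[7], &t[8],  &t[13]);
 *           chacha_qr(&t[3], &t[4], &t[9],  &t[14]);
 *       }
 *       for(i = 0; i < 16; i++) X[i] += t[i];
 *   }
 */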
.globl neoscrypt_chacha
.globl _neoscrypt_chacha
neoscrypt_chacha:
_neoscrypt_chacha:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    shrl $1, %ecx
    movdqa 0(%rdi), %xmm0
    movdqa %xmm0, %xmm12
    movdqa 16(%rdi), %xmm1
    movdqa %xmm1, %xmm13
    movdqa 32(%rdi), %xmm2
    movdqa %xmm2, %xmm14
    movdqa 48(%rdi), %xmm3
    movdqa %xmm3, %xmm15
.chacha:
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x93, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x39, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x39, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x93, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    decl %ecx
    jnz .chacha

    paddd %xmm12, %xmm0
    movdqa %xmm0, 0(%rdi)
    paddd %xmm13, %xmm1
    movdqa %xmm1, 16(%rdi)
    paddd %xmm14, %xmm2
    movdqa %xmm2, 32(%rdi)
    paddd %xmm15, %xmm3
    movdqa %xmm3, 48(%rdi)
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

#endif /* (ASM) && (__x86_64__) */


#if (ASM) && (__i386__)

/* neoscrypt_blkcpy(dst, src, len) = SSE2 based block memcpy();
 * len must be a multiple of 64 bytes aligned properly */
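/* The 32-bit variants below are the same loops as the x86-64 versions
 * above; the differences are that the cdecl arguments are loaded from the
 * stack (12(%esp), 16(%esp), 20(%esp) after the two pushes) and that only
 * xmm0-xmm7 are available. */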
.globl neoscrypt_blkcpy
.globl _neoscrypt_blkcpy
neoscrypt_blkcpy:
_neoscrypt_blkcpy:
    pushl %edi
    pushl %esi
    movl 12(%esp), %edi
    movl 16(%esp), %esi
    movl 20(%esp), %ecx
    shrl $6, %ecx
    movl $64, %eax
.blkcpy:
    movdqa 0(%esi), %xmm0
    movdqa 16(%esi), %xmm1
    movdqa 32(%esi), %xmm2
    movdqa 48(%esi), %xmm3
    movdqa %xmm0, 0(%edi)
    movdqa %xmm1, 16(%edi)
    movdqa %xmm2, 32(%edi)
    movdqa %xmm3, 48(%edi)
    addl %eax, %edi
    addl %eax, %esi
    decl %ecx
    jnz .blkcpy

    popl %esi
    popl %edi
    ret


/* neoscrypt_blkswp(blkA, blkB, len) = SSE2 based block swapper;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkswp
.globl _neoscrypt_blkswp
neoscrypt_blkswp:
_neoscrypt_blkswp:
    pushl %edi
    pushl %esi
    movl 12(%esp), %edi
    movl 16(%esp), %esi
    movl 20(%esp), %ecx
    shrl $6, %ecx
    movl $64, %eax
.blkswp:
    movdqa 0(%edi), %xmm0
    movdqa 16(%edi), %xmm1
    movdqa 32(%edi), %xmm2
    movdqa 48(%edi), %xmm3
    movdqa 0(%esi), %xmm4
    movdqa 16(%esi), %xmm5
    movdqa 32(%esi), %xmm6
    movdqa 48(%esi), %xmm7
    movdqa %xmm0, 0(%esi)
    movdqa %xmm1, 16(%esi)
    movdqa %xmm2, 32(%esi)
    movdqa %xmm3, 48(%esi)
    movdqa %xmm4, 0(%edi)
    movdqa %xmm5, 16(%edi)
    movdqa %xmm6, 32(%edi)
    movdqa %xmm7, 48(%edi)
    addl %eax, %edi
    addl %eax, %esi
    decl %ecx
    jnz .blkswp

    popl %esi
    popl %edi
    ret


/* neoscrypt_blkxor(dst, src, len) = SSE2 based block XOR engine;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkxor
.globl _neoscrypt_blkxor
neoscrypt_blkxor:
_neoscrypt_blkxor:
    pushl %edi
    pushl %esi
    movl 12(%esp), %edi
    movl 16(%esp), %esi
    movl 20(%esp), %ecx
    shrl $6, %ecx
    movl $64, %eax
.blkxor:
    movdqa 0(%edi), %xmm0
    movdqa 16(%edi), %xmm1
    movdqa 32(%edi), %xmm2
    movdqa 48(%edi), %xmm3
    movdqa 0(%esi), %xmm4
    movdqa 16(%esi), %xmm5
    movdqa 32(%esi), %xmm6
    movdqa 48(%esi), %xmm7
    pxor %xmm4, %xmm0
    pxor %xmm5, %xmm1
    pxor %xmm6, %xmm2
    pxor %xmm7, %xmm3
    movdqa %xmm0, 0(%edi)
    movdqa %xmm1, 16(%edi)
    movdqa %xmm2, 32(%edi)
    movdqa %xmm3, 48(%edi)
    addl %eax, %edi
    addl %eax, %esi
    decl %ecx
    jnz .blkxor

    popl %esi
    popl %edi
    ret


/* neoscrypt_salsa(mem, rounds) = SSE2 based Salsa20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
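/* With only xmm0-xmm7 available in 32-bit mode, the original first two rows
 * are kept in xmm6/xmm7 while the third and fourth rows are spilled to
 * 32 bytes of stack (movdqu, since %esp is not realigned here) and reloaded
 * for the final feed-forward addition. */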
.globl neoscrypt_salsa
.globl _neoscrypt_salsa
neoscrypt_salsa:
_neoscrypt_salsa:
    movl 4(%esp), %edx
    movl 8(%esp), %ecx
    shrl $1, %ecx
    movdqa 0(%edx), %xmm0
    movdqa %xmm0, %xmm6
    movdqa 16(%edx), %xmm1
    movdqa %xmm1, %xmm7
    subl $32, %esp
    movdqa 32(%edx), %xmm2
    movdqu %xmm2, 0(%esp)
    movdqa 48(%edx), %xmm3
    movdqu %xmm3, 16(%esp)
.salsa:
    movdqa %xmm1, %xmm4
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm3
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm3, %xmm3
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm1
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm0
    pshufd $0x39, %xmm1, %xmm1
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm1
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm1, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm1, %xmm1
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm3
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    pshufd $0x39, %xmm3, %xmm3
    pxor %xmm5, %xmm0
    decl %ecx
    jnz .salsa

    paddd %xmm6, %xmm0
    movdqa %xmm0, 0(%edx)
    paddd %xmm7, %xmm1
    movdqa %xmm1, 16(%edx)
    movdqu 0(%esp), %xmm6
    paddd %xmm6, %xmm2
    movdqa %xmm2, 32(%edx)
    movdqu 16(%esp), %xmm7
    paddd %xmm7, %xmm3
    movdqa %xmm3, 48(%edx)
    addl $32, %esp
    ret


/* neoscrypt_salsa_tangle(mem, count) = Salsa20 SSE2 map switcher;
 * correct map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 * SSE2 map: 0 5 10 15 12 1 6 11 8 13 2 7 4 9 14 3 */
.globl neoscrypt_salsa_tangle
.globl _neoscrypt_salsa_tangle
neoscrypt_salsa_tangle:
_neoscrypt_salsa_tangle:
    pushl %ebx
    pushl %ebp
    movl 12(%esp), %ebp
    movl 16(%esp), %ecx
    movl $64, %ebx
.salsa_tangle:
    movl 4(%ebp), %eax
    movl 20(%ebp), %edx
    movl %eax, 20(%ebp)
    movl %edx, 4(%ebp)
    movl 8(%ebp), %eax
    movl 40(%ebp), %edx
    movl %eax, 40(%ebp)
    movl %edx, 8(%ebp)
    movl 12(%ebp), %eax
    movl 60(%ebp), %edx
    movl %eax, 60(%ebp)
    movl %edx, 12(%ebp)
    movl 16(%ebp), %eax
    movl 48(%ebp), %edx
    movl %eax, 48(%ebp)
    movl %edx, 16(%ebp)
    movl 28(%ebp), %eax
    movl 44(%ebp), %edx
    movl %eax, 44(%ebp)
    movl %edx, 28(%ebp)
    movl 36(%ebp), %eax
    movl 52(%ebp), %edx
    movl %eax, 52(%ebp)
    movl %edx, 36(%ebp)
    addl %ebx, %ebp
    decl %ecx
    jnz .salsa_tangle

    popl %ebp
    popl %ebx
    ret


/* neoscrypt_chacha(mem, rounds) = SSE2 based ChaCha20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
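/* As in the 32-bit Salsa20 routine above, the register shortage is handled
 * by keeping three of the original rows in xmm5-xmm7 and spilling the
 * fourth to 16 bytes of unaligned stack for the final feed-forward
 * addition. */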
.globl neoscrypt_chacha
.globl _neoscrypt_chacha
neoscrypt_chacha:
_neoscrypt_chacha:
    movl 4(%esp), %edx
    movl 8(%esp), %ecx
    shrl $1, %ecx
    movdqa 0(%edx), %xmm0
    movdqa %xmm0, %xmm5
    movdqa 16(%edx), %xmm1
    movdqa %xmm1, %xmm6
    movdqa 32(%edx), %xmm2
    movdqa %xmm2, %xmm7
    subl $16, %esp
    movdqa 48(%edx), %xmm3
    movdqu %xmm3, 0(%esp)
.chacha:
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x93, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x39, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x39, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x93, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    decl %ecx
    jnz .chacha

    paddd %xmm5, %xmm0
    movdqa %xmm0, 0(%edx)
    paddd %xmm6, %xmm1
    movdqa %xmm1, 16(%edx)
    paddd %xmm7, %xmm2
    movdqa %xmm2, 32(%edx)
    movdqu 0(%esp), %xmm7
    paddd %xmm7, %xmm3
    movdqa %xmm3, 48(%edx)
    addl $16, %esp
    ret

#endif /* (ASM) && (__i386__) */