Path: asm/neoscrypt_asm.S
/*
 * Copyright (c) 2014 John Doering <[email protected]>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef _MSC_VER
/* arch defines */
#include "miner.h"
#endif

#if defined(__GNUC__) && !defined(__arm__)
#define ASM 1
/* #define WIN64 0 */
#endif

#if (ASM) && (__x86_64__)

/* neoscrypt_blkcpy(dst, src, len) = SSE2 based block memcpy();
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkcpy
.globl _neoscrypt_blkcpy
neoscrypt_blkcpy:
_neoscrypt_blkcpy:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkcpy:
    movdqa 0(%rsi), %xmm0
    movdqa 16(%rsi), %xmm1
    movdqa 32(%rsi), %xmm2
    movdqa 48(%rsi), %xmm3
    movdqa %xmm0, 0(%rdi)
    movdqa %xmm1, 16(%rdi)
    movdqa %xmm2, 32(%rdi)
    movdqa %xmm3, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkcpy
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_blkswp(blkA, blkB, len) = SSE2 based block swapper;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkswp
.globl _neoscrypt_blkswp
neoscrypt_blkswp:
_neoscrypt_blkswp:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkswp:
    movdqa 0(%rdi), %xmm0
    movdqa 16(%rdi), %xmm1
    movdqa 32(%rdi), %xmm2
    movdqa 48(%rdi), %xmm3
    movdqa 0(%rsi), %xmm4
    movdqa 16(%rsi), %xmm5
    movdqa 32(%rsi), %xmm8
    movdqa 48(%rsi), %xmm9
    movdqa %xmm0, 0(%rsi)
    movdqa %xmm1, 16(%rsi)
    movdqa %xmm2, 32(%rsi)
    movdqa %xmm3, 48(%rsi)
    movdqa %xmm4, 0(%rdi)
    movdqa %xmm5, 16(%rdi)
    movdqa %xmm8, 32(%rdi)
    movdqa %xmm9, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkswp
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret


/* neoscrypt_blkxor(dst, src, len) = SSE2 based block XOR engine;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkxor
.globl _neoscrypt_blkxor
neoscrypt_blkxor:
_neoscrypt_blkxor:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkxor:
    movdqa 0(%rdi), %xmm0
    movdqa 16(%rdi), %xmm1
    movdqa 32(%rdi), %xmm2
    movdqa 48(%rdi), %xmm3
    movdqa 0(%rsi), %xmm4
    movdqa 16(%rsi), %xmm5
    movdqa 32(%rsi), %xmm8
    movdqa 48(%rsi), %xmm9
    pxor %xmm4, %xmm0
    pxor %xmm5, %xmm1
    pxor %xmm8, %xmm2
    pxor %xmm9, %xmm3
    movdqa %xmm0, 0(%rdi)
    movdqa %xmm1, 16(%rdi)
    movdqa %xmm2, 32(%rdi)
    movdqa %xmm3, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkxor
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

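/* For reference only: a plain C sketch of what the three block helpers above
 * are expected to compute, assuming 16-byte aligned buffers and len a
 * multiple of 64 as stated in their headers; neoscrypt_blkcpy() and
 * neoscrypt_blkswp() copy and swap the same 64-byte groups instead of
 * XORing them (illustrative sketch, not part of the build):
 *
 *     static void blkxor_c(void *dstp, const void *srcp, unsigned int len) {
 *         unsigned int i, *dst = (unsigned int *) dstp;
 *         const unsigned int *src = (const unsigned int *) srcp;
 *         for(i = 0; i < (len >> 2); i++)
 *             dst[i] ^= src[i];
 *     }
 */
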

/* neoscrypt_salsa(mem, rounds) = SSE2 based Salsa20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_salsa
.globl _neoscrypt_salsa
neoscrypt_salsa:
_neoscrypt_salsa:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    shrl $1, %ecx
    movdqa 0(%rdi), %xmm0
    movdqa %xmm0, %xmm12
    movdqa 16(%rdi), %xmm1
    movdqa %xmm1, %xmm13
    movdqa 32(%rdi), %xmm2
    movdqa %xmm2, %xmm14
    movdqa 48(%rdi), %xmm3
    movdqa %xmm3, %xmm15
.salsa:
    movdqa %xmm1, %xmm4
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm3
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm3, %xmm3
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm1
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm0
    pshufd $0x39, %xmm1, %xmm1
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm1
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm1, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm1, %xmm1
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm3
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    pshufd $0x39, %xmm3, %xmm3
    pxor %xmm5, %xmm0
    decl %ecx
    jnz .salsa

    paddd %xmm12, %xmm0
    movdqa %xmm0, 0(%rdi)
    paddd %xmm13, %xmm1
    movdqa %xmm1, 16(%rdi)
    paddd %xmm14, %xmm2
    movdqa %xmm2, 32(%rdi)
    paddd %xmm15, %xmm3
    movdqa %xmm3, 48(%rdi)
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

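/* For reference only: the scalar Salsa20 double round performed by one pass
 * of the .salsa loop above, written against the linear ("correct") word
 * order x[0..15]; the SSE2 code does the same work on the pre-tangled
 * layout and adds the saved input back in after the loop (illustrative
 * sketch, not part of the build):
 *
 *     #define ROTL32(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
 *
 *     static void salsa_doubleround_c(unsigned int x[16]) {
 *         x[ 4] ^= ROTL32(x[ 0] + x[12],  7);  x[ 8] ^= ROTL32(x[ 4] + x[ 0],  9);
 *         x[12] ^= ROTL32(x[ 8] + x[ 4], 13);  x[ 0] ^= ROTL32(x[12] + x[ 8], 18);
 *         x[ 9] ^= ROTL32(x[ 5] + x[ 1],  7);  x[13] ^= ROTL32(x[ 9] + x[ 5],  9);
 *         x[ 1] ^= ROTL32(x[13] + x[ 9], 13);  x[ 5] ^= ROTL32(x[ 1] + x[13], 18);
 *         x[14] ^= ROTL32(x[10] + x[ 6],  7);  x[ 2] ^= ROTL32(x[14] + x[10],  9);
 *         x[ 6] ^= ROTL32(x[ 2] + x[14], 13);  x[10] ^= ROTL32(x[ 6] + x[ 2], 18);
 *         x[ 3] ^= ROTL32(x[15] + x[11],  7);  x[ 7] ^= ROTL32(x[ 3] + x[15],  9);
 *         x[11] ^= ROTL32(x[ 7] + x[ 3], 13);  x[15] ^= ROTL32(x[11] + x[ 7], 18);
 *         x[ 1] ^= ROTL32(x[ 0] + x[ 3],  7);  x[ 2] ^= ROTL32(x[ 1] + x[ 0],  9);
 *         x[ 3] ^= ROTL32(x[ 2] + x[ 1], 13);  x[ 0] ^= ROTL32(x[ 3] + x[ 2], 18);
 *         x[ 6] ^= ROTL32(x[ 5] + x[ 4],  7);  x[ 7] ^= ROTL32(x[ 6] + x[ 5],  9);
 *         x[ 4] ^= ROTL32(x[ 7] + x[ 6], 13);  x[ 5] ^= ROTL32(x[ 4] + x[ 7], 18);
 *         x[11] ^= ROTL32(x[10] + x[ 9],  7);  x[ 8] ^= ROTL32(x[11] + x[10],  9);
 *         x[ 9] ^= ROTL32(x[ 8] + x[11], 13);  x[10] ^= ROTL32(x[ 9] + x[ 8], 18);
 *         x[12] ^= ROTL32(x[15] + x[14],  7);  x[13] ^= ROTL32(x[12] + x[15],  9);
 *         x[14] ^= ROTL32(x[13] + x[12], 13);  x[15] ^= ROTL32(x[14] + x[13], 18);
 *     }
 */
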

/* neoscrypt_salsa_tangle(mem, count) = Salsa20 SSE2 map switcher;
 * correct map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 * SSE2 map: 0 5 10 15 12 1 6 11 8 13 2 7 4 9 14 3 */
.globl neoscrypt_salsa_tangle
.globl _neoscrypt_salsa_tangle
neoscrypt_salsa_tangle:
_neoscrypt_salsa_tangle:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    movq $64, %r8
.salsa_tangle:
    movl 4(%rdi), %eax
    movl 20(%rdi), %edx
    movl %eax, 20(%rdi)
    movl %edx, 4(%rdi)
    movl 8(%rdi), %eax
    movl 40(%rdi), %edx
    movl %eax, 40(%rdi)
    movl %edx, 8(%rdi)
    movl 12(%rdi), %eax
    movl 60(%rdi), %edx
    movl %eax, 60(%rdi)
    movl %edx, 12(%rdi)
    movl 16(%rdi), %eax
    movl 48(%rdi), %edx
    movl %eax, 48(%rdi)
    movl %edx, 16(%rdi)
    movl 28(%rdi), %eax
    movl 44(%rdi), %edx
    movl %eax, 44(%rdi)
    movl %edx, 28(%rdi)
    movl 36(%rdi), %eax
    movl 52(%rdi), %edx
    movl %eax, 52(%rdi)
    movl %edx, 36(%rdi)
    addq %r8, %rdi
    decl %ecx
    jnz .salsa_tangle
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

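/* For reference only: a plain C sketch of the same map switch, using the
 * word table documented above; the map is its own inverse (six 2-cycles
 * plus four fixed words), so a single routine converts in both directions
 * (illustrative sketch, not part of the build):
 *
 *     static void salsa_tangle_c(unsigned int *X, unsigned int count) {
 *         static const unsigned char map[16] =
 *           { 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 };
 *         unsigned int i, j, t;
 *         for(; count; count--, X += 16)
 *             for(i = 0; i < 16; i++) {
 *                 j = map[i];
 *                 if(j > i) {
 *                     t = X[i]; X[i] = X[j]; X[j] = t;
 *                 }
 *             }
 *     }
 */
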

/* neoscrypt_chacha(mem, rounds) = SSE2 based ChaCha20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_chacha
.globl _neoscrypt_chacha
neoscrypt_chacha:
_neoscrypt_chacha:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    shrl $1, %ecx
    movdqa 0(%rdi), %xmm0
    movdqa %xmm0, %xmm12
    movdqa 16(%rdi), %xmm1
    movdqa %xmm1, %xmm13
    movdqa 32(%rdi), %xmm2
    movdqa %xmm2, %xmm14
    movdqa 48(%rdi), %xmm3
    movdqa %xmm3, %xmm15
.chacha:
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x93, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x39, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x39, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x93, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    decl %ecx
    jnz .chacha

    paddd %xmm12, %xmm0
    movdqa %xmm0, 0(%rdi)
    paddd %xmm13, %xmm1
    movdqa %xmm1, 16(%rdi)
    paddd %xmm14, %xmm2
    movdqa %xmm2, 32(%rdi)
    paddd %xmm15, %xmm3
    movdqa %xmm3, 48(%rdi)
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

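/* For reference only: the scalar ChaCha20 quarter round behind one pass of
 * the .chacha loop above; each loop iteration is one double round, i.e. a
 * column round followed by a diagonal round on the 4x4 word matrix whose
 * rows sit in %xmm0..%xmm3, with the 16-bit rotation done via
 * pshuflw/pshufhw and the saved input added back after the loop
 * (illustrative sketch, not part of the build; ROTL32 as in the Salsa20
 * note above):
 *
 *     #define CHACHA_QUARTER(a, b, c, d) \
 *         a += b; d ^= a; d = ROTL32(d, 16); \
 *         c += d; b ^= c; b = ROTL32(b, 12); \
 *         a += b; d ^= a; d = ROTL32(d,  8); \
 *         c += d; b ^= c; b = ROTL32(b,  7);
 */
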

#endif /* (ASM) && (__x86_64__) */


#if (ASM) && (__i386__)

/* neoscrypt_blkcpy(dst, src, len) = SSE2 based block memcpy();
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkcpy
.globl _neoscrypt_blkcpy
neoscrypt_blkcpy:
_neoscrypt_blkcpy:
    pushl %edi
    pushl %esi
    movl 12(%esp), %edi
    movl 16(%esp), %esi
    movl 20(%esp), %ecx
    shrl $6, %ecx
    movl $64, %eax
.blkcpy:
    movdqa 0(%esi), %xmm0
    movdqa 16(%esi), %xmm1
    movdqa 32(%esi), %xmm2
    movdqa 48(%esi), %xmm3
    movdqa %xmm0, 0(%edi)
    movdqa %xmm1, 16(%edi)
    movdqa %xmm2, 32(%edi)
    movdqa %xmm3, 48(%edi)
    addl %eax, %edi
    addl %eax, %esi
    decl %ecx
    jnz .blkcpy

    popl %esi
    popl %edi
    ret


/* neoscrypt_blkswp(blkA, blkB, len) = SSE2 based block swapper;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkswp
.globl _neoscrypt_blkswp
neoscrypt_blkswp:
_neoscrypt_blkswp:
    pushl %edi
    pushl %esi
    movl 12(%esp), %edi
    movl 16(%esp), %esi
    movl 20(%esp), %ecx
    shrl $6, %ecx
    movl $64, %eax
.blkswp:
    movdqa 0(%edi), %xmm0
    movdqa 16(%edi), %xmm1
    movdqa 32(%edi), %xmm2
    movdqa 48(%edi), %xmm3
    movdqa 0(%esi), %xmm4
    movdqa 16(%esi), %xmm5
    movdqa 32(%esi), %xmm6
    movdqa 48(%esi), %xmm7
    movdqa %xmm0, 0(%esi)
    movdqa %xmm1, 16(%esi)
    movdqa %xmm2, 32(%esi)
    movdqa %xmm3, 48(%esi)
    movdqa %xmm4, 0(%edi)
    movdqa %xmm5, 16(%edi)
    movdqa %xmm6, 32(%edi)
    movdqa %xmm7, 48(%edi)
    addl %eax, %edi
    addl %eax, %esi
    decl %ecx
    jnz .blkswp

    popl %esi
    popl %edi
    ret


/* neoscrypt_blkxor(dst, src, len) = SSE2 based block XOR engine;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkxor
.globl _neoscrypt_blkxor
neoscrypt_blkxor:
_neoscrypt_blkxor:
    pushl %edi
    pushl %esi
    movl 12(%esp), %edi
    movl 16(%esp), %esi
    movl 20(%esp), %ecx
    shrl $6, %ecx
    movl $64, %eax
.blkxor:
    movdqa 0(%edi), %xmm0
    movdqa 16(%edi), %xmm1
    movdqa 32(%edi), %xmm2
    movdqa 48(%edi), %xmm3
    movdqa 0(%esi), %xmm4
    movdqa 16(%esi), %xmm5
    movdqa 32(%esi), %xmm6
    movdqa 48(%esi), %xmm7
    pxor %xmm4, %xmm0
    pxor %xmm5, %xmm1
    pxor %xmm6, %xmm2
    pxor %xmm7, %xmm3
    movdqa %xmm0, 0(%edi)
    movdqa %xmm1, 16(%edi)
    movdqa %xmm2, 32(%edi)
    movdqa %xmm3, 48(%edi)
    addl %eax, %edi
    addl %eax, %esi
    decl %ecx
    jnz .blkxor

    popl %esi
    popl %edi
    ret


/* neoscrypt_salsa(mem, rounds) = SSE2 based Salsa20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_salsa
.globl _neoscrypt_salsa
neoscrypt_salsa:
_neoscrypt_salsa:
    movl 4(%esp), %edx
    movl 8(%esp), %ecx
    shrl $1, %ecx
    movdqa 0(%edx), %xmm0
    movdqa %xmm0, %xmm6
    movdqa 16(%edx), %xmm1
    movdqa %xmm1, %xmm7
    subl $32, %esp
    movdqa 32(%edx), %xmm2
    movdqu %xmm2, 0(%esp)
    movdqa 48(%edx), %xmm3
    movdqu %xmm3, 16(%esp)
.salsa:
    movdqa %xmm1, %xmm4
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm3
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm3, %xmm3
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm1
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm0
    pshufd $0x39, %xmm1, %xmm1
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm1
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm1, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm1, %xmm1
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm3
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    pshufd $0x39, %xmm3, %xmm3
    pxor %xmm5, %xmm0
    decl %ecx
    jnz .salsa

    paddd %xmm6, %xmm0
    movdqa %xmm0, 0(%edx)
    paddd %xmm7, %xmm1
    movdqa %xmm1, 16(%edx)
    movdqu 0(%esp), %xmm6
    paddd %xmm6, %xmm2
    movdqa %xmm2, 32(%edx)
    movdqu 16(%esp), %xmm7
    paddd %xmm7, %xmm3
    movdqa %xmm3, 48(%edx)
    addl $32, %esp
    ret


/* neoscrypt_salsa_tangle(mem, count) = Salsa20 SSE2 map switcher;
 * correct map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 * SSE2 map: 0 5 10 15 12 1 6 11 8 13 2 7 4 9 14 3 */
.globl neoscrypt_salsa_tangle
.globl _neoscrypt_salsa_tangle
neoscrypt_salsa_tangle:
_neoscrypt_salsa_tangle:
    pushl %ebx
    pushl %ebp
    movl 12(%esp), %ebp
    movl 16(%esp), %ecx
    movl $64, %ebx
.salsa_tangle:
    movl 4(%ebp), %eax
    movl 20(%ebp), %edx
    movl %eax, 20(%ebp)
    movl %edx, 4(%ebp)
    movl 8(%ebp), %eax
    movl 40(%ebp), %edx
    movl %eax, 40(%ebp)
    movl %edx, 8(%ebp)
    movl 12(%ebp), %eax
    movl 60(%ebp), %edx
    movl %eax, 60(%ebp)
    movl %edx, 12(%ebp)
    movl 16(%ebp), %eax
    movl 48(%ebp), %edx
    movl %eax, 48(%ebp)
    movl %edx, 16(%ebp)
    movl 28(%ebp), %eax
    movl 44(%ebp), %edx
    movl %eax, 44(%ebp)
    movl %edx, 28(%ebp)
    movl 36(%ebp), %eax
    movl 52(%ebp), %edx
    movl %eax, 52(%ebp)
    movl %edx, 36(%ebp)
    addl %ebx, %ebp
    decl %ecx
    jnz .salsa_tangle

    popl %ebp
    popl %ebx
    ret


/* neoscrypt_chacha(mem, rounds) = SSE2 based ChaCha20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_chacha
.globl _neoscrypt_chacha
neoscrypt_chacha:
_neoscrypt_chacha:
    movl 4(%esp), %edx
    movl 8(%esp), %ecx
    shrl $1, %ecx
    movdqa 0(%edx), %xmm0
    movdqa %xmm0, %xmm5
    movdqa 16(%edx), %xmm1
    movdqa %xmm1, %xmm6
    movdqa 32(%edx), %xmm2
    movdqa %xmm2, %xmm7
    subl $16, %esp
    movdqa 48(%edx), %xmm3
    movdqu %xmm3, 0(%esp)
.chacha:
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x93, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x39, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x39, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x93, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    decl %ecx
    jnz .chacha

    paddd %xmm5, %xmm0
    movdqa %xmm0, 0(%edx)
    paddd %xmm6, %xmm1
    movdqa %xmm1, 16(%edx)
    paddd %xmm7, %xmm2
    movdqa %xmm2, 32(%edx)
    movdqu 0(%esp), %xmm7
    paddd %xmm7, %xmm3
    movdqa %xmm3, 48(%edx)
    addl $16, %esp
    ret

#endif /* (ASM) && (__i386__) */
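
/* The routines above are plain leaf functions; a caller would declare them
 * along these lines (assumed prototypes for illustration only, the actual
 * declarations in the project's C sources may differ):
 *
 *     void neoscrypt_blkcpy(void *dst, const void *src, unsigned int len);
 *     void neoscrypt_blkswp(void *blkA, void *blkB, unsigned int len);
 *     void neoscrypt_blkxor(void *dst, const void *src, unsigned int len);
 *     void neoscrypt_salsa(unsigned int *mem, unsigned int rounds);
 *     void neoscrypt_salsa_tangle(unsigned int *mem, unsigned int count);
 *     void neoscrypt_chacha(unsigned int *mem, unsigned int rounds);
 */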