Path: blob/master/arch/x86/crypto/twofish-x86_64-asm_64.S
10817 views
/***************************************************************************
 *   Copyright (C) 2006 by Joachim Fritschi, <[email protected]>
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 ***************************************************************************/

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through cpp.
 * Target: x86-64.  Arguments arrive in %rdi, %rsi, %rdx.
 */

.file "twofish-x86_64-asm.S"
.text

#include <asm/asm-offsets.h>

/* Byte offsets of the four 32-bit words inside one 16-byte block */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets) */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/*
 * Register aliases to allow macro substitution.  Each working register
 * is named in its 64-bit, 32-bit (D), low-byte (B) and high-byte (H)
 * form so the round macros can paste the suffix onto the alias.
 */

#define R0	%rax
#define R0D	%eax
#define R0B	%al
#define R0H	%ah

#define R1	%rbx
#define R1D	%ebx
#define R1B	%bl
#define R1H	%bh

#define R2	%rcx
#define R2D	%ecx
#define R2B	%cl
#define R2H	%ch

#define R3	%rdx
#define R3D	%edx
#define R3B	%dl
#define R3H	%dh


/*
 * performs input whitening:
 * src ^= whitening key at w + offset inside the context
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening:
 * src ^= whitening key at w + 16 + offset inside the context
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * One encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's key pair relative to k
 *
 * %r11 must hold the context address.  %r8d collects the four table
 * lookups indexed by the bytes of b, %r9d those indexed by the bytes
 * of a; operations on a and b are interleaved to increase performance.
 * Afterwards %r9d = %r9d + %r8d, %r8d = %r8d + %r9d, the two round key
 * words are added, and the results are xored into c and d.
 *
 * On exit a is no longer rotated, b is rotated left by 1 (ror 16 +
 * ror 15) and c is rotated left by 15 after the xor, which is the
 * register state the next round expects when called with the (a,b)
 * and (c,d) pairs swapped.
 *
 * Clobbers: %rdi, %r8, %r9, flags.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * Final encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's key pair relative to k
 *
 * Same computation as encrypt_round, except that c is rotated right
 * by 1 after the xor and b receives no final rotation.
 * During the round a and b are prepared for the output whitening:
 * %r10 is loaded with b in its upper and a (after its rotation has
 * been undone) in its lower 32 bits.
 *
 * Clobbers: %rdi, %r8, %r9, %r10, flags.
 * The expansion has no trailing ';' - the call site supplies it.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * One decryption round.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's key pair relative to k
 *
 * %r11 must hold the context address.  %r9d collects the four table
 * lookups indexed by the bytes of a, %r8d those indexed by the bytes
 * of b; operations on a and b are interleaved to increase performance.
 * The sums and round key additions match encrypt_round; the results
 * are xored into c and d.
 *
 * On exit a is rotated left by 1 (ror 16 + ror 15), b is no longer
 * rotated and d is rotated left by 15 after the xor, which is the
 * register state the next round expects when called with the (a,b)
 * and (c,d) pairs swapped.
 *
 * Clobbers: %rdi, %r8, %r9, flags.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * Final decryption round.
 *
 * a input register containing a
 * b input register containing b (rotated 16, indexed exactly as in
 *   decrypt_round)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's key pair relative to k
 *
 * Same computation as decrypt_round, except that d is rotated right
 * by 1 after the xor and a receives no final rotation.
 * During the round a and b are prepared for the output whitening:
 * once the rotation of b has been undone, %r10 is loaded with b in
 * its upper and the still unrotated a in its lower 32 bits.
 *
 * Clobbers: %rdi, %r8, %r9, %r10, flags.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

.align 8
.global twofish_enc_blk
.global twofish_dec_blk
/* Mark both entry points as functions in the ELF symbol table */
.type twofish_enc_blk,@function
.type twofish_dec_blk,@function

/*
 * twofish_enc_blk - encrypt one 16-byte block.
 *
 * In:    %rdi = crypto tfm address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address (16 bytes are read)
 * Out:   %rax = 1
 * Saved: %rbx (R1) is pushed on entry and popped before return
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
twofish_enc_blk:
	pushq	R1

	/* %rdi contains the crypto tfm address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	add	$crypto_tfm_ctx_offset,	%rdi	/* set ctx address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves and whiten each half */
	movq	(R3),		R1
	movq	8(R3),		R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/* split into words: R0 = a (rotated 16), R1 = b, R2 = c, R3 = d (rol 1) */
	mov	R1D,		R0D
	rol	$16,		R0D
	shr	$32,		R1
	mov	R3D,		R2D
	shr	$32,		R3
	rol	$1,		R3D

	/* 16 rounds; the (a,b) and (c,d) register pairs swap roles each round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	/* first output half was packed into %r10 by the last round */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,		(%rsi)

	/* pack the second output half: R1 in the upper, R0 in the lower 32 bits */
	shl	$32,		R1
	xor	R0,		R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,		8(%rsi)

	popq	R1
	movq	$1,%rax
	ret
.size twofish_enc_blk,.-twofish_enc_blk

/*
 * twofish_dec_blk - decrypt one 16-byte block.
 *
 * In:    %rdi = crypto tfm address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address (16 bytes are read)
 * Out:   %rax = 1
 * Saved: %rbx (R1) is pushed on entry and popped before return
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
twofish_dec_blk:
	pushq	R1

	/* %rdi contains the crypto tfm address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	add	$crypto_tfm_ctx_offset,	%rdi	/* set ctx address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block and remove the whitening applied at the end of encryption */
	movq	(R3),		R1
	movq	8(R3),		R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/* split into words: R0 = a, R1 = b (rotated 16), R2 = c (rol 1), R3 = d */
	mov	R1D,		R0D
	shr	$32,		R1
	rol	$16,		R1D
	mov	R3D,		R2D
	shr	$32,		R3
	rol	$1,		R2D

	/* 16 rounds with the round keys taken in reverse order */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* first output half was packed into %r10 by the last round */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,		(%rsi)

	/* pack the second output half: R1 in the upper, R0 in the lower 32 bits */
	shl	$32,		R1
	xor	R0,		R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,		8(%rsi)

	popq	R1
	movq	$1,%rax
	ret
.size twofish_dec_blk,.-twofish_dec_blk