Path: blob/master/arch/x86/crypto/twofish-i586-asm_32.S
10817 views
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <[email protected]>            *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
***************************************************************************/

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor.  Target: 32-bit x86.  Both entry points read their three
 * arguments from the stack (see the in_blk/out_blk/tfm offsets below).
 */

.file "twofish-i586-asm.S"
.text

#include <asm/asm-offsets.h>

/*
 * Stack offsets of the arguments on function entry, relative to %esp.
 * return address at 0
 */

#define in_blk		12	/* input byte array address parameter */
#define out_blk		8	/* output byte array address parameter */
#define tfm		4	/* Twofish context structure */

/* Byte offsets of the four 32-bit words a, b, c, d inside a 16-byte block */

#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets from the ctx base) */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/*
 * Define a few register aliases to allow macro substitution: the round
 * macros paste D (32-bit), B (low byte) or H (second byte) onto R0..R3.
 */

#define R0D	%eax
#define R0B	%al
#define R0H	%ah

#define R1D	%ebx
#define R1B	%bl
#define R1H	%bh

#define R2D	%ecx
#define R2B	%cl
#define R2H	%ch

#define R3D	%edx
#define R3B	%dl
#define R3H	%dh


/*
 * performs input whitening:
 * xors whitening key word w[offset/4] (first four words of w) into src
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening:
 * xors whitening key word w[4 + offset/4] (second four words of w) into src
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;

/*
 * One encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round is the byte offset into k of this round's two key words
 * operations on a and b are interleaved to increase performance
 *
 * Expects %ebp to hold the context base address.
 * Uses %esi and %edi as scratch and one temporary stack slot (push/pop).
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * Final encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round is the byte offset into k of this round's two key words
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation:
 * b is rotated by a second 16 instead of 15, and c by ror $1 instead
 * of rol $15
 *
 * Expects %ebp to hold the context base address.
 * Uses %esi and %edi as scratch and one temporary stack slot (push/pop).
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * One decryption round.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round is the byte offset into k of this round's two key words
 * operations on a and b are interleaved to increase performance
 *
 * Expects %ebp to hold the context base address.
 * Uses %esi and %edi as scratch and one temporary stack slot (push/pop).
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;

/*
 * Final decryption round.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round is the byte offset into k of this round's two key words
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation:
 * a is rotated by a second 16 instead of 15, and d by ror $1 instead
 * of rol $15
 *
 * Expects %ebp to hold the context base address.
 * Uses %esi and %edi as scratch and one temporary stack slot (push/pop).
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;

.align 4
.global twofish_enc_blk
.global twofish_dec_blk
/* mark both entry points as function symbols in the ELF symbol table */
.type twofish_enc_blk, @function
.type twofish_dec_blk, @function

/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * Stack arguments on entry (return address at 0(%esp)):
 *   tfm(%esp)      address of the crypto tfm; the Twofish context starts
 *                  crypto_tfm_ctx_offset bytes into it
 *   out_blk(%esp)  address the four output words are stored to
 *   in_blk(%esp)   address the four input words are loaded from
 *
 * Returns 1 in %eax.
 * %ebp, %ebx, %esi and %edi are saved and restored; %ecx, %edx and the
 * flags are clobbered.
 */
twofish_enc_blk:
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	/* the four pushes above moved every argument 16 bytes further up */
	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax		/* encrypt_round expects a rotated by 16 */
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx		/* encrypt_round expects d already rol $1 */

	/* 16 rounds; the (a,b) and (c,d) register pairs swap roles each round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/*
	 * %eax/%ebx are whitened and stored as output words c/d,
	 * %ecx/%edx as output words a/b.
	 */
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
.size twofish_enc_blk, .-twofish_enc_blk

/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * Stack arguments on entry (return address at 0(%esp)):
 *   tfm(%esp)      address of the crypto tfm; the Twofish context starts
 *                  crypto_tfm_ctx_offset bytes into it
 *   out_blk(%esp)  address the four output words are stored to
 *   in_blk(%esp)   address the four input words are loaded from
 *
 * Returns 1 in %eax.
 * %ebp, %ebx, %esi and %edi are saved and restored; %ecx, %edx and the
 * flags are clobbered.
 */
twofish_dec_blk:
	push	%ebp			/* save registers according to calling convention */
	push	%ebx
	push	%esi
	push	%edi

	/* the four pushes above moved every argument 16 bytes further up */
	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	/* decryption starts by undoing the output whitening */
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx		/* decrypt_round expects b rotated by 16 */
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx		/* decrypt_round expects c already rol $1 */

	/* 16 rounds with the round keys in reverse order; pairs swap each round */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/*
	 * %eax/%ebx are whitened and stored as output words c/d,
	 * %ecx/%edx as output words a/b.
	 */
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
.size twofish_dec_blk, .-twofish_dec_blk