/* Path: blob/master/arch/x86/crypto/serpent-sse2-i586-asm_32.S */
/* 26424 views */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <[email protected]>
 *                2003 Herbert Valerio Riedel <[email protected]>
 */

#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text

/*
 * Byte offsets of the arguments relative to %esp at function entry.
 * Both entry points below read them before touching the stack pointer.
 */
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/

/*
 * Register roles
 *
 *   CTX      pointer to the key material read by get_key()
 *   RA..RE   five working registers; four hold the cipher state (one 32-bit
 *            word of each of the four blocks per register), the fifth is
 *            scratch.  Which register plays which role changes from stage
 *            to stage; see the argument lists in the round sequences.
 *   RT0/RT1  temporaries for round-key words and for transposing
 *   RNOT     all-ones constant (set with pcmpeqd at function entry);
 *            "pxor RNOT, x" therefore complements x
 */
#define CTX %edx

#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

#define RT0 %xmm5
#define RT1 %xmm6

#define RNOT %xmm7

/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from
 * CTX (word j of round key i) and replicate it into all four dword lanes
 * of t.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K(x0, x1, x2, x3, x4, i): XOR the four words of round key i into
 * x0..x3.  Clobbers x4, RT0 and RT1.
 */
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4, x0; \
	pxor RT0, x1; \
	pxor RT1, x2; \
	get_key(i, 3, x4); \
	pxor x4, x3;

/*
 * LK(x0, x1, x2, x3, x4, i): rotate/shift/XOR mixing of x0..x3 with the
 * XOR of round key i folded into its tail.  Per 32-bit lane:
 *
 *   x0 = rol(x0, 13);  x2 = rol(x2, 3);
 *   x1 = rol(x1 ^ x0 ^ x2, 1);
 *   x3 = rol(x3 ^ x2 ^ (x0 << 3), 7);
 *   x0 = rol(x0 ^ x1 ^ x3, 5);
 *   x2 = rol(x2 ^ x3 ^ (x1 << 7), 22);
 *   x0..x3 ^= round key i, words 0..3
 *
 * SSE2 has no packed rotate, so every rotate is a shift-left, a
 * shift-right of a copy, and a por.  Clobbers x4 and RT0.
 */
#define LK(x0, x1, x2, x3, x4, i) \
	/* x0 = rol(x0, 13) */ \
	movdqa x0, x4; \
	pslld $13, x0; \
	psrld $(32 - 13), x4; \
	por x4, x0; \
	pxor x0, x1; \
	/* x2 = rol(x2, 3) */ \
	movdqa x2, x4; \
	pslld $3, x2; \
	psrld $(32 - 3), x4; \
	por x4, x2; \
	pxor x2, x1; \
	/* x1 = rol(x1, 1) */ \
	movdqa x1, x4; \
	pslld $1, x1; \
	psrld $(32 - 1), x4; \
	por x4, x1; \
	/* x3 ^= x2 ^ (x0 << 3) */ \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x2, x3; \
	pxor x4, x3; \
	/* x3 = rol(x3, 7) */ \
	movdqa x3, x4; \
	pslld $7, x3; \
	psrld $(32 - 7), x4; \
	por x4, x3; \
	/* x0 ^= x1 ^ x3;  x2 ^= x3 ^ (x1 << 7) */ \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x3, x0; \
	pxor x3, x2; \
	pxor x4, x2; \
	/* x1 and x3 are final: add key words 1 and 3 while rotating x0 */ \
	movdqa x0, x4; \
	get_key(i, 1, RT0); \
	pxor RT0, x1; \
	get_key(i, 3, RT0); \
	pxor RT0, x3; \
	/* x0 = rol(x0, 5) */ \
	pslld $5, x0; \
	psrld $(32 - 5), x4; \
	por x4, x0; \
	/* x2 = rol(x2, 22) */ \
	movdqa x2, x4; \
	pslld $22, x2; \
	psrld $(32 - 22), x4; \
	por x4, x2; \
	/* add key words 0 and 2 */ \
	get_key(i, 0, RT0); \
	pxor RT0, x0; \
	get_key(i, 2, RT0); \
	pxor RT0, x2;

/*
 * KL(x0, x1, x2, x3, x4, i): counterpart of LK for decryption.  XORs
 * round key i into x0..x3 first (via K), then undoes the mixing with the
 * matching right-rotates, in reverse order.  Per 32-bit lane:
 *
 *   x0..x3 ^= round key i, words 0..3
 *   x0 = ror(x0, 5) ^ x3 ^ x1;
 *   x2 = ror(x2, 22) ^ x3 ^ (x1 << 7);
 *   x1 = ror(x1, 1);  x3 = ror(x3, 7);
 *   x1 ^= x0 ^ x2;  x3 ^= (x0 << 3) ^ x2;
 *   x0 = ror(x0, 13);  x2 = ror(x2, 3);
 *
 * Clobbers x4, RT0 and RT1.
 */
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	/* x0 = ror(x0, 5) */ \
	movdqa x0, x4; \
	psrld $5, x0; \
	pslld $(32 - 5), x4; \
	por x4, x0; \
	/* x2 = ror(x2, 22) */ \
	movdqa x2, x4; \
	psrld $22, x2; \
	pslld $(32 - 22), x4; \
	por x4, x2; \
	/* x2 ^= x3 ^ (x1 << 7);  x0 ^= x3 ^ x1 */ \
	pxor x3, x2; \
	pxor x3, x0; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x4, x2; \
	/* x1 = ror(x1, 1) */ \
	movdqa x1, x4; \
	psrld $1, x1; \
	pslld $(32 - 1), x4; \
	por x4, x1; \
	/* x3 = ror(x3, 7) */ \
	movdqa x3, x4; \
	psrld $7, x3; \
	pslld $(32 - 7), x4; \
	por x4, x3; \
	/* x1 ^= x0;  x3 ^= x0 << 3  (x0 not yet rotated back) */ \
	pxor x0, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x4, x3; \
	/* x0 = ror(x0, 13) */ \
	movdqa x0, x4; \
	psrld $13, x0; \
	pslld $(32 - 13), x4; \
	por x4, x0; \
	/* x1 ^= x2;  x3 ^= x2  (x2 not yet rotated back) */ \
	pxor x2, x1; \
	pxor x2, x3; \
	/* x2 = ror(x2, 3) */ \
	movdqa x2, x4; \
	psrld $3, x2; \
	pslld $(32 - 3), x4; \
	por x4, x2;

/*
 * S0..S7: the eight substitution stages used by the encryption rounds,
 * each written as a fixed sequence of pxor/pand/por (plus pxor with RNOT
 * for complement) over five registers.
 *
 * All five registers are overwritten.  On exit four of them hold the
 * result, in an order that differs from the input order and is specific
 * to each macro; the fifth is free.  The round sequence in
 * __serpent_enc_blk_4way encodes that order in the argument list of the
 * LK()/K() call that follows each S<n>() call.
 *
 * The instruction order inside each macro is significant; do not reorder.
 */
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2; \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1; \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2; \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4; \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1; \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0; \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

/*
 * SI0..SI7: the eight substitution stages used by the decryption rounds.
 * Same conventions as S0..S7: five registers in, all overwritten, result
 * left in four of them in a macro-specific order that the following
 * KL()/K() call in serpent_dec_blk_4way picks up through its argument
 * list.  Instruction order is significant.
 */
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0; \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2; \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2; \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1; \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2; \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3; \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0; \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of the
 * 4x4 matrix of 32-bit words held in x0..x3 (row r in xr); afterwards xr
 * holds what was column r.  Clobbers t1 and t2.  t0 is accepted but not
 * used by this macro.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/*
 * read_blocks(in, x0..x3, t0, t1, t2): load 4 * 16 bytes from `in` with
 * unaligned loads, then transpose so that xr holds 32-bit word r of each
 * of the four 16-byte blocks.  Clobbers t1 and t2.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0..x3, t0, t1, t2): transpose x0..x3 back to one
 * block per register and store 4 * 16 bytes to `out` with unaligned
 * stores.  Clobbers t1 and t2.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks(out, x0..x3, t0, t1, t2): like write_blocks, but each
 * 16-byte result is XORed with the 16 bytes already present at `out`
 * before being stored back.  Clobbers t0, t1 and t2.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

/*
 * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
 *
 * Reads 64 bytes from src, applies round key 0, runs 32 substitution
 * stages (S0..S7 repeated four times) with LK(1..31) between them and
 * K(32) after the last one, then either stores the 64-byte result to dst
 * or XORs it into the 64 bytes already at dst.
 *
 * Round-key words are read from ctx at byte offsets 16*i + 4*j for
 * i = 0..32, j = 0..3.
 *
 * Clobbers: %eax, %edx, %xmm0-%xmm7, flags.
 */
SYM_FUNC_START(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

	/* RNOT = all ones; used by the S-box macros for bitwise NOT */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * Each S<n>() leaves its result in a permuted set of registers; the
	 * argument order of the following LK()/K() names where the four
	 * state words now live and which register is free as scratch.
	 */
	K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	/* only the low byte of the xor argument is tested */
	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;

.L__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;
SYM_FUNC_END(__serpent_enc_blk_4way)

/*
 * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
 *
 * Reads 64 bytes from src, applies round key 32, runs 32 substitution
 * stages (SI7..SI0 repeated four times) with KL(31..1) between them and
 * K(0) after the last one, then stores the 64-byte result to dst.
 *
 * Round-key words are read from ctx at byte offsets 16*i + 4*j for
 * i = 0..32, j = 0..3.
 *
 * Clobbers: %eax, %edx, %xmm0-%xmm7.
 */
SYM_FUNC_START(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	/* RNOT = all ones; used by the S-box macros for bitwise NOT */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * As in the encryption path, the argument order of each KL()/K()
	 * names the registers in which the preceding SI<n>() left its result.
	 */
	K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	/* final state lives in RC, RD, RB, RE (in that word order) */
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	RET;
SYM_FUNC_END(serpent_dec_blk_4way)