/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <[email protected]>
 *                2003 Herbert Valerio Riedel <[email protected]>
 */

#include <linux/linkage.h>

.file "serpent-sse2-x86_64-asm_64.S"
.text

/* First argument register: base address the round-key words are read from. */
#define CTX %rdi

/**********************************************************************
  8-way SSE2 serpent
 **********************************************************************/

/*
 * Register allocation.
 *
 * Two independent groups of five registers are processed in lock-step.
 * read_blocks() loads four 16-byte rows into a group and transposes them, so
 * after loading each of R{A,B,C,D}n holds the same 32-bit word position of
 * four rows.  REn is the fifth (temporary) register that the S-box macros
 * rotate through the state.
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define RA2 %xmm5
#define RB2 %xmm6
#define RC2 %xmm7
#define RD2 %xmm8
#define RE2 %xmm9

/* All-ones constant (set with pcmpeqd at function entry); pxor RNOT == NOT. */
#define RNOT %xmm10

/*
 * Broadcast round-key words.  Also passed as scratch registers to
 * read_blocks()/write_blocks()/xor_blocks() while no key is live.
 */
#define RK0 %xmm11
#define RK1 %xmm12
#define RK2 %xmm13
#define RK3 %xmm14

/*
 * S-box macros.
 *
 * S<n>_1/S<n>_2 are used by the encryption routine and SI<n>_1/SI<n>_2 by the
 * decryption routine.  Each S-box is split into two halves so that SP() can
 * interleave key loads and the second register group between them.
 *
 * x0..x3 are the state registers, x4 is a temporary that is overwritten.
 * The result is left in a permuted set of the five registers; the callers
 * track that permutation through the argument order of the next macro.
 * Operands are AT&T order: "op src, dst".
 */
#define S0_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from CTX
 * and broadcast it to all four lanes of t.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K2: load the four words of subkey i into RK0..RK3 and xor them into
 * x0..x3 of both register groups.  x4 is unused.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	pxor RK0, x0 ## 1; \
	pxor RK1, x1 ## 1; \
	pxor RK2, x2 ## 1; \
	pxor RK3, x3 ## 1; \
	pxor RK0, x0 ## 2; \
	pxor RK1, x1 ## 2; \
	pxor RK2, x2 ## 2; \
	pxor RK3, x3 ## 2;

/*
 * LK2: linear mixing step followed by xor with subkey i, on both groups.
 *
 * SSE2 has no packed rotate, so every rotate is built from a copy into x4,
 * a left shift, a right shift and an or.  Per group the sequence computes:
 *
 *	x0 = rol(x0, 13);	x1 ^= x0;
 *	x2 = rol(x2, 3);	x1 ^= x2;
 *	x1 = rol(x1, 1);	x3 ^= x2 ^ (x0 << 3);
 *	x3 = rol(x3, 7);	x0 ^= x1 ^ x3;	x2 ^= x3 ^ (x1 << 7);
 *	x1 ^= RK1;		x3 ^= RK3;
 *	x0 = rol(x0, 5);	x2 = rol(x2, 22);
 *	x0 ^= RK0;		x2 ^= RK2;
 *
 * The work for group 1 and group 2 is interleaved, and the four get_key()
 * loads are spread between the arithmetic.  x4 is clobbered.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $13, x0 ## 1; \
	psrld $(32 - 13), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor x0 ## 1, x1 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	pslld $3, x2 ## 1; \
	psrld $(32 - 3), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor x2 ## 1, x1 ## 1; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $13, x0 ## 2; \
	psrld $(32 - 13), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	pxor x0 ## 2, x1 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	pslld $3, x2 ## 2; \
	psrld $(32 - 3), x4 ## 2; \
	por x4 ## 2, x2 ## 2; \
	pxor x2 ## 2, x1 ## 2; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $1, x1 ## 1; \
	psrld $(32 - 1), x4 ## 1; \
	por x4 ## 1, x1 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $3, x4 ## 1; \
	pxor x2 ## 1, x3 ## 1; \
	pxor x4 ## 1, x3 ## 1; \
	movdqa x3 ## 1, x4 ## 1; \
	get_key(i, 1, RK1); \
	movdqa x1 ## 2, x4 ## 2; \
	pslld $1, x1 ## 2; \
	psrld $(32 - 1), x4 ## 2; \
	por x4 ## 2, x1 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $3, x4 ## 2; \
	pxor x2 ## 2, x3 ## 2; \
	pxor x4 ## 2, x3 ## 2; \
	movdqa x3 ## 2, x4 ## 2; \
	get_key(i, 3, RK3); \
	pslld $7, x3 ## 1; \
	psrld $(32 - 7), x4 ## 1; \
	por x4 ## 1, x3 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $7, x4 ## 1; \
	pxor x1 ## 1, x0 ## 1; \
	pxor x3 ## 1, x0 ## 1; \
	pxor x3 ## 1, x2 ## 1; \
	pxor x4 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	pslld $7, x3 ## 2; \
	psrld $(32 - 7), x4 ## 2; \
	por x4 ## 2, x3 ## 2; \
	movdqa x1 ## 2, x4 ## 2; \
	pslld $7, x4 ## 2; \
	pxor x1 ## 2, x0 ## 2; \
	pxor x3 ## 2, x0 ## 2; \
	pxor x3 ## 2, x2 ## 2; \
	pxor x4 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	pxor RK1, x1 ## 1; \
	pxor RK3, x3 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $5, x0 ## 1; \
	psrld $(32 - 5), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	pslld $22, x2 ## 1; \
	psrld $(32 - 22), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor RK0, x0 ## 1; \
	pxor RK2, x2 ## 1; \
	pxor RK1, x1 ## 2; \
	pxor RK3, x3 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $5, x0 ## 2; \
	psrld $(32 - 5), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	pslld $22, x2 ## 2; \
	psrld $(32 - 22), x4 ## 2; \
	por x4 ## 2, x2 ## 2; \
	pxor RK0, x0 ## 2; \
	pxor RK2, x2 ## 2;

/*
 * KL2: xor with the subkey already held in RK0..RK3, then undo the linear
 * mixing step, on both groups.
 *
 * NOTE: this macro does not load any key itself and does not use its `i`
 * argument; it relies on the preceding SP() having filled RK0..RK3.
 *
 * Per group the sequence computes:
 *
 *	x0 ^= RK0;		x2 ^= RK2;
 *	x0 = ror(x0, 5);
 *	x3 ^= RK3;		x1 ^= RK1;
 *	x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);	x0 ^= x3 ^ x1;
 *	x1 = ror(x1, 1);	x3 = ror(x3, 7);
 *	x1 ^= x0;		x3 ^= (x0 << 3);
 *	x0 = ror(x0, 13);
 *	x1 ^= x2;		x3 ^= x2;
 *	x2 = ror(x2, 3);
 *
 * x4 is clobbered.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	pxor RK0, x0 ## 1; \
	pxor RK2, x2 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	psrld $5, x0 ## 1; \
	pslld $(32 - 5), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor RK3, x3 ## 1; \
	pxor RK1, x1 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	psrld $22, x2 ## 1; \
	pslld $(32 - 22), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor x3 ## 1, x2 ## 1; \
	pxor RK0, x0 ## 2; \
	pxor RK2, x2 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	psrld $5, x0 ## 2; \
	pslld $(32 - 5), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	pxor RK3, x3 ## 2; \
	pxor RK1, x1 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	psrld $22, x2 ## 2; \
	pslld $(32 - 22), x4 ## 2; \
	por x4 ## 2, x2 ## 2; \
	pxor x3 ## 2, x2 ## 2; \
	pxor x3 ## 1, x0 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $7, x4 ## 1; \
	pxor x1 ## 1, x0 ## 1; \
	pxor x4 ## 1, x2 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	psrld $1, x1 ## 1; \
	pslld $(32 - 1), x4 ## 1; \
	por x4 ## 1, x1 ## 1; \
	pxor x3 ## 2, x0 ## 2; \
	movdqa x1 ## 2, x4 ## 2; \
	pslld $7, x4 ## 2; \
	pxor x1 ## 2, x0 ## 2; \
	pxor x4 ## 2, x2 ## 2; \
	movdqa x1 ## 2, x4 ## 2; \
	psrld $1, x1 ## 2; \
	pslld $(32 - 1), x4 ## 2; \
	por x4 ## 2, x1 ## 2; \
	movdqa x3 ## 1, x4 ## 1; \
	psrld $7, x3 ## 1; \
	pslld $(32 - 7), x4 ## 1; \
	por x4 ## 1, x3 ## 1; \
	pxor x0 ## 1, x1 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $3, x4 ## 1; \
	pxor x4 ## 1, x3 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	movdqa x3 ## 2, x4 ## 2; \
	psrld $7, x3 ## 2; \
	pslld $(32 - 7), x4 ## 2; \
	por x4 ## 2, x3 ## 2; \
	pxor x0 ## 2, x1 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	pslld $3, x4 ## 2; \
	pxor x4 ## 2, x3 ## 2; \
	movdqa x0 ## 2, x4 ## 2; \
	psrld $13, x0 ## 1; \
	pslld $(32 - 13), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor x2 ## 1, x1 ## 1; \
	pxor x2 ## 1, x3 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	psrld $3, x2 ## 1; \
	pslld $(32 - 3), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	psrld $13, x0 ## 2; \
	pslld $(32 - 13), x4 ## 2; \
	por x4 ## 2, x0 ## 2; \
	pxor x2 ## 2, x1 ## 2; \
	pxor x2 ## 2, x3 ## 2; \
	movdqa x2 ## 2, x4 ## 2; \
	psrld $3, x2 ## 2; \
	pslld $(32 - 3), x4 ## 2; \
	por x4 ## 2, x2 ## 2;

/*
 * S: apply both halves of SBOX to group 1, then both halves to group 2.
 * Does not touch RK0..RK3.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP: like S, but interleaves the S-box halves of the two groups with the
 * loads of subkey i into RK0..RK3.  The loaded key is consumed by the KL2()
 * that follows.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 3, RK3); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4: transpose the 4x4 matrix of 32-bit words held in x0..x3
 * (one row per register), in place.  Clobbers t1 and t2; t0 is accepted for
 * signature symmetry with the callers but is never touched.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/*
 * read_blocks: load 4 x 16 bytes from `in` with unaligned loads, then
 * transpose so that each register holds one word position of all four rows.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks: transpose back to row order and store 4 x 16 bytes to `out`
 * with unaligned stores.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks: transpose back to row order, then xor each row into the 16
 * bytes already present at `out` (read-modify-write).  t0 holds the loaded
 * destination data.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

/*
 * __serpent_enc_blk_8way: encrypt eight 16-byte blocks (128 bytes) read from
 * src and either store the result to dst or xor it into dst.
 *
 * Clobbers %rax, %xmm0-%xmm14 and flags.  Uses no stack.
 */
SYM_FUNC_START(__serpent_enc_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	/* RNOT = all ones */
	pcmpeqd RNOT, RNOT;

	/* %rax = src + 64: second group of four blocks */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Initial key xor, 32 S-box applications each followed by linear
	 * mixing + key (LK2), with the last S-box followed by a plain key
	 * xor instead.  Register names rotate from step to step because each
	 * S-box leaves its output in a permuted set of registers.
	 */
	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	/* %rax = dst + 64: second group of four blocks */
	leaq (4*4*4)(%rsi), %rax;

	/* Only the low byte of the bool argument is examined. */
	testb %cl, %cl;
	jnz .L__enc_xor8;

	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;

.L__enc_xor8:
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk_8way)

/*
 * serpent_dec_blk_8way: decrypt eight 16-byte blocks (128 bytes) read from
 * src and store the result to dst.
 *
 * Clobbers %rax, %xmm0-%xmm14 and flags.  Uses no stack.
 */
SYM_FUNC_START(serpent_dec_blk_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* RNOT = all ones */
	pcmpeqd RNOT, RNOT;

	/* %rax = src + 64: second group of four blocks */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Subkeys are applied in descending order.  Each SP() loads the
	 * subkey that the following KL2() xors in; the final S-box uses S()
	 * because the closing K2() loads subkey 0 itself.
	 */
	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	/* %rax = dst + 64: second group of four blocks */
	leaq (4*4*4)(%rsi), %rax;
	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(serpent_dec_blk_8way)