Path: blob/master/arch/x86/crypto/serpent-avx2-asm_64.S
26424 views
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

/*
 * Byte-reversal shuffle constant (15..0).  NOTE(review): no consumer is
 * visible in this file chunk — presumably used by code elsewhere (or a
 * leftover from a removed mode); confirm before removing.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/* Context pointer (first argument, System V AMD64 ABI). */
#define CTX %rdi

/* RNOT holds all-ones (set via vpcmpeqd) — used to implement bitwise NOT
 * as XOR with all-ones, since AVX2 has no vector NOT instruction. */
#define RNOT %ymm0
/* Scratch temporary used inside the S-box macros. */
#define tp  %ymm1

/*
 * Two parallel register sets (suffix 1 and 2).  Each ymm register holds
 * eight 32-bit words; after the read_blocks() transpose each register
 * carries the same word position of 8 different blocks, so the two sets
 * together process 16 blocks ("16way") in bitsliced fashion.
 */
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

/* Broadcast round-key words (shared by both register sets). */
#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

/* xmm aliases of the round-key registers. */
#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

/*
 * Serpent S-boxes S0..S7 and inverse S-boxes SI0..SI7, as bitsliced
 * boolean-instruction sequences.  Each S-box is split into two halves
 * (_1/_2) so the S()/SP() macros below can interleave the two register
 * sets (and key loads) between the halves.  Arguments x0..x3 are the
 * four data registers; x4 is scratch.  All use the shared temporary
 * `tp` and RNOT (all-ones) for NOT-via-XOR.
 */
#define S0_1(x0, x1, x2, x3, x4)      \
	vpor	x0,   x3,   tp; \
	vpxor	x3,   x0,   x0; \
	vpxor	x2,   x3,   x4; \
	vpxor	RNOT, x4,   x4; \
	vpxor	x1,   tp,   x3; \
	vpand	x0,   x1,   x1; \
	vpxor	x4,   x1,   x1; \
	vpxor	x0,   x2,   x2;
#define S0_2(x0, x1, x2, x3, x4)      \
	vpxor	x3,   x0,   x0; \
	vpor	x0,   x4,   x4; \
	vpxor	x2,   x0,   x0; \
	vpand	x1,   x2,   x2; \
	vpxor	x2,   x3,   x3; \
	vpxor	RNOT, x1,   x1; \
	vpxor	x4,   x2,   x2; \
	vpxor	x2,   x1,   x1;

#define S1_1(x0, x1, x2, x3, x4)      \
	vpxor	x0,   x1,   tp; \
	vpxor	x3,   x0,   x0; \
	vpxor	RNOT, x3,   x3; \
	vpand	tp,   x1,   x4; \
	vpor	tp,   x0,   x0; \
	vpxor	x2,   x3,   x3; \
	vpxor	x3,   x0,   x0; \
	vpxor	x3,   tp,   x1;
#define S1_2(x0, x1, x2, x3, x4)      \
	vpxor	x4,   x3,   x3; \
	vpor	x4,   x1,   x1; \
	vpxor	x2,   x4,   x4; \
	vpand	x0,   x2,   x2; \
	vpxor	x1,   x2,   x2; \
	vpor	x0,   x1,   x1; \
	vpxor	RNOT, x0,   x0; \
	vpxor	x2,   x0,   x0; \
	vpxor	x1,   x4,   x4;

#define S2_1(x0, x1, x2, x3, x4)      \
	vpxor	RNOT, x3,   x3; \
	vpxor	x0,   x1,   x1; \
	vpand	x2,   x0,   tp; \
	vpxor	x3,   tp,   tp; \
	vpor	x0,   x3,   x3; \
	vpxor	x1,   x2,   x2; \
	vpxor	x1,   x3,   x3; \
	vpand	tp,   x1,   x1;
#define S2_2(x0, x1, x2, x3, x4)      \
	vpxor	x2,   tp,   tp; \
	vpand	x3,   x2,   x2; \
	vpor	x1,   x3,   x3; \
	vpxor	RNOT, tp,   tp; \
	vpxor	tp,   x3,   x3; \
	vpxor	tp,   x0,   x4; \
	vpxor	x2,   tp,   x0; \
	vpor	x2,   x1,   x1;

#define S3_1(x0, x1, x2, x3, x4)      \
	vpxor	x3,   x1,   tp; \
	vpor	x0,   x3,   x3; \
	vpand	x0,   x1,   x4; \
	vpxor	x2,   x0,   x0; \
	vpxor	tp,   x2,   x2; \
	vpand	x3,   tp,   x1; \
	vpxor	x3,   x2,   x2; \
	vpor	x4,   x0,   x0; \
	vpxor	x3,   x4,   x4;
#define S3_2(x0, x1, x2, x3, x4)      \
	vpxor	x0,   x1,   x1; \
	vpand	x3,   x0,   x0; \
	vpand	x4,   x3,   x3; \
	vpxor	x2,   x3,   x3; \
	vpor	x1,   x4,   x4; \
	vpand	x1,   x2,   x2; \
	vpxor	x3,   x4,   x4; \
	vpxor	x3,   x0,   x0; \
	vpxor	x2,   x3,   x3;

#define S4_1(x0, x1, x2, x3, x4)      \
	vpand	x0,   x3,   tp; \
	vpxor	x3,   x0,   x0; \
	vpxor	x2,   tp,   tp; \
	vpor	x3,   x2,   x2; \
	vpxor	x1,   x0,   x0; \
	vpxor	tp,   x3,   x4; \
	vpor	x0,   x2,   x2; \
	vpxor	x1,   x2,   x2;
#define S4_2(x0, x1, x2, x3, x4)      \
	vpand	x0,   x1,   x1; \
	vpxor	x4,   x1,   x1; \
	vpand	x2,   x4,   x4; \
	vpxor	tp,   x2,   x2; \
	vpxor	x0,   x4,   x4; \
	vpor	x1,   tp,   x3; \
	vpxor	RNOT, x1,   x1; \
	vpxor	x0,   x3,   x3;

#define S5_1(x0, x1, x2, x3, x4)      \
	vpor	x0,   x1,   tp; \
	vpxor	tp,   x2,   x2; \
	vpxor	RNOT, x3,   x3; \
	vpxor	x0,   x1,   x4; \
	vpxor	x2,   x0,   x0; \
	vpand	x4,   tp,   x1; \
	vpor	x3,   x4,   x4; \
	vpxor	x0,   x4,   x4;
#define S5_2(x0, x1, x2, x3, x4)      \
	vpand	x3,   x0,   x0; \
	vpxor	x3,   x1,   x1; \
	vpxor	x2,   x3,   x3; \
	vpxor	x1,   x0,   x0; \
	vpand	x4,   x2,   x2; \
	vpxor	x2,   x1,   x1; \
	vpand	x0,   x2,   x2; \
	vpxor	x2,   x3,   x3;

#define S6_1(x0, x1, x2, x3, x4)      \
	vpxor	x0,   x3,   x3; \
	vpxor	x2,   x1,   tp; \
	vpxor	x0,   x2,   x2; \
	vpand	x3,   x0,   x0; \
	vpor	x3,   tp,   tp; \
	vpxor	RNOT, x1,   x4; \
	vpxor	tp,   x0,   x0; \
	vpxor	x2,   tp,   x1;
#define S6_2(x0, x1, x2, x3, x4)      \
	vpxor	x4,   x3,   x3; \
	vpxor	x0,   x4,   x4; \
	vpand	x0,   x2,   x2; \
	vpxor	x1,   x4,   x4; \
	vpxor	x3,   x2,   x2; \
	vpand	x1,   x3,   x3; \
	vpxor	x0,   x3,   x3; \
	vpxor	x2,   x1,   x1;

#define S7_1(x0, x1, x2, x3, x4)      \
	vpxor	RNOT, x1,   tp; \
	vpxor	RNOT, x0,   x0; \
	vpand	x2,   tp,   x1; \
	vpxor	x3,   x1,   x1; \
	vpor	tp,   x3,   x3; \
	vpxor	x2,   tp,   x4; \
	vpxor	x3,   x2,   x2; \
	vpxor	x0,   x3,   x3; \
	vpor	x1,   x0,   x0;
#define S7_2(x0, x1, x2, x3, x4)      \
	vpand	x0,   x2,   x2; \
	vpxor	x4,   x0,   x0; \
	vpxor	x3,   x4,   x4; \
	vpand	x0,   x3,   x3; \
	vpxor	x1,   x4,   x4; \
	vpxor	x4,   x2,   x2; \
	vpxor	x1,   x3,   x3; \
	vpor	x0,   x4,   x4; \
	vpxor	x1,   x4,   x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
	vpxor	x0,   x1,   x1; \
	vpor	x1,   x3,   tp; \
	vpxor	x1,   x3,   x4; \
	vpxor	RNOT, x0,   x0; \
	vpxor	tp,   x2,   x2; \
	vpxor	x0,   tp,   x3; \
	vpand	x1,   x0,   x0; \
	vpxor	x2,   x0,   x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
	vpand	x3,   x2,   x2; \
	vpxor	x4,   x3,   x3; \
	vpxor	x3,   x2,   x2; \
	vpxor	x3,   x1,   x1; \
	vpand	x0,   x3,   x3; \
	vpxor	x0,   x1,   x1; \
	vpxor	x2,   x0,   x0; \
	vpxor	x3,   x4,   x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
	vpxor	x3,   x1,   x1; \
	vpxor	x2,   x0,   tp; \
	vpxor	RNOT, x2,   x2; \
	vpor	x1,   x0,   x4; \
	vpxor	x3,   x4,   x4; \
	vpand	x1,   x3,   x3; \
	vpxor	x2,   x1,   x1; \
	vpand	x4,   x2,   x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
	vpxor	x1,   x4,   x4; \
	vpor	x3,   x1,   x1; \
	vpxor	tp,   x3,   x3; \
	vpxor	tp,   x2,   x2; \
	vpor	x4,   tp,   x0; \
	vpxor	x4,   x2,   x2; \
	vpxor	x0,   x1,   x1; \
	vpxor	x1,   x4,   x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
	vpxor	x1,   x2,   x2; \
	vpxor	RNOT, x3,   tp; \
	vpor	x2,   tp,   tp; \
	vpxor	x3,   x2,   x2; \
	vpxor	x0,   x3,   x4; \
	vpxor	x1,   tp,   x3; \
	vpor	x2,   x1,   x1; \
	vpxor	x0,   x2,   x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
	vpxor	x4,   x1,   x1; \
	vpor	x3,   x4,   x4; \
	vpxor	x3,   x2,   x2; \
	vpxor	x2,   x4,   x4; \
	vpand	x1,   x2,   x2; \
	vpxor	x3,   x2,   x2; \
	vpxor	x4,   x3,   x3; \
	vpxor	x0,   x4,   x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
	vpxor	x1,   x2,   x2; \
	vpand	x2,   x1,   tp; \
	vpxor	x0,   tp,   tp; \
	vpor	x1,   x0,   x0; \
	vpxor	x3,   x1,   x4; \
	vpxor	x3,   x0,   x0; \
	vpor	tp,   x3,   x3; \
	vpxor	x2,   tp,   x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
	vpxor	x3,   x1,   x1; \
	vpxor	x2,   x0,   x0; \
	vpxor	x3,   x2,   x2; \
	vpand	x1,   x3,   x3; \
	vpxor	x0,   x1,   x1; \
	vpand	x2,   x0,   x0; \
	vpxor	x3,   x4,   x4; \
	vpxor	x0,   x3,   x3; \
	vpxor	x1,   x0,   x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
	vpxor	x3,   x2,   x2; \
	vpand	x1,   x0,   tp; \
	vpxor	x2,   tp,   tp; \
	vpor	x3,   x2,   x2; \
	vpxor	RNOT, x0,   x4; \
	vpxor	tp,   x1,   x1; \
	vpxor	x2,   tp,   x0; \
	vpand	x4,   x2,   x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
	vpxor	x0,   x2,   x2; \
	vpor	x4,   x0,   x0; \
	vpxor	x3,   x0,   x0; \
	vpand	x2,   x3,   x3; \
	vpxor	x3,   x4,   x4; \
	vpxor	x1,   x3,   x3; \
	vpand	x0,   x1,   x1; \
	vpxor	x1,   x4,   x4; \
	vpxor	x3,   x0,   x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
	vpor	x2,   x1,   tp; \
	vpxor	x1,   x2,   x2; \
	vpxor	x3,   tp,   tp; \
	vpand	x1,   x3,   x3; \
	vpxor	x3,   x2,   x2; \
	vpor	x0,   x3,   x3; \
	vpxor	RNOT, x0,   x0; \
	vpxor	x2,   x3,   x3; \
	vpor	x0,   x2,   x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
	vpxor	tp,   x1,   x4; \
	vpxor	x4,   x2,   x2; \
	vpand	x0,   x4,   x4; \
	vpxor	tp,   x0,   x0; \
	vpxor	x3,   tp,   x1; \
	vpand	x2,   x0,   x0; \
	vpxor	x3,   x2,   x2; \
	vpxor	x2,   x0,   x0; \
	vpxor	x4,   x2,   x2; \
	vpxor	x3,   x4,   x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
	vpxor	x2,   x0,   x0; \
	vpand	x3,   x0,   tp; \
	vpxor	x3,   x2,   x2; \
	vpxor	x2,   tp,   tp; \
	vpxor	x1,   x3,   x3; \
	vpor	x0,   x2,   x2; \
	vpxor	x3,   x2,   x2; \
	vpand	tp,   x3,   x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
	vpxor	RNOT, tp,   tp; \
	vpxor	x1,   x3,   x3; \
	vpand	x2,   x1,   x1; \
	vpxor	tp,   x0,   x4; \
	vpxor	x4,   x3,   x3; \
	vpxor	x2,   x4,   x4; \
	vpxor	x1,   tp,   x0; \
	vpxor	x0,   x2,   x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
	vpand	x0,   x3,   tp; \
	vpxor	x2,   x0,   x0; \
	vpor	x3,   x2,   x2; \
	vpxor	x1,   x3,   x4; \
	vpxor	RNOT, x0,   x0; \
	vpor	tp,   x1,   x1; \
	vpxor	x0,   x4,   x4; \
	vpand	x2,   x0,   x0; \
	vpxor	x1,   x0,   x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
	vpand	x2,   x1,   x1; \
	vpxor	x2,   tp,   x3; \
	vpxor	x3,   x4,   x4; \
	vpand	x3,   x2,   x2; \
	vpor	x0,   x3,   x3; \
	vpxor	x4,   x1,   x1; \
	vpxor	x4,   x3,   x3; \
	vpand	x0,   x4,   x4; \
	vpxor	x2,   x4,   x4;

/* Broadcast 32-bit subkey word j of round i from the context into t. */
#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

/* XOR the four key words of round i into both register sets. */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
	vpxor RK0,	x0 ## 2, x0 ## 2; \
	vpxor RK1,	x1 ## 2, x1 ## 2; \
	vpxor RK2,	x2 ## 2, x2 ## 2; \
	vpxor RK3,	x3 ## 2, x3 ## 2;

/*
 * Serpent linear transformation followed by XOR with the key of round i
 * (encryption direction), applied to both register sets.  Rotate-left is
 * synthesized as shift-left + shift-right + OR (AVX2 has no vector
 * rotate); the get_key() loads are interleaved with the arithmetic to
 * hide their latency.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13,		x0 ## 2, x4 ## 2;          \
	vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x2 ## 2, x4 ## 2;          \
	vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1;          \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1,		x1 ## 2, x4 ## 2;          \
	vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
	vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x0 ## 2, x4 ## 2;          \
	vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1;          \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7,		x3 ## 2, x4 ## 2;          \
	vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
	vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7,		x1 ## 2, x4 ## 2;          \
	vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpslld $5,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpxor			RK1, x1 ## 2, x1 ## 2;     \
	vpxor			RK3, x3 ## 2, x3 ## 2;     \
	vpslld $5,		x0 ## 2, x4 ## 2;          \
	vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22,		x2 ## 2, x4 ## 2;          \
	vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			RK0, x0 ## 2, x0 ## 2;     \
	vpxor			RK2, x2 ## 2, x2 ## 2;

/*
 * Inverse of LK2: XOR with the key of round i (preloaded into RK0..RK3
 * by the preceding SP() macro), then the inverse linear transformation
 * (rotate-right synthesized as shift-right + shift-left + OR).
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpsrld $5,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpsrld $22,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 2, x0 ## 2;     \
	vpxor			RK2, x2 ## 2, x2 ## 2;     \
	vpsrld $5,		x0 ## 2, x4 ## 2;          \
	vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			RK3, x3 ## 2, x3 ## 2;     \
	vpxor			RK1, x1 ## 2, x1 ## 2;     \
	vpsrld $22,		x2 ## 2, x4 ## 2;          \
	vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1;          \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7,		x1 ## 2, x4 ## 2;          \
	vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1,		x1 ## 2, x4 ## 2;          \
	vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
	vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1;          \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7,		x3 ## 2, x4 ## 2;          \
	vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
	vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x0 ## 2, x4 ## 2;          \
	vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13,		x0 ## 2, x4 ## 2;          \
	vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3,		x2 ## 2, x4 ## 2;          \
	vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2;

/* Apply both halves of an S-box to both register sets. */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * Like S(), but also preloads round-i key words into RK0..RK3 between
 * the S-box halves; the following KL2() consumes them.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

/* 4x4 transpose of 32-bit dwords across four ymm registers (t0..t2 are
 * scratch) — converts between per-block and bitsliced data layout. */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 */

	/* RNOT := all-ones, used by the S-box macros for NOT-via-XOR */
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* 32 Serpent rounds: key-mix, S-box, linear transform.  The
	 * register-name rotation between rounds avoids move instructions. */
	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	/* final round: S-box then plain key-mix (no linear transform) */
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk16)

SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 */

	/* RNOT := all-ones, used by the S-box macros for NOT-via-XOR */
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* 32 rounds in reverse: inverse S-boxes with key schedule walked
	 * from round 32 down to 0 (SP preloads the key KL2 consumes). */
	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	/* final inverse round: S-box then key-mix with round 0 */
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_dec_blk16)

SYM_FUNC_START(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* avoid AVX/SSE transition penalties around kernel FPU context */
	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_enc_16way)

SYM_FUNC_START(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	/* note the permuted output register order of __serpent_dec_blk16 */
	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_dec_16way)

SYM_FUNC_START(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	/* CBC chaining: XOR decrypted blocks with preceding ciphertext
	 * blocks still at %rdx; RK0 is the helper's scratch register. */
	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_cbc_dec_16way)