Path: blob/master/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <[email protected]>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
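/*
 * filter_8bit() below applies an 8-bit substitution expressed as two 4-bit
 * table lookups: for every byte b of x it computes
 *
 *	x = lo_t[b & 0x0f] ^ hi_t[b >> 4]
 *
 * mask4bit must hold 0x0f in each byte and tmp0 is clobbered.  The low
 * nibble is isolated with vpand, the high nibble with vpandn + vpsrld, and
 * each nibble then indexes a 16-entry table via vpshufb.  This is what the
 * pre- and post-SubBytes transforms further down are built from.
 */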
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

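/*
 * The ymmN_x aliases name the low 128-bit halves of the ymm registers.  The
 * round macro below pastes "##_x" onto a ymm argument whenever an operation
 * only exists in a 128-bit form (vaesenclast and the vextracti128 /
 * vinserti128 steps), while the rest of the round runs on the full 256-bit
 * registers.
 */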
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/
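/*
 * roundsm32() computes one Camellia round for 32 blocks in byte-sliced form.
 * The S-function reuses the AES S-box: the state is first permuted with
 * .Linv_shift_row and then run through vaesenclast with an all-zero round
 * key, so the ShiftRows step of AESENCLAST is cancelled and the zero key
 * makes AddRoundKey a no-op, leaving pure AES SubBytes.  The filter_8bit()
 * passes before and after translate between the Camellia s-box domain and
 * the AES S-box domain (see the .Lpre_tf_* and .Lpost_tf_* tables below);
 * the rotations that distinguish Camellia s2, s3 and s4 from s1 are folded
 * into those tables.  AESENCLAST is a 128-bit operation, so each ymm
 * register is processed as two xmm halves and reassembled with vinserti128.
 */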

/*
 * IN:
 *  x0..x7: byte-sliced AB state
 *  mem_cd: register pointer storing CD state
 *  key: index for key material
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row(%rip), t4; \
	vpbroadcastd .L0f0f0f0f(%rip), t7; \
	vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
	vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
	vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
	vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
	vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
	vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
	vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

/*
 * Size optimization... with inlined roundsm32 binary would be over 5 times
 * larger and only marginally faster.
 */
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
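
/*
 * rol32_1_32() rotates byte-sliced 32-bit words left by one bit: for each
 * slice the top bit of every byte is extracted with a signed compare against
 * zero (vpcmpgtb gives 0xff where the MSB is set, vpabsb turns that into
 * 0x01), the slice itself is doubled with vpaddb, and the extracted carry is
 * OR'd into the following slice, with the last slice's carry wrapping back
 * into the first.
 */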

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
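
/*
 * fls32() applies Camellia's FL and FL-inverse functions to the byte-sliced
 * state.  Each 32-bit subkey word (kll, klr, krl, krr) is loaded with
 * vpbroadcastd and its four bytes are splatted across whole registers with
 * vpshufb, so the AND/OR/XOR steps and the rol32_1_32() rotate operate on
 * all 32 blocks at once.  The results are written back to the l and r state
 * buffers in memory.
 */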

/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);

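/*
 * transpose_4x4() is a 4x4 transpose of 32-bit words built from
 * vpunpck{l,h}dq and vpunpck{l,h}qdq.  byteslice_16x16b_fast() combines it
 * with a vpshufb against .Lshufb_16x16b to convert sixteen ymm registers
 * between the normal block layout and the byte-sliced layout used by the
 * round macros, spilling two registers at a time to the st0/st1 scratch
 * slots.
 */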
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

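/*
 * inpack32_pre() broadcasts the 64-bit pre-whitening key with vpbroadcastq,
 * reorders its bytes via .Lpack_bswap and XORs it into all 16 registers
 * while loading the 32 input blocks; inpack32_post() then byte-slices the
 * result into the AB and CD work buffers.  outunpack32() reverses this and
 * applies the post-whitening key before write_output() stores the blocks.
 */
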
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);


.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text
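
/*
 * __camellia_enc_blk32 runs the byte-sliced rounds over the subkey table at
 * key_table: three enc_rounds32() groups separated by fls32() layers.  For
 * 16-byte keys the output whitening subkey is the 8-byte entry at index 24
 * and the function finishes after the third group; for larger keys the
 * .Lenc_max32 path adds one more fls32() layer and enc_rounds32() group and
 * uses index 32 instead.  __camellia_dec_blk32 walks the same structure in
 * reverse, with %r8d selecting whether the extra group runs.
 */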
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)

SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)

SYM_FUNC_START(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_32way)

SYM_FUNC_START(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_32way)

SYM_FUNC_START(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN
	subq $(16 * 32), %rsp;

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	addq $(16 * 32), %rsp;
	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_32way)
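
/*
 * Note on camellia_cbc_dec_32way: the chaining XOR with the previous
 * ciphertext block is done here for every block except the first (the
 * vinserti128 of (%rdx) and the "(N * 32 + 16)(%rdx)" operands above); the
 * first output block is left as the raw block decryption so the caller can
 * XOR it with the IV.  When src == dst the decrypted blocks are staged on
 * the stack, because the source buffer is still needed for those XORs.
 */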