Path: arch/x86/crypto/aria-aesni-avx-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )
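/*
 * A worked example of the two helpers above: BV8(a0, ..., a7) packs its
 * arguments into bits 0..7 of a byte, so BV8(1, 1, 0, 0, 0, 1, 1, 0)
 * (used as tf_aff_const further down) evaluates to 0x63, the constant of
 * the AES S-box affine transform.  BM8X8() then packs eight such row
 * vectors into one 64-bit value with row l0 in the most significant byte;
 * that is the layout in which the .Ltf_*_bitmatrix constants below are
 * handed to vgf2p8affineqb/vgf2p8affineinvqb.
 */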
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 16)(rio), x0; \
	vmovdqu (1 * 16)(rio), x1; \
	vmovdqu (2 * 16)(rio), x2; \
	vmovdqu (3 * 16)(rio), x3; \
	vmovdqu (4 * 16)(rio), x4; \
	vmovdqu (5 * 16)(rio), x5; \
	vmovdqu (6 * 16)(rio), x6; \
	vmovdqu (7 * 16)(rio), x7; \
	vmovdqu (8 * 16)(rio), y0; \
	vmovdqu (9 * 16)(rio), y1; \
	vmovdqu (10 * 16)(rio), y2; \
	vmovdqu (11 * 16)(rio), y3; \
	vmovdqu (12 * 16)(rio), y4; \
	vmovdqu (13 * 16)(rio), y5; \
	vmovdqu (14 * 16)(rio), y6; \
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 16(mem); \
	vmovdqu x1, 1 * 16(mem); \
	vmovdqu x2, 2 * 16(mem); \
	vmovdqu x3, 3 * 16(mem); \
	vmovdqu x4, 4 * 16(mem); \
	vmovdqu x5, 5 * 16(mem); \
	vmovdqu x6, 6 * 16(mem); \
	vmovdqu x7, 7 * 16(mem); \
	vmovdqu y0, 8 * 16(mem); \
	vmovdqu y1, 9 * 16(mem); \
	vmovdqu y2, 10 * 16(mem); \
	vmovdqu y3, 11 * 16(mem); \
	vmovdqu y4, 12 * 16(mem); \
	vmovdqu y5, 13 * 16(mem); \
	vmovdqu y6, 14 * 16(mem); \
	vmovdqu y7, 15 * 16(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, t1, t2, rk, \
		      idx, round) \
	/* AddRoundKey */ \
	vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
	vpsrld $24, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x0, x0; \
	vpsrld $16, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x1, x1; \
	vpsrld $8, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x2, x2; \
	vpshufb t1, t0, t2; \
	vpxor t2, x3, x3; \
	vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
	vpsrld $24, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x4, x4; \
	vpsrld $16, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x5, x5; \
	vpsrld $8, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x6, x6; \
	vpshufb t1, t0, t2; \
	vpxor t2, x7, x7;
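/*
 * Note on aria_ark_8way(): vbroadcastss copies one 32-bit round-key word
 * into every dword lane of t0.  Each vpsrld/vpshufb pair then isolates one
 * byte of that word and splats it across the whole register: callers pass
 * an all-zero register as t1, and vpshufb with an all-zero shuffle mask
 * replicates byte 0 into every byte position.  The four splatted key bytes
 * are XORed into four of the byte-sliced state registers, i.e. this is
 * AddRoundKey performed on byte-sliced data.
 */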
#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
	vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
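/*
 * aria_sbox_8way() below is the AES-NI based implementation of ARIA's
 * S-box layer.  With an all-zero round key in t7, vaesenclast degenerates
 * to ShiftRows + SubBytes and vaesdeclast to InvShiftRows + InvSubBytes;
 * the vpshufb with .Linv_shift_row/.Lshift_row cancels the row shift,
 * leaving only the AES forward/inverse S-box.  The filter_8bit() calls
 * then apply the extra affine maps (the .Ltf_* tables in .rodata) that
 * turn those AES boxes into ARIA's S2 and X2 S-boxes.
 */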
#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vmovdqa .Linv_shift_row(%rip), t0; \
	vmovdqa .Lshift_row(%rip), t1; \
	vbroadcastss .L0f0f0f0f(%rip), t6; \
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
	\
	vaesenclast t7, x0, x0; \
	vaesenclast t7, x4, x4; \
	vaesenclast t7, x1, x1; \
	vaesenclast t7, x5, x5; \
	vaesdeclast t7, x2, x2; \
	vaesdeclast t7, x6, x6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	vaesdeclast t7, x3, x3; \
	vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;
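/*
 * The round macros below (aria_fe()/aria_fo() for ARIA's two alternating
 * round functions and aria_ff() for the final round) operate on a
 * 16-register byte-sliced state but keep only eight state registers live
 * at a time: each macro processes one half while the other half is parked
 * in mem_tmp (slots 8..15 and 0..7), swapping halves with
 * aria_store_state_8way()/aria_load_state_8way().  aria_ff() differs in
 * that it applies a second AddRoundKey with last_round instead of the
 * diffusion layer.
 */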
#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

#endif /* CONFIG_AS_GFNI */
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
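/*
 * A note on the GFNI constants that follow: each .Ltf_*_bitmatrix label
 * emits the same BM8X8() value twice because vgf2p8affineqb and
 * vgf2p8affineinvqb apply an independent 8x8 bit matrix to each 64-bit
 * lane, so the matrix has to be replicated across both halves of the
 * 128-bit register.  The matching tf_*_const values are the 8-bit affine
 * constants passed as the instruction immediate.
 */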
#ifdef CONFIG_AS_GFNI
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text
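/*
 * The 16-way crypt core below does not use a stack buffer for the
 * byte-sliced state: it reuses the destination buffer (%rsi, copied to
 * %rax, with %r8 pointing at its upper half) as the 16 x 16-byte scratch
 * area for byteslicing and for the per-round spills done through mem_tmp.
 * The number of rounds is chosen at run time from ARIA_CTX_rounds
 * (12, 14 or 16 for 128/192/256-bit keys).
 */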
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
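/*
 * CTR helper: the keystream generator below byteswaps the big-endian IV
 * and produces 16 consecutive counter blocks with inc_le128() (the first
 * eight go through the keystream buffer at (%rcx), the rest stay in
 * %xmm8..%xmm15), then stores the next IV back to (%r8).  inc_le128()
 * adds one to the low 64 bits by subtracting the -1 held in the low lane
 * of minus_one, and uses the vpcmpeqq/vpslldq/vpsubq sequence to propagate
 * the carry into the high 64 bits when the low half wraps.
 */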
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
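/*
 * The CTR entry point below reuses the block-encryption core: the counter
 * blocks produced above are already in %xmm0..%xmm15, and %rsi/%rdx are
 * redirected to the keystream buffer so that __aria_aesni_avx_crypt_16way()
 * can use it as its scratch area.  The encrypted counters are then XORed
 * with the 16 source blocks (saved in %r11) and written out to the
 * original destination (saved in %r10).
 */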
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
#endif /* CONFIG_AS_GFNI */