Path: blob/master/arch/x86/crypto/aria-aesni-avx-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */
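
/*
 * Note: byteslice_16x16b()/debyteslice_16x16b() transpose a 16x16 byte
 * matrix held in sixteen XMM registers so that, after byteslicing,
 * register i holds byte i of every one of the 16 blocks.  The round
 * macros below rely on this layout to process the same byte position
 * of all 16 blocks with a single instruction.
 */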

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 16)(rio), x0; \
	vmovdqu (1 * 16)(rio), x1; \
	vmovdqu (2 * 16)(rio), x2; \
	vmovdqu (3 * 16)(rio), x3; \
	vmovdqu (4 * 16)(rio), x4; \
	vmovdqu (5 * 16)(rio), x5; \
	vmovdqu (6 * 16)(rio), x6; \
	vmovdqu (7 * 16)(rio), x7; \
	vmovdqu (8 * 16)(rio), y0; \
	vmovdqu (9 * 16)(rio), y1; \
	vmovdqu (10 * 16)(rio), y2; \
	vmovdqu (11 * 16)(rio), y3; \
	vmovdqu (12 * 16)(rio), y4; \
	vmovdqu (13 * 16)(rio), y5; \
	vmovdqu (14 * 16)(rio), y6; \
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 16(mem); \
	vmovdqu x1, 1 * 16(mem); \
	vmovdqu x2, 2 * 16(mem); \
	vmovdqu x3, 3 * 16(mem); \
	vmovdqu x4, 4 * 16(mem); \
	vmovdqu x5, 5 * 16(mem); \
	vmovdqu x6, 6 * 16(mem); \
	vmovdqu x7, 7 * 16(mem); \
	vmovdqu y0, 8 * 16(mem); \
	vmovdqu y1, 9 * 16(mem); \
	vmovdqu y2, 10 * 16(mem); \
	vmovdqu y3, 11 * 16(mem); \
	vmovdqu y4, 12 * 16(mem); \
	vmovdqu y5, 13 * 16(mem); \
	vmovdqu y6, 14 * 16(mem); \
	vmovdqu y7, 15 * 16(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
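
/*
 * aria_ark_8way() below is AddRoundKey for the byte-sliced state: it
 * broadcasts one 32-bit round-key word, isolates each of its four
 * bytes with vpsrld, splats that byte across a whole register
 * (vpshufb with t1, which callers pass in as an all-zero mask) and
 * XORs it into the matching byte-sliced state register.
 */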

#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, t1, t2, rk, \
		      idx, round) \
	/* AddRoundKey */ \
	vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
	vpsrld $24, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x0, x0; \
	vpsrld $16, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x1, x1; \
	vpsrld $8, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x2, x2; \
	vpshufb t1, t0, t2; \
	vpxor t2, x3, x3; \
	vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
	vpsrld $24, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x4, x4; \
	vpsrld $16, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x5, x5; \
	vpsrld $8, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x6, x6; \
	vpshufb t1, t0, t2; \
	vpxor t2, x7, x7;

#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
	vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vmovdqa .Linv_shift_row(%rip), t0; \
	vmovdqa .Lshift_row(%rip), t1; \
	vbroadcastss .L0f0f0f0f(%rip), t6; \
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
	\
	vaesenclast t7, x0, x0; \
	vaesenclast t7, x4, x4; \
	vaesenclast t7, x1, x1; \
	vaesenclast t7, x5, x5; \
	vaesdeclast t7, x2, x2; \
	vaesdeclast t7, x6, x6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	vaesdeclast t7, x3, x3; \
	vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;
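
/*
 * The round macros that follow combine the pieces above: aria_fe()
 * and aria_fo() are the even/odd round functions (the argument order
 * passed to aria_sbox_8way() selects which S-box layer each register
 * goes through), while ARIA's diffusion layer is applied as
 * aria_diff_m() (byte rotations within 32-bit words), aria_diff_word()
 * (word-wise XOR mixing) and the byte permutation "aria_diff_byte()",
 * which costs nothing in byte-sliced form and is expressed purely by
 * reordering register arguments, as the comments inside the macros
 * note.
 */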

#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
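
/*
 * aria_ff() above is the final round: the substitution layer sits
 * between the last two round keys and the diffusion layer is skipped.
 * The *_gfni macros below implement the same rounds, but build the
 * S-boxes from single GF(2^8) affine instructions
 * (vgf2p8affineqb/vgf2p8affineinvqb driven by the .Ltf_*_bitmatrix
 * constants) instead of the AES-NI plus vpshufb construction used by
 * aria_sbox_8way().
 */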

#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
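
/*
 * Constant tables used above: .Lshufb_16x16b and the (inverse) shift
 * row masks rearrange bytes for the byteslicing and the AES-NI based
 * S-box trick, .Lbswap128_mask byteswaps the CTR IV, the
 * .Ltf_lo/hi__* pairs are the nibble lookup tables consumed by
 * filter_8bit(), and the .Ltf_*_bitmatrix/tf_*_const values are the
 * 8x8 bit matrices and affine constants used by the GFNI S-box
 * variants.
 */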

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 * 1 1 0 0 0 0 0 1     x0     0
 * 0 1 0 0 1 0 0 0     x1     0
 * 1 1 0 0 1 1 1 1     x2     0
 * 0 1 1 0 1 0 0 1     x3     1
 * 0 1 0 0 1 1 0 0  *  x4  +  0
 * 0 1 0 1 1 0 0 0     x5     0
 * 0 0 0 0 0 1 0 1     x6     0
 * 1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 * 1 0 1 1 0 0 0 1     x0     0
 * 0 1 1 1 1 0 1 1     x1     0
 * 0 0 0 1 1 0 1 0     x2     1
 * 0 1 0 0 0 1 0 0     x3     0
 * 0 0 1 1 1 0 1 1  *  x4  +  0
 * 0 1 0 0 1 0 0 0     x5     0
 * 1 1 0 1 0 0 1 1     x6     0
 * 0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
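
/*
 * __aria_aesni_avx_ctr_gen_keystream_16way() expands the big-endian IV
 * at (%r8) into 16 consecutive counter blocks: the first eight are
 * written to the keystream buffer at (%rcx) and reloaded into
 * %xmm0..%xmm7, the next eight stay in %xmm8..%xmm15, and the
 * incremented IV is stored back to (%r8) for the next call.
 */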

SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
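
/*
 * The GFNI entry points below mirror the AES-NI/AVX ones above;
 * __aria_aesni_avx_gfni_crypt_16way only differs from
 * __aria_aesni_avx_crypt_16way in using the aria_*_gfni round macros.
 */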

SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)