Path: blob/master/arch/x86/crypto/aria-aesni-avx2-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)	\
	( (((a0) & 1) << 0) |			\
	  (((a1) & 1) << 1) |			\
	  (((a2) & 1) << 2) |			\
	  (((a3) & 1) << 3) |			\
	  (((a4) & 1) << 4) |			\
	  (((a5) & 1) << 5) |			\
	  (((a6) & 1) << 6) |			\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)	\
	( ((l7) << (0 * 8)) |			\
	  ((l6) << (1 * 8)) |			\
	  ((l5) << (2 * 8)) |			\
	  ((l4) << (3 * 8)) |			\
	  ((l3) << (4 * 8)) |			\
	  ((l2) << (5 * 8)) |			\
	  ((l1) << (6 * 8)) |			\
	  ((l0) << (7 * 8)) )

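/*
 * Worked example (illustrative, not from the original source): BV8()
 * packs its arguments LSB first, so BV8(1, 1, 0, 0, 0, 1, 1, 0) == 0x63,
 * the AES affine constant used as tf_aff_const below.  BM8X8() packs
 * eight such rows into the 64-bit bit-matrix operand consumed by the
 * vgf2p8affine* instructions, with its first argument (l0) ending up in
 * the most significant byte.
 */
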
#define inc_le128(x, minus_one, tmp)	\
	vpcmpeqq minus_one, x, tmp;	\
	vpsubq minus_one, x, x;		\
	vpslldq $8, tmp, tmp;		\
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;	\
	vpandn x, mask4bit, x;		\
	vpsrld $4, x, x;		\
					\
	vpshufb tmp0, lo_t, tmp0;	\
	vpshufb x, hi_t, x;		\
	vpxor tmp0, x, x;

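/*
 * For illustration: filter_8bit() evaluates an 8-bit -> 8-bit transform
 * as two 16-entry vpshufb lookups.  Each byte x is split into nibbles
 * (mask4bit must hold 0x0f in every byte position) and the results are
 * combined as
 *
 *	f(x) = lo_t[x & 0x0f] ^ hi_t[x >> 4]
 *
 * which is sufficient for any GF(2)-affine byte transform; that is how
 * the S2/X2 affine steps in aria_sbox_8way() below are computed.
 */
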
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;	\
	vpunpckldq x1, x0, x0;	\
				\
	vpunpckldq x3, x2, t1;	\
	vpunpckhdq x3, x2, x2;	\
				\
	vpunpckhqdq t1, x0, x1;	\
	vpunpcklqdq t1, x0, x0;	\
				\
	vpunpckhqdq x2, t2, x3;	\
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0,	\
			 a1, b1, c1, d1,	\
			 a2, b2, c2, d2,	\
			 a3, b3, c3, d3,	\
			 st0, st1)		\
	vmovdqu d2, st0;	\
	vmovdqu d3, st1;	\
	transpose_4x4(a0, a1, a2, a3, d2, d3);	\
	transpose_4x4(b0, b1, b2, b3, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
				\
	vmovdqu a0, st0;	\
	vmovdqu a1, st1;	\
	transpose_4x4(c0, c1, c2, c3, a0, a1);	\
	transpose_4x4(d0, d1, d2, d3, a0, a1);	\
				\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;	\
	vpshufb a0, a2, a2;	\
	vpshufb a0, a3, a3;	\
	vpshufb a0, b0, b0;	\
	vpshufb a0, b1, b1;	\
	vpshufb a0, b2, b2;	\
	vpshufb a0, b3, b3;	\
	vpshufb a0, a1, a1;	\
	vpshufb a0, c0, c0;	\
	vpshufb a0, c1, c1;	\
	vpshufb a0, c2, c2;	\
	vpshufb a0, c3, c3;	\
	vpshufb a0, d0, d0;	\
	vpshufb a0, d1, d1;	\
	vpshufb a0, d2, d2;	\
	vpshufb a0, d3, d3;	\
	vmovdqu d3, st1;	\
	vmovdqu st0, d3;	\
	vpshufb a0, d3, a0;	\
	vmovdqu d2, st0;	\
				\
	transpose_4x4(a0, b0, c0, d0, d2, d3);	\
	transpose_4x4(a1, b1, c1, d1, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
				\
	vmovdqu b0, st0;	\
	vmovdqu b1, st1;	\
	transpose_4x4(a2, b2, c2, d2, b0, b1);	\
	transpose_4x4(a3, b3, c3, d3, b0, b1);	\
	vmovdqu st0, b0;	\
	vmovdqu st1, b1;	\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,	\
			   a1, b1, c1, d1,	\
			   a2, b2, c2, d2,	\
			   a3, b3, c3, d3,	\
			   st0, st1)		\
	vmovdqu d2, st0;	\
	vmovdqu d3, st1;	\
	transpose_4x4(a0, a1, a2, a3, d2, d3);	\
	transpose_4x4(b0, b1, b2, b3, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
				\
	vmovdqu a0, st0;	\
	vmovdqu a1, st1;	\
	transpose_4x4(c0, c1, c2, c3, a0, a1);	\
	transpose_4x4(d0, d1, d2, d3, a0, a1);	\
				\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;	\
	vpshufb a0, a2, a2;	\
	vpshufb a0, a3, a3;	\
	vpshufb a0, b0, b0;	\
	vpshufb a0, b1, b1;	\
	vpshufb a0, b2, b2;	\
	vpshufb a0, b3, b3;	\
	vpshufb a0, a1, a1;	\
	vpshufb a0, c0, c0;	\
	vpshufb a0, c1, c1;	\
	vpshufb a0, c2, c2;	\
	vpshufb a0, c3, c3;	\
	vpshufb a0, d0, d0;	\
	vpshufb a0, d1, d1;	\
	vpshufb a0, d2, d2;	\
	vpshufb a0, d3, d3;	\
	vmovdqu d3, st1;	\
	vmovdqu st0, d3;	\
	vpshufb a0, d3, a0;	\
	vmovdqu d2, st0;	\
				\
	transpose_4x4(c0, d0, a0, b0, d2, d3);	\
	transpose_4x4(c1, d1, a1, b1, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
				\
	vmovdqu b0, st0;	\
	vmovdqu b1, st1;	\
	transpose_4x4(c2, d2, a2, b2, b0, b1);	\
	transpose_4x4(c3, d3, a3, b3, b0, b1);	\
	vmovdqu st0, b0;	\
	vmovdqu st1, b1;	\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     rio)		\
	vmovdqu (0 * 32)(rio), x0;	\
	vmovdqu (1 * 32)(rio), x1;	\
	vmovdqu (2 * 32)(rio), x2;	\
	vmovdqu (3 * 32)(rio), x3;	\
	vmovdqu (4 * 32)(rio), x4;	\
	vmovdqu (5 * 32)(rio), x5;	\
	vmovdqu (6 * 32)(rio), x6;	\
	vmovdqu (7 * 32)(rio), x7;	\
	vmovdqu (8 * 32)(rio), y0;	\
	vmovdqu (9 * 32)(rio), y1;	\
	vmovdqu (10 * 32)(rio), y2;	\
	vmovdqu (11 * 32)(rio), y3;	\
	vmovdqu (12 * 32)(rio), y4;	\
	vmovdqu (13 * 32)(rio), y5;	\
	vmovdqu (14 * 32)(rio), y6;	\
	vmovdqu (15 * 32)(rio), y7;

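/*
 * Note on the data layout: inpack16_pre() loads 32 consecutive 16-byte
 * blocks into %ymm0..%ymm15, and byteslice_16x16b() then transposes them
 * so that each ymm register holds a single byte position of the ARIA
 * state taken from all 32 blocks.  Every S-box and diffusion step below
 * therefore processes 32 blocks in parallel; debyteslice_16x16b() undoes
 * the transposition before the result is written out.
 */
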
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,	\
		      x4, x5, x6, x7,	\
		      y0, y1, y2, y3,	\
		      y4, y5, y6, y7,	\
		      mem_ab, mem_cd)	\
	byteslice_16x16b(x0, x1, x2, x3,	\
			 x4, x5, x6, x7,	\
			 y0, y1, y2, y3,	\
			 y4, y5, y6, y7,	\
			 (mem_ab), (mem_cd));	\
				\
	vmovdqu x0, 0 * 32(mem_ab);	\
	vmovdqu x1, 1 * 32(mem_ab);	\
	vmovdqu x2, 2 * 32(mem_ab);	\
	vmovdqu x3, 3 * 32(mem_ab);	\
	vmovdqu x4, 4 * 32(mem_ab);	\
	vmovdqu x5, 5 * 32(mem_ab);	\
	vmovdqu x6, 6 * 32(mem_ab);	\
	vmovdqu x7, 7 * 32(mem_ab);	\
	vmovdqu y0, 0 * 32(mem_cd);	\
	vmovdqu y1, 1 * 32(mem_cd);	\
	vmovdqu y2, 2 * 32(mem_cd);	\
	vmovdqu y3, 3 * 32(mem_cd);	\
	vmovdqu y4, 4 * 32(mem_cd);	\
	vmovdqu y5, 5 * 32(mem_cd);	\
	vmovdqu y6, 6 * 32(mem_cd);	\
	vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem)		\
	vmovdqu x0, 0 * 32(mem);	\
	vmovdqu x1, 1 * 32(mem);	\
	vmovdqu x2, 2 * 32(mem);	\
	vmovdqu x3, 3 * 32(mem);	\
	vmovdqu x4, 4 * 32(mem);	\
	vmovdqu x5, 5 * 32(mem);	\
	vmovdqu x6, 6 * 32(mem);	\
	vmovdqu x7, 7 * 32(mem);	\
	vmovdqu y0, 8 * 32(mem);	\
	vmovdqu y1, 9 * 32(mem);	\
	vmovdqu y2, 10 * 32(mem);	\
	vmovdqu y3, 11 * 32(mem);	\
	vmovdqu y4, 12 * 32(mem);	\
	vmovdqu y5, 13 * 32(mem);	\
	vmovdqu y6, 14 * 32(mem);	\
	vmovdqu y7, 15 * 32(mem);	\

#define aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, idx)	\
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);	\
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);	\
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);	\
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);	\
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);	\
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);	\
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);	\
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, idx)	\
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;	\
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;	\
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;	\
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;	\
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;	\
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;	\
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;	\
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3,	\
		      x4, x5, x6, x7,	\
		      t0, rk, idx, round)	\
	/* AddRoundKey */			\
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;	\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;	\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;	\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;	\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;	\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;	\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;	\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;

#define aria_sbox_8way_gfni(x0, x1, x2, x3,	\
			    x4, x5, x6, x7,	\
			    t0, t1, t2, t3,	\
			    t4, t5, t6, t7)	\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;	\
	vgf2p8affineinvqb $0, t2, x6, x6;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;	\
	vgf2p8affineinvqb $0, t2, x7, x7

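/*
 * In the GFNI path above, each ARIA S-box layer is computed directly with
 * gf2p8affineqb/gf2p8affineinvqb: the "inv" forms first run each byte
 * through the GF(2^8) inverse (the core of the AES S-box) and then apply
 * the affine transform given by the bit-matrix and immediate constant, so
 * S1, S2 and their inverses each take only one or two instructions per
 * register.
 */
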
#define aria_sbox_8way(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       t0, t1, t2, t3,	\
		       t4, t5, t6, t7)	\
	vpxor t7, t7, t7;	\
	vpxor t6, t6, t6;	\
	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
	vbroadcasti128 .Lshift_row(%rip), t1;		\
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
				\
	vextracti128 $1, x0, t6##_x;		\
	vaesenclast t7##_x, x0##_x, x0##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x0, x0;		\
				\
	vextracti128 $1, x4, t6##_x;		\
	vaesenclast t7##_x, x4##_x, x4##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x4, x4;		\
				\
	vextracti128 $1, x1, t6##_x;		\
	vaesenclast t7##_x, x1##_x, x1##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x1, x1;		\
				\
	vextracti128 $1, x5, t6##_x;		\
	vaesenclast t7##_x, x5##_x, x5##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x5, x5;		\
				\
	vextracti128 $1, x2, t6##_x;		\
	vaesdeclast t7##_x, x2##_x, x2##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x2, x2;		\
				\
	vextracti128 $1, x6, t6##_x;		\
	vaesdeclast t7##_x, x6##_x, x6##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x6, x6;		\
				\
	vpbroadcastd .L0f0f0f0f(%rip), t6;	\
				\
	/* AES inverse shift rows */	\
	vpshufb t0, x0, x0;	\
	vpshufb t0, x4, x4;	\
	vpshufb t0, x1, x1;	\
	vpshufb t0, x5, x5;	\
	vpshufb t1, x3, x3;	\
	vpshufb t1, x7, x7;	\
	vpshufb t1, x2, x2;	\
	vpshufb t1, x6, x6;	\
				\
	/* affine transformation for S2 */	\
	filter_8bit(x1, t2, t3, t6, t0);	\
	/* affine transformation for S2 */	\
	filter_8bit(x5, t2, t3, t6, t0);	\
				\
	/* affine transformation for X2 */	\
	filter_8bit(x3, t4, t5, t6, t0);	\
	/* affine transformation for X2 */	\
	filter_8bit(x7, t4, t5, t6, t0);	\
				\
	vpxor t6, t6, t6;	\
	vextracti128 $1, x3, t6##_x;		\
	vaesdeclast t7##_x, x3##_x, x3##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x3, x3;		\
				\
	vextracti128 $1, x7, t6##_x;		\
	vaesdeclast t7##_x, x7##_x, x7##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x7, x7;		\

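/*
 * The AES-NI path above has no direct S-box instruction, so it borrows the
 * AES one: aesenclast/aesdeclast with an all-zero round key apply
 * ShiftRows+SubBytes (resp. InvShiftRows+InvSubBytes), the
 * .Linv_shift_row/.Lshift_row shuffles cancel the unwanted row shifts, and
 * the filter_8bit() affine steps splice in the extra affine maps that turn
 * the AES S-box into ARIA's S2 and its inverse X2.
 */
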
#define aria_diff_m(x0, x1, x2, x3,	\
		    t0, t1, t2, t3)	\
	/* T = rotr32(X, 8); */		\
	/* X ^= T */			\
	vpxor x0, x3, t0;	\
	vpxor x1, x0, t1;	\
	vpxor x2, x1, t2;	\
	vpxor x3, x2, t3;	\
	/* X = T ^ rotr(X, 16); */	\
	vpxor t2, x0, x0;	\
	vpxor x1, t3, t3;	\
	vpxor t0, x2, x2;	\
	vpxor t1, x3, x1;	\
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7)	\
	/* t1 ^= t2; */		\
	vpxor y0, x4, x4;	\
	vpxor y1, x5, x5;	\
	vpxor y2, x6, x6;	\
	vpxor y3, x7, x7;	\
				\
	/* t2 ^= t3; */		\
	vpxor y4, y0, y0;	\
	vpxor y5, y1, y1;	\
	vpxor y6, y2, y2;	\
	vpxor y7, y3, y3;	\
				\
	/* t0 ^= t1; */		\
	vpxor x4, x0, x0;	\
	vpxor x5, x1, x1;	\
	vpxor x6, x2, x2;	\
	vpxor x7, x3, x3;	\
				\
	/* t3 ^= t1; */		\
	vpxor x4, y4, y4;	\
	vpxor x5, y5, y5;	\
	vpxor x6, y6, y6;	\
	vpxor x7, y7, y7;	\
				\
	/* t2 ^= t0; */		\
	vpxor x0, y0, y0;	\
	vpxor x1, y1, y1;	\
	vpxor x2, y2, y2;	\
	vpxor x3, y3, y3;	\
				\
	/* t1 ^= t2; */		\
	vpxor y0, x4, x4;	\
	vpxor y1, x5, x5;	\
	vpxor y2, x6, x6;	\
	vpxor y3, x7, x7;

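/*
 * aria_diff_m() and aria_diff_word() together implement ARIA's linear
 * diffusion layer (a 16x16 binary matrix over the state bytes).  In the
 * byte-sliced representation the byte rotations reduce to picking
 * different source registers, so the layer becomes the plain XOR sequences
 * above plus the aria_diff_byte() register reorderings spelled out in
 * comments inside the round macros below.
 */
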
#define aria_fe(x0, x1, x2, x3,		\
		x4, x5, x6, x7,		\
		y0, y1, y2, y3,		\
		y4, y5, y6, y7,		\
		mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
				\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
				\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
				\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()		\
	 * T3 = ABCD -> BADC		\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB		\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA		\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */				\
	aria_diff_word(x2, x3, x0, x1,	\
		       x7, x6, x5, x4,	\
		       y0, y1, y2, y3,	\
		       y5, y4, y7, y6);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,		\
		x4, x5, x6, x7,		\
		y0, y1, y2, y3,		\
		y4, y5, y6, y7,		\
		mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
				\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
				\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
				\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()		\
	 * T1 = ABCD -> BADC		\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB		\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA		\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */				\
	aria_diff_word(x0, x1, x2, x3,	\
		       x5, x4, x7, x6,	\
		       y2, y3, y0, y1,	\
		       y7, y6, y5, y4);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,		\
		x4, x5, x6, x7,		\
		y0, y1, y2, y3,		\
		y4, y5, y6, y7,		\
		mem_tmp, rk, round, last_round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
				\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);	\
				\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
				\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
				\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);	\
				\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);

#define aria_fe_gfni(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
				\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
				\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
				\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()		\
	 * T3 = ABCD -> BADC		\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB		\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA		\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */				\
	aria_diff_word(x2, x3, x0, x1,	\
		       x7, x6, x5, x4,	\
		       y0, y1, y2, y3,	\
		       y5, y4, y7, y6);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
				\
	aria_sbox_8way_gfni(x0, x1, x2, x3,	\
			    x4, x5, x6, x7,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
				\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
				\
	aria_sbox_8way_gfni(x0, x1, x2, x3,	\
			    x4, x5, x6, x7,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
				\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()		\
	 * T1 = ABCD -> BADC		\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB		\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA		\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */				\
	aria_diff_word(x0, x1, x2, x3,	\
		       x5, x4, x7, x6,	\
		       y2, y3, y0, y1,	\
		       y7, y6, y5, y4);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
				\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);	\
				\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
				\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
				\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);	\
				\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);

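/*
 * aria_fo/aria_fe implement ARIA's odd and even round functions: both are
 * AddRoundKey + S-box layer + diffusion, differing in which of the two
 * S-box layer orderings they use and in the byte permutation (see the
 * aria_diff_byte() comments).  aria_ff is the final round: it applies the
 * last two round keys around a single S-box layer and skips the diffusion.
 * Each macro works on 8 of the 16 byte-slice registers at a time,
 * spilling the other 8 to mem_tmp.  The *_gfni variants are identical
 * apart from using aria_sbox_8way_gfni().
 */
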
.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

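/*
 * For reference: vgf2p8affineqb computes A*x ^ b for every byte x, where A
 * is the 8x8 bit matrix broadcast from one of the .Ltf_*_bitmatrix entries
 * above and b is the immediate (tf_*_const).  vgf2p8affineinvqb first
 * replaces x with its inverse in GF(2^8) (0 maps to 0) and then applies
 * the same affine step, which is why the identity matrix with a zero
 * immediate yields a plain field inversion.
 */
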
/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

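/*
 * The dispatch above follows the ARIA key sizes: 12 rounds for 128-bit
 * keys, 14 for 192-bit and 16 for 256-bit.  Rounds 0-10 are unrolled
 * unconditionally and ARIA_CTX_rounds then selects which tail (final
 * round pair 11/12, 13/14 or 15/16) is executed.
 */
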
SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

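/*
 * Keystream IV generation has two paths: as long as the low 64 bits of the
 * big-endian counter cannot overflow while producing 32 blocks (the cmpq
 * against 0xffffffffffffffff - 32), counters are built two at a time with
 * 64-bit vpsubq on the little-endian form; otherwise the slower
 * .Lhandle_ctr_carry path increments with the full 128-bit inc_le128() so
 * the carry propagates into the high quadword.
 */
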
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)