Path: blob/master/arch/x86/crypto/aria-aesni-avx2-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

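/*
 * Illustrative note: BV8() packs one matrix row with a0 as the least
 * significant bit, and BM8X8() places row l0 in the most significant byte of
 * the resulting 64-bit constant (i.e. the row order is reversed), which is
 * the layout consumed by the vgf2p8affineqb/vgf2p8affineinvqb bit-matrices
 * defined below.  A small worked example:
 *
 *	BV8(1, 0, 0, 0, 0, 0, 0, 0) == 0x01
 *	BV8(0, 0, 0, 0, 0, 0, 0, 1) == 0x80
 *	BM8X8(0x01, 0, 0, 0, 0, 0, 0, 0x80) == 0x0100000000000080
 */
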
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

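/*
 * Illustrative note on the byte-sliced layout produced above: the 32 16-byte
 * blocks are treated as a 16x16 byte matrix per 128-bit lane and transposed,
 * so that after byteslice_16x16b() register i holds byte i of every block
 * (one byte per lane).  Each per-byte operation (S-box, round-key XOR) can
 * then be applied to all 32 blocks with a single vector instruction;
 * debyteslice_16x16b() undoes the transposition before the result is written
 * back out.
 */
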
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 32)(rio), x0; \
	vmovdqu (1 * 32)(rio), x1; \
	vmovdqu (2 * 32)(rio), x2; \
	vmovdqu (3 * 32)(rio), x3; \
	vmovdqu (4 * 32)(rio), x4; \
	vmovdqu (5 * 32)(rio), x5; \
	vmovdqu (6 * 32)(rio), x6; \
	vmovdqu (7 * 32)(rio), x7; \
	vmovdqu (8 * 32)(rio), y0; \
	vmovdqu (9 * 32)(rio), y1; \
	vmovdqu (10 * 32)(rio), y2; \
	vmovdqu (11 * 32)(rio), y3; \
	vmovdqu (12 * 32)(rio), y4; \
	vmovdqu (13 * 32)(rio), y5; \
	vmovdqu (14 * 32)(rio), y6; \
	vmovdqu (15 * 32)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 32(mem); \
	vmovdqu x1, 1 * 32(mem); \
	vmovdqu x2, 2 * 32(mem); \
	vmovdqu x3, 3 * 32(mem); \
	vmovdqu x4, 4 * 32(mem); \
	vmovdqu x5, 5 * 32(mem); \
	vmovdqu x6, 6 * 32(mem); \
	vmovdqu x7, 7 * 32(mem); \
	vmovdqu y0, 8 * 32(mem); \
	vmovdqu y1, 9 * 32(mem); \
	vmovdqu y2, 10 * 32(mem); \
	vmovdqu y3, 11 * 32(mem); \
	vmovdqu y4, 12 * 32(mem); \
	vmovdqu y5, 13 * 32(mem); \
	vmovdqu y6, 14 * 32(mem); \
	vmovdqu y7, 15 * 32(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, rk, idx, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
	vpxor t0, x0, x0; \
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
	vpxor t0, x1, x1; \
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
	vpxor t0, x2, x2; \
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
	vpxor t0, x3, x3; \
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
	vpxor t0, x4, x4; \
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
	vpxor t0, x5, x5; \
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
	vpxor t0, x6, x6; \
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
	vpxor t0, x7, x7;

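/*
 * Illustrative note: in the byte-sliced layout each register holds one byte
 * position of all 32 blocks, so AddRoundKey degenerates to broadcasting a
 * single round-key byte and XORing it into one register, roughly
 *
 *	x_i ^= broadcast(rk[round * 16 + idx + i]);
 *
 * with the bytes taken in reversed order within each 32-bit word of the
 * round key (3,2,1,0 and 7,6,5,4 above).
 */
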
#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vpxor t7, t7, t7; \
	vpxor t6, t6, t6; \
	vbroadcasti128 .Linv_shift_row(%rip), t0; \
	vbroadcasti128 .Lshift_row(%rip), t1; \
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
	\
	vextracti128 $1, x0, t6##_x; \
	vaesenclast t7##_x, x0##_x, x0##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x0, x0; \
	\
	vextracti128 $1, x4, t6##_x; \
	vaesenclast t7##_x, x4##_x, x4##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x4, x4; \
	\
	vextracti128 $1, x1, t6##_x; \
	vaesenclast t7##_x, x1##_x, x1##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x1, x1; \
	\
	vextracti128 $1, x5, t6##_x; \
	vaesenclast t7##_x, x5##_x, x5##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x5, x5; \
	\
	vextracti128 $1, x2, t6##_x; \
	vaesdeclast t7##_x, x2##_x, x2##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	\
	vextracti128 $1, x6, t6##_x; \
	vaesdeclast t7##_x, x6##_x, x6##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x6, x6; \
	\
	vpbroadcastd .L0f0f0f0f(%rip), t6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	\
	vpxor t6, t6, t6; \
	vextracti128 $1, x3, t6##_x; \
	vaesdeclast t7##_x, x3##_x, x3##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x3, x3; \
	\
	vextracti128 $1, x7, t6##_x; \
	vaesdeclast t7##_x, x7##_x, x7##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x7, x7; \

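/*
 * Illustrative note on the AES-NI trick used by aria_sbox_8way(): with an
 * all-zero round key, vaesenclast performs ShiftRows+SubBytes and
 * vaesdeclast performs InvShiftRows+InvSubBytes on each 128-bit lane.  The
 * vpshufb with .Linv_shift_row/.Lshift_row compensates for the unwanted row
 * permutation, leaving only the byte-wise AES S-box or its inverse, and the
 * filter_8bit() steps apply the extra affine transformations that turn those
 * into ARIA's S2/X2 S-boxes (see the table comments further below).
 */
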
#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;

#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

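/*
 * Illustrative note: aria_fe() above and aria_fo() below implement one even
 * resp. odd ARIA round for all 32 blocks.  With only 16 vector registers,
 * the 16 byte positions are processed as two groups of 8, parking the
 * inactive half in mem_tmp via aria_store_state_8way()/aria_load_state_8way().
 * The two macros differ in the register order handed to the S-box layer
 * (ARIA alternates its S-box pattern between odd and even rounds) and in the
 * byte permutation of the diffusion layer documented by the aria_diff_byte()
 * comments.
 */
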
#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
#endif /* CONFIG_AS_GFNI */

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

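/*
 * Illustrative note: .Ltf_lo__* and .Ltf_hi__* are the nibble tables consumed
 * by filter_8bit().  vpshufb can only look up 4-bit indices, so the 8-bit
 * affine maps shown in the comments above are split by linearity into their
 * action on the low and high nibble; filter_8bit() then computes roughly
 *
 *	out = lo_table[x & 0x0f] ^ hi_table[x >> 4]
 *
 * with two vpshufb lookups and a vpxor.
 */
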
#ifdef CONFIG_AS_GFNI
.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ymm0..%ymm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

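/*
 * Illustrative note: the ARIA_CTX_rounds comparisons above dispatch between
 * the 12-, 14- and 16-round variants used for 128-, 192- and 256-bit keys.
 * The final aria_ff() round applies the S-box layer between two AddRoundKey
 * steps and skips the diffusion layer, matching ARIA's last round.
 */
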
SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

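/*
 * Illustrative note: encryption and decryption share
 * __aria_aesni_avx2_crypt_32way() and differ only in which expanded key is
 * passed in %r9 (ARIA_CTX_enc_key vs ARIA_CTX_dec_key); ARIA's decryption
 * rounds have the same shape as encryption, so only the key schedule changes.
 */
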
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

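/*
 * Illustrative note on the counter handling above: the IV is byte-swapped so
 * the 128-bit counter can be incremented with little-endian vector
 * arithmetic.  inc_le128() adds 1 to the low qword and uses vpcmpeqq against
 * -1 to propagate a carry into the high qword.  The fast path instead
 * subtracts the -2:0 constant (%ymm5), which only touches the low qword, so
 * it is taken only while the low 64 bits of the counter (%r11) cannot wrap
 * within the next 32 increments; otherwise .Lhandle_ctr_carry steps the
 * counter with full carry handling.
 */
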
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

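/*
 * Illustrative note: CTR mode is assembled from the two helpers above.  The
 * 32 counter blocks are materialized in the keystream buffer (%rcx),
 * encrypted there with the encryption key schedule (CTR uses the forward
 * transform for both directions), and XORed with the source read via %r11,
 * roughly
 *
 *	dst[i] = src[i] ^ E_k(ctr + i)		for i = 0..31
 */
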
#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
#endif /* CONFIG_AS_GFNI */