Path: arch/x86/crypto/aria-gfni-avx512-asm_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 64-way parallel algorithm (AVX512)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
        ( (((a0) & 1) << 0) | \
          (((a1) & 1) << 1) | \
          (((a2) & 1) << 2) | \
          (((a3) & 1) << 3) | \
          (((a4) & 1) << 4) | \
          (((a5) & 1) << 5) | \
          (((a6) & 1) << 6) | \
          (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
        ( ((l7) << (0 * 8)) | \
          ((l6) << (1 * 8)) | \
          ((l5) << (2 * 8)) | \
          ((l4) << (3 * 8)) | \
          ((l3) << (4 * 8)) | \
          ((l2) << (5 * 8)) | \
          ((l1) << (6 * 8)) | \
          ((l0) << (7 * 8)) )

#define add_le128(out, in, lo_counter, hi_counter1) \
        vpaddq lo_counter, in, out; \
        vpcmpuq $1, lo_counter, out, %k1; \
        kaddb %k1, %k1, %k1; \
        vpaddq hi_counter1, out, out{%k1};

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpandq x, mask4bit, tmp0; \
        vpandnq x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxorq tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
                         a1, b1, c1, d1, \
                         a2, b2, c2, d2, \
                         a3, b3, c3, d3, \
                         st0, st1) \
        vmovdqu64 d2, st0; \
        vmovdqu64 d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu64 st0, d2; \
        vmovdqu64 st1, d3; \
        \
        vmovdqu64 a0, st0; \
        vmovdqu64 a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
        vmovdqu64 st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu64 d3, st1; \
        vmovdqu64 st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu64 d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu64 st0, d2; \
        vmovdqu64 st1, d3; \
        \
        vmovdqu64 b0, st0; \
        vmovdqu64 b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu64 st0, b0; \
        vmovdqu64 st1, b1; \
        /* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
                           a1, b1, c1, d1, \
                           a2, b2, c2, d2, \
                           a3, b3, c3, d3, \
                           st0, st1) \
        vmovdqu64 d2, st0; \
        vmovdqu64 d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu64 st0, d2; \
        vmovdqu64 st1, d3; \
        \
        vmovdqu64 a0, st0; \
        vmovdqu64 a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
        vmovdqu64 st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu64 d3, st1; \
        vmovdqu64 st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu64 d2, st0; \
        \
        transpose_4x4(c0, d0, a0, b0, d2, d3); \
        transpose_4x4(c1, d1, a1, b1, d2, d3); \
        vmovdqu64 st0, d2; \
        vmovdqu64 st1, d3; \
        \
        vmovdqu64 b0, st0; \
        vmovdqu64 b1, st1; \
        transpose_4x4(c2, d2, a2, b2, b0, b1); \
        transpose_4x4(c3, d3, a3, b3, b0, b1); \
        vmovdqu64 st0, b0; \
        vmovdqu64 st1, b1; \
        /* does not adjust output bytes inside vectors */
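
/*
 * Data layout: each ZMM register holds four 16-byte blocks (one per
 * 128-bit lane), so sixteen registers carry 64 blocks.  byteslice_16x16b()
 * transposes them so that every register ends up holding a single byte
 * position of all 64 blocks ("byte-sliced" form); debyteslice_16x16b()
 * reverses the transform.  In this form one round-key byte can be applied
 * to all 64 blocks with a single vpbroadcastb/vpxorq pair (see
 * aria_ark_16way() below), and each GFNI instruction substitutes the same
 * byte position of all 64 blocks at once.
 */
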
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     rio) \
        vmovdqu64 (0 * 64)(rio), x0; \
        vmovdqu64 (1 * 64)(rio), x1; \
        vmovdqu64 (2 * 64)(rio), x2; \
        vmovdqu64 (3 * 64)(rio), x3; \
        vmovdqu64 (4 * 64)(rio), x4; \
        vmovdqu64 (5 * 64)(rio), x5; \
        vmovdqu64 (6 * 64)(rio), x6; \
        vmovdqu64 (7 * 64)(rio), x7; \
        vmovdqu64 (8 * 64)(rio), y0; \
        vmovdqu64 (9 * 64)(rio), y1; \
        vmovdqu64 (10 * 64)(rio), y2; \
        vmovdqu64 (11 * 64)(rio), y3; \
        vmovdqu64 (12 * 64)(rio), y4; \
        vmovdqu64 (13 * 64)(rio), y5; \
        vmovdqu64 (14 * 64)(rio), y6; \
        vmovdqu64 (15 * 64)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      y0, y1, y2, y3, \
                      y4, y5, y6, y7, \
                      mem_ab, mem_cd) \
        byteslice_16x16b(x0, x1, x2, x3, \
                         x4, x5, x6, x7, \
                         y0, y1, y2, y3, \
                         y4, y5, y6, y7, \
                         (mem_ab), (mem_cd)); \
        \
        vmovdqu64 x0, 0 * 64(mem_ab); \
        vmovdqu64 x1, 1 * 64(mem_ab); \
        vmovdqu64 x2, 2 * 64(mem_ab); \
        vmovdqu64 x3, 3 * 64(mem_ab); \
        vmovdqu64 x4, 4 * 64(mem_ab); \
        vmovdqu64 x5, 5 * 64(mem_ab); \
        vmovdqu64 x6, 6 * 64(mem_ab); \
        vmovdqu64 x7, 7 * 64(mem_ab); \
        vmovdqu64 y0, 0 * 64(mem_cd); \
        vmovdqu64 y1, 1 * 64(mem_cd); \
        vmovdqu64 y2, 2 * 64(mem_cd); \
        vmovdqu64 y3, 3 * 64(mem_cd); \
        vmovdqu64 y4, 4 * 64(mem_cd); \
        vmovdqu64 y5, 5 * 64(mem_cd); \
        vmovdqu64 y6, 6 * 64(mem_cd); \
        vmovdqu64 y7, 7 * 64(mem_cd);

#define write_output(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem) \
        vmovdqu64 x0, 0 * 64(mem); \
        vmovdqu64 x1, 1 * 64(mem); \
        vmovdqu64 x2, 2 * 64(mem); \
        vmovdqu64 x3, 3 * 64(mem); \
        vmovdqu64 x4, 4 * 64(mem); \
        vmovdqu64 x5, 5 * 64(mem); \
        vmovdqu64 x6, 6 * 64(mem); \
        vmovdqu64 x7, 7 * 64(mem); \
        vmovdqu64 y0, 8 * 64(mem); \
        vmovdqu64 y1, 9 * 64(mem); \
        vmovdqu64 y2, 10 * 64(mem); \
        vmovdqu64 y3, 11 * 64(mem); \
        vmovdqu64 y4, 12 * 64(mem); \
        vmovdqu64 y5, 13 * 64(mem); \
        vmovdqu64 y6, 14 * 64(mem); \
        vmovdqu64 y7, 15 * 64(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, idx) \
        vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
        vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
        vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
        vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
        vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
        vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
        vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
        vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, idx) \
        vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
        vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
        vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
        vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
        vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
        vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
        vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
        vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
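
/*
 * AddRoundKey for the byte-sliced state: every byte of the 16-byte round
 * key at rk + round * 16 is broadcast to a full register with
 * vpbroadcastb and XORed into the register that holds that byte position.
 * The key-byte offsets are taken in reverse order within each 32-bit word
 * (3, 2, 1, 0, 7, 6, 5, 4, ...) so that each key byte lands on the
 * register carrying the matching byte of the sliced state.
 */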
#define aria_ark_16way(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7, \
                       t0, rk, round) \
        /* AddRoundKey */ \
        vpbroadcastb ((round * 16) + 3)(rk), t0; \
        vpxorq t0, x0, x0; \
        vpbroadcastb ((round * 16) + 2)(rk), t0; \
        vpxorq t0, x1, x1; \
        vpbroadcastb ((round * 16) + 1)(rk), t0; \
        vpxorq t0, x2, x2; \
        vpbroadcastb ((round * 16) + 0)(rk), t0; \
        vpxorq t0, x3, x3; \
        vpbroadcastb ((round * 16) + 7)(rk), t0; \
        vpxorq t0, x4, x4; \
        vpbroadcastb ((round * 16) + 6)(rk), t0; \
        vpxorq t0, x5, x5; \
        vpbroadcastb ((round * 16) + 5)(rk), t0; \
        vpxorq t0, x6, x6; \
        vpbroadcastb ((round * 16) + 4)(rk), t0; \
        vpxorq t0, x7, x7; \
        vpbroadcastb ((round * 16) + 11)(rk), t0; \
        vpxorq t0, y0, y0; \
        vpbroadcastb ((round * 16) + 10)(rk), t0; \
        vpxorq t0, y1, y1; \
        vpbroadcastb ((round * 16) + 9)(rk), t0; \
        vpxorq t0, y2, y2; \
        vpbroadcastb ((round * 16) + 8)(rk), t0; \
        vpxorq t0, y3, y3; \
        vpbroadcastb ((round * 16) + 15)(rk), t0; \
        vpxorq t0, y4, y4; \
        vpbroadcastb ((round * 16) + 14)(rk), t0; \
        vpxorq t0, y5, y5; \
        vpbroadcastb ((round * 16) + 13)(rk), t0; \
        vpxorq t0, y6, y6; \
        vpbroadcastb ((round * 16) + 12)(rk), t0; \
        vpxorq t0, y7, y7;
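
/*
 * S-box layer using GFNI: ARIA uses the AES S-box S1, a second S-box S2
 * and their inverses.  Each of them can be expressed as a GF(2^8)
 * inversion combined with an 8x8 affine transform over GF(2), which maps
 * directly onto vgf2p8affineinvqb/vgf2p8affineqb:
 *  - S1:    vgf2p8affineinvqb with the AES affine matrix
 *           (.Ltf_aff_bitmatrix) and constant tf_aff_const,
 *  - S2:    vgf2p8affineinvqb with .Ltf_s2_bitmatrix / tf_s2_const,
 *  - S1^-1: AES inverse affine (.Ltf_inv_bitmatrix / tf_inv_const)
 *           followed by a plain inversion (identity matrix, constant 0),
 *  - S2^-1: .Ltf_x2_bitmatrix / tf_x2_const followed by the same
 *           identity-matrix inversion.
 * The bit-matrices and constants are defined in .rodata below using
 * BM8X8()/BV8().
 */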
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            t0, t1, t2, t3, \
                            t4, t5, t6, t7) \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
        vgf2p8affineinvqb $0, t2, x2, x2; \
        vgf2p8affineinvqb $0, t2, x6, x6; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
        vgf2p8affineinvqb $0, t2, x3, x3; \
        vgf2p8affineinvqb $0, t2, x7, x7;

#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             t0, t1, t2, t3, \
                             t4, t5, t6, t7) \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
        vgf2p8affineinvqb $0, t2, x2, x2; \
        vgf2p8affineinvqb $0, t2, x6, x6; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
        vgf2p8affineinvqb $0, t2, x3, x3; \
        vgf2p8affineinvqb $0, t2, x7, x7; \
        vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
        vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
        vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
        vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
        vgf2p8affineinvqb $0, t2, y2, y2; \
        vgf2p8affineinvqb $0, t2, y6, y6; \
        vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
        vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
        vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
        vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
        vgf2p8affineinvqb $0, t2, y3, y3; \
        vgf2p8affineinvqb $0, t2, y7, y7;


#define aria_diff_m(x0, x1, x2, x3, \
                    t0, t1, t2, t3) \
        /* T = rotr32(X, 8); */ \
        /* X ^= T */ \
        vpxorq x0, x3, t0; \
        vpxorq x1, x0, t1; \
        vpxorq x2, x1, t2; \
        vpxorq x3, x2, t3; \
        /* X = T ^ rotr(X, 16); */ \
        vpxorq t2, x0, x0; \
        vpxorq x1, t3, t3; \
        vpxorq t0, x2, x2; \
        vpxorq t1, x3, x1; \
        vmovdqu64 t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7) \
        /* t1 ^= t2; */ \
        vpxorq y0, x4, x4; \
        vpxorq y1, x5, x5; \
        vpxorq y2, x6, x6; \
        vpxorq y3, x7, x7; \
        \
        /* t2 ^= t3; */ \
        vpxorq y4, y0, y0; \
        vpxorq y5, y1, y1; \
        vpxorq y6, y2, y2; \
        vpxorq y7, y3, y3; \
        \
        /* t0 ^= t1; */ \
        vpxorq x4, x0, x0; \
        vpxorq x5, x1, x1; \
        vpxorq x6, x2, x2; \
        vpxorq x7, x3, x3; \
        \
        /* t3 ^= t1; */ \
        vpxorq x4, y4, y4; \
        vpxorq x5, y5, y5; \
        vpxorq x6, y6, y6; \
        vpxorq x7, y7, y7; \
        \
        /* t2 ^= t0; */ \
        vpxorq x0, y0, y0; \
        vpxorq x1, y1, y1; \
        vpxorq x2, y2, y2; \
        vpxorq x3, y3, y3; \
        \
        /* t1 ^= t2; */ \
        vpxorq y0, x4, x4; \
        vpxorq y1, x5, x5; \
        vpxorq y2, x6, x6; \
        vpxorq y3, x7, x7;

#define aria_fe_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     z0, z1, z2, z3, \
                     z4, z5, z6, z7, \
                     mem_tmp, rk, round) \
        aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7, \
                       z0, rk, round); \
        \
        aria_sbox_16way_gfni(x2, x3, x0, x1, \
                             x6, x7, x4, x5, \
                             y2, y3, y0, y1, \
                             y6, y7, y4, y5, \
                             z0, z1, z2, z3, \
                             z4, z5, z6, z7); \
        \
        aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
        aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
        aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
        aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \


#define aria_fo_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     z0, z1, z2, z3, \
                     z4, z5, z6, z7, \
                     mem_tmp, rk, round) \
        aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7, \
                       z0, rk, round); \
        \
        aria_sbox_16way_gfni(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             z0, z1, z2, z3, \
                             z4, z5, z6, z7); \
        \
        aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
        aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
        aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
        aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4);
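
/*
 * Round functions: aria_fo_gfni()/aria_fe_gfni() above implement the odd
 * and even rounds, i.e. AddRoundKey, the S-box layer and the diffusion
 * layer (aria_diff_m() on each group of four registers followed by
 * aria_diff_word(); the byte permutation aria_diff_byte() is folded into
 * the argument order of the second aria_diff_word() call, as the inline
 * comments note).  aria_ff_gfni() below is the final round: AddRoundKey,
 * the S-box layer, then a second AddRoundKey with the last round key in
 * place of the diffusion layer.
 */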
#define aria_ff_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     z0, z1, z2, z3, \
                     z4, z5, z6, z7, \
                     mem_tmp, rk, round, last_round) \
        aria_ark_16way(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7, \
                       z0, rk, round); \
        aria_sbox_16way_gfni(x2, x3, x0, x1, \
                             x6, x7, x4, x5, \
                             y2, y3, y0, y1, \
                             y6, y7, y4, y5, \
                             z0, z1, z2, z3, \
                             z4, z5, z6, z7); \
        aria_ark_16way(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7, \
                       z0, rk, last_round);


.section .rodata.cst64, "aM", @progbits, 64
.align 64
.Lcounter0123_lo:
        .quad 0, 0
        .quad 1, 0
        .quad 2, 0
        .quad 3, 0

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16

.Lcounter4444_lo:
        .quad 4, 0
.Lcounter8888_lo:
        .quad 8, 0
.Lcounter16161616_lo:
        .quad 16, 0
.Lcounter1111_hi:
        .quad 0, 1

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))
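
/*
 * __aria_gfni_avx512_crypt_64way() byteslices and encrypts/decrypts the
 * 64 blocks that the caller has loaded into %zmm0..%zmm15.  The
 * destination buffer (%rsi) doubles as a 1024-byte scratch area for the
 * byte-sliced state.  The number of rounds comes from the context
 * (ARIA_CTX_rounds): 12, 14 or 16 for 128-, 192- and 256-bit keys
 * respectively, selected by the compares below.
 */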
.text
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
        /* input:
         * %r9: rk
         * %rsi: dst
         * %rdx: src
         * %zmm0..%zmm15: byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 64(%rax), %r8;

        inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
                      %zmm4, %zmm5, %zmm6, %zmm7,
                      %zmm8, %zmm9, %zmm10, %zmm11,
                      %zmm12, %zmm13, %zmm14,
                      %zmm15, %rax, %r8);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 0);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 1);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 2);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 3);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 4);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 5);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 6);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 7);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 8);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 9);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 10);
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_192;
        aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 11, 12);
        jmp .Laria_gfni_end;
.Laria_gfni_192:
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 11);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_256;
        aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 13, 14);
        jmp .Laria_gfni_end;
.Laria_gfni_256:
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 13);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 14);
        aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 15, 16);
.Laria_gfni_end:
        debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
                           %zmm8, %zmm13, %zmm2, %zmm7,
                           %zmm11, %zmm14, %zmm1, %zmm4,
                           %zmm10, %zmm15, %zmm0, %zmm5,
                           (%rax), (%r8));
        FRAME_END
        RET;
SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
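
/*
 * Encryption and decryption entry points.  They differ only in the key
 * schedule handed to the shared core (ARIA_CTX_enc_key vs.
 * ARIA_CTX_dec_key); ARIA decryption is the same round sequence run with
 * the precomputed decryption round keys.  Both load the 64 input blocks
 * with inpack16_pre() and store the de-byte-sliced result with
 * write_output().
 */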
SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
        /* input:
         * %rdi: ctx, CTX
         * %rsi: dst
         * %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rdx);

        call __aria_gfni_avx512_crypt_64way;

        write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
        /* input:
         * %rdi: ctx, CTX
         * %rsi: dst
         * %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rdx);

        call __aria_gfni_avx512_crypt_64way;

        write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
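
/*
 * CTR helper: byteswaps the 128-bit big-endian IV, builds 64 consecutive
 * counter values in %zmm0..%zmm15 (four 128-bit counters per register)
 * and swaps them back to big endian.  The fast path simply adds the
 * precomputed low-quadword offsets; if the low 64 bits of the counter
 * could wrap within this 64-block batch, the add_le128() path is used
 * instead, which detects the per-lane carry with vpcmpuq into %k1,
 * shifts the mask onto the high-quadword lanes with kaddb and adds one
 * there.
 */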
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
        /* input:
         * %rdi: ctx
         * %rsi: dst
         * %rdx: src
         * %rcx: keystream
         * %r8: iv (big endian, 128bit)
         */

        FRAME_BEGIN

        vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
        vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
        vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
        vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
        vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
        vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

        /* load IV and byteswap */
        movq 8(%r8), %r11;
        movq (%r8), %r10;
        bswapq %r11;
        bswapq %r10;
        vbroadcasti64x2 (%r8), %zmm20;
        vpshufb %zmm19, %zmm20, %zmm20;

        /* check need for handling 64-bit overflow and carry */
        cmpq $(0xffffffffffffffff - 64), %r11;
        ja .Lload_ctr_carry;

        /* construct IVs */
        vpaddq %zmm21, %zmm20, %zmm0;  /* +0:+1:+2:+3 */
        vpaddq %zmm22, %zmm0, %zmm1;   /* +4:+5:+6:+7 */
        vpaddq %zmm23, %zmm0, %zmm2;   /* +8:+9:+10:+11 */
        vpaddq %zmm23, %zmm1, %zmm3;   /* +12:+13:+14:+15 */
        vpaddq %zmm24, %zmm0, %zmm4;   /* +16... */
        vpaddq %zmm24, %zmm1, %zmm5;   /* +20... */
        vpaddq %zmm24, %zmm2, %zmm6;   /* +24... */
        vpaddq %zmm24, %zmm3, %zmm7;   /* +28... */
        vpaddq %zmm24, %zmm4, %zmm8;   /* +32... */
        vpaddq %zmm24, %zmm5, %zmm9;   /* +36... */
        vpaddq %zmm24, %zmm6, %zmm10;  /* +40... */
        vpaddq %zmm24, %zmm7, %zmm11;  /* +44... */
        vpaddq %zmm24, %zmm8, %zmm12;  /* +48... */
        vpaddq %zmm24, %zmm9, %zmm13;  /* +52... */
        vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
        vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
        jmp .Lload_ctr_done;

.Lload_ctr_carry:
        /* construct IVs */
        add_le128(%zmm0, %zmm20, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
        add_le128(%zmm1, %zmm0, %zmm22, %zmm25);   /* +4:+5:+6:+7 */
        add_le128(%zmm2, %zmm0, %zmm23, %zmm25);   /* +8:+9:+10:+11 */
        add_le128(%zmm3, %zmm1, %zmm23, %zmm25);   /* +12:+13:+14:+15 */
        add_le128(%zmm4, %zmm0, %zmm24, %zmm25);   /* +16... */
        add_le128(%zmm5, %zmm1, %zmm24, %zmm25);   /* +20... */
        add_le128(%zmm6, %zmm2, %zmm24, %zmm25);   /* +24... */
        add_le128(%zmm7, %zmm3, %zmm24, %zmm25);   /* +28... */
        add_le128(%zmm8, %zmm4, %zmm24, %zmm25);   /* +32... */
        add_le128(%zmm9, %zmm5, %zmm24, %zmm25);   /* +36... */
        add_le128(%zmm10, %zmm6, %zmm24, %zmm25);  /* +40... */
        add_le128(%zmm11, %zmm7, %zmm24, %zmm25);  /* +44... */
        add_le128(%zmm12, %zmm8, %zmm24, %zmm25);  /* +48... */
        add_le128(%zmm13, %zmm9, %zmm24, %zmm25);  /* +52... */
        add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
        add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

.Lload_ctr_done:
        /* Byte-swap IVs and update counter. */
        addq $64, %r11;
        adcq $0, %r10;
        vpshufb %zmm19, %zmm15, %zmm15;
        vpshufb %zmm19, %zmm14, %zmm14;
        vpshufb %zmm19, %zmm13, %zmm13;
        vpshufb %zmm19, %zmm12, %zmm12;
        vpshufb %zmm19, %zmm11, %zmm11;
        vpshufb %zmm19, %zmm10, %zmm10;
        vpshufb %zmm19, %zmm9, %zmm9;
        vpshufb %zmm19, %zmm8, %zmm8;
        bswapq %r11;
        bswapq %r10;
        vpshufb %zmm19, %zmm7, %zmm7;
        vpshufb %zmm19, %zmm6, %zmm6;
        vpshufb %zmm19, %zmm5, %zmm5;
        vpshufb %zmm19, %zmm4, %zmm4;
        vpshufb %zmm19, %zmm3, %zmm3;
        vpshufb %zmm19, %zmm2, %zmm2;
        vpshufb %zmm19, %zmm1, %zmm1;
        vpshufb %zmm19, %zmm0, %zmm0;
        movq %r11, 8(%r8);
        movq %r10, (%r8);

        FRAME_END
        RET;
SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
        /* input:
         * %rdi: ctx
         * %rsi: dst
         * %rdx: src
         * %rcx: keystream
         * %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_gfni_avx512_ctr_gen_keystream_64way

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_gfni_avx512_crypt_64way;

        vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
        vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
        vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
        vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
        vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
        vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
        vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
        vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
        vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
        vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
        vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
        vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
        vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
        vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
        vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
        vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
        write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
                     %zmm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)