Path: blob/master/arch/x86/crypto/camellia-aesni-avx-asm_64.S
/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row(%rip), t4; \
	vbroadcastss .L0f0f0f0f(%rip), t7; \
	vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
	vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
	vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
	vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

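/*
 * Note on the S-function above: the Camellia s-boxes are computed with the
 * AES SubBytes hardware.  vaesenclast with an all-zero round key (t4)
 * performs ShiftRows + SubBytes + AddRoundKey(0), and the preceding vpshufb
 * with .Linv_shift_row applies the inverse ShiftRows permutation, so what
 * is left is, in effect, pure SubBytes.  The filter_8bit() pre/post lookups
 * translate bytes between the Camellia s-box domain and the AES S-box
 * domain and fold in the per-sbox rotations, roughly:
 *
 *	s1(x) = post_s1(SubBytes(pre_s1(x)))
 *	s2(x) = s1(x) <<< 1,  s3(x) = s1(x) >>> 1,  s4(x) = s1(x <<< 1)
 *
 * The vmovq/vpsrldq/vpshufb sequence at the end broadcasts the eight bytes
 * of the 64-bit round subkey, one byte slice per register.
 */
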
/*
 * Size optimization... with inlined roundsm16, binary would be over 5 times
 * larger and would only be 0.5% faster (on sandy-bridge).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

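/*
 * enc_rounds16/dec_rounds16 below each run six Feistel rounds as three
 * double-rounds (two_roundsm16).  Encryption walks the subkey table
 * forwards (dir = 1), decryption walks it backwards (dir = -1), which is
 * why the decryption variant starts at (i) + 7 and counts down.
 */
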
#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

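/*
 * rol32_1_16 rotates byte-sliced 32-bit words left by one bit without a
 * vector rotate instruction: vpcmpgtb against the zero register extracts
 * each byte's top bit as 0xff, vpaddb shifts every byte left by one, vpabsb
 * turns the 0xff masks into 0x01 carries, and the vpor's feed each carry
 * into the neighbouring byte slice so the four slices rotate as one word.
 */
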
/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

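/*
 * fls16 above is Camellia's FL/FL^-1 layer, applied to all 16 byte-sliced
 * blocks at once between the 6-round groups.  Each 32-bit subkey word
 * (kll/klr/krl/krr) is loaded with vmovd and broadcast byte-by-byte across
 * the lanes with vpshufb, then combined with the state using the AND/OR
 * and rotate-by-one steps shown in the pseudo-code comments in the macro.
 */
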
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

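/*
 * byteslice_16x16b transposes a 16x16 byte matrix: on input each register
 * holds one 16-byte block, on output each register holds, in effect, the
 * same byte position taken from all 16 blocks.  The transpose is built from
 * dword/qword unpacks (transpose_4x4) plus the .Lshufb_16x16b byte shuffle;
 * as the comment above notes, byte order inside the vectors is left as-is.
 */
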
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

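/*
 * Whitening in inpack16_pre/outunpack16: vmovq loads the 64-bit whitening
 * subkey, vpshufb with .Lpack_bswap swaps it into the blocks' byte order
 * (and zeroes the upper eight bytes), and the result is XORed into all 16
 * blocks before byteslicing on input and after de-byteslicing on output.
 */
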
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       camellia_f(
 *         swap_bitendianness(in)
 *       )
 *     )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       camellia_f(
 *         swap_bitendianness(in <<< 1)
 *       )
 *     )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *    camellia_h(
 *      isom_map_aes_to_camellia(
 *        swap_bitendianness(
 *          aes_inverse_affine_transform(in)
 *        )
 *      )
 *    )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *    camellia_h(
 *      isom_map_aes_to_camellia(
 *        swap_bitendianness(
 *          aes_inverse_affine_transform(in)
 *        )
 *      )
 *    )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *    camellia_h(
 *      isom_map_aes_to_camellia(
 *        swap_bitendianness(
 *          aes_inverse_affine_transform(in)
 *        )
 *      )
 *    )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)

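/*
 * Round count depends on key size: key_length == 16 (Camellia-128) runs 18
 * rounds and finishes with the whitening key at subkey index 24, while
 * larger keys (Camellia-192/256) take the .Lenc_max32 path for 24 rounds
 * and whitening at index 32.  %r8d carries that index so the final
 * outunpack16 can address (key_table)(CTX, %r8, 8); the decryption helpers
 * below use the same 24/32 convention to pick their starting subkeys.
 */
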
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)

SYM_TYPED_FUNC_START(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_16way)

SYM_TYPED_FUNC_START(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_16way)

SYM_TYPED_FUNC_START(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_16way)
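
/*
 * Note on camellia_cbc_dec_16way: all 16 blocks are decrypted in parallel,
 * then output blocks 1..15 are XORed with ciphertext blocks 0..14 read back
 * from src; block 0 (%xmm7) is written out un-XORed, leaving the IV XOR for
 * the first block to the caller.
 */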