/* Path: blob/master/arch/x86/crypto/camellia-x86_64-asm_64.S (26451 views) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <[email protected]>
 */

/*
 * Syntax: GNU as, AT&T operand order (src, dst), run through the C
 * preprocessor.  Statements are separated with ';' so that they can be
 * used inside multi-line #define macros.
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables defined outside this file.  They are only accessed through
 * xor2ror16(), which indexes them with a byte value scaled by 8.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the tables above. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
/* Byte offsets into the context pointed to by CTX. */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
/*
 * NOTE: RIO and RT0 (below) are both %rsi.  Any macro that uses RT0 as a
 * temporary destroys the I/O pointer, which is why the functions keep a copy
 * of the destination pointer in RDST and reload RIO before storing output.
 */
#define RIO %rsi
#define RIOd %esi

/*
 * Cipher state.  Suffix 0 is the first block, suffix 1 the second block of
 * the 2-way code.  All four are legacy registers because xor2ror16() needs
 * their low-byte (bl) and second-byte (bh) sub-registers.
 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/* Temporaries.  RT1 is %r12, so every function saves %r12 in RR12 first. */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR: "bool xor" argument in the encrypt functions; reused as plain
 *       scratch / save slot in the decrypt functions.
 * RR12: holds the caller's %r12 for the duration of a function.
 * RDST: copy of the destination pointer (see note at RIO).
 */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16 - consume the two low bytes of `ab` as table indexes.
 *
 *   dst ^= T0[ab.bl];
 *   dst ^= T1[ab.bh];
 *   ab   = ror64(ab, 16);
 *
 * T0/T1 are table symbols addressed RIP-relative; entries are 8 bytes wide.
 * `ab` must be one of the RAB/RCD macro names so that ab##bl / ab##bh paste
 * to a valid byte register.  tmp1 and tmp2 must be macro names that also have
 * a `d` (32-bit) variant.  Clobbers tmp1, tmp2 and the flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	leaq T0(%rip), tmp1; \
	movzbl ab ## bl, tmp2 ## d; \
	xorq (tmp1, tmp2, 8), dst; \
	leaq T1(%rip), tmp2; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq (tmp2, tmp1, 8), dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm - one round on block 0.
 *
 * Loads the 8-byte subkey number `subkey` from key_table into RT2, then runs
 * four xor2ror16() steps over ab##0, alternating the accumulation target
 * between cd##0 and RT2, and finally folds RT2 into cd##0.  The four 16-bit
 * rotations add up to 64, so ab##0 ends with its original value.
 * Clobbers RT0 (and therefore RIO), RT1, RT2 and the flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

/*
 * fls - key-dependent and/or/rotate layer on block 0, applied between round
 * groups (presumably Camellia's FL / FL^-1 functions -- confirm against the
 * cipher specification).
 *
 * For l with key kl:  l.hi ^= rol32(l.lo & kl.lo, 1);  l.lo ^= (l | kl).hi
 * For r with key kr:  r.lo ^= (r | kr).hi;  r.hi ^= rol32(r.lo & kr.lo, 1)
 * (.lo / .hi are the low / high 32 bits of the 64-bit register), executed in
 * the order l.hi, r.lo, l.lo, r.hi.
 * Clobbers RT0 (and therefore RIO), RT1, RT2 and the flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

/* Six rounds using subkeys i+2 .. i+7 in ascending order. */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* fls layer for encryption: key i for RAB, key i+1 for RCD. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * Load one 16-byte block from (RIO): each 8-byte half is byte-swapped and
 * has its 32-bit halves exchanged (rotate by 32), then key_table[0] is XORed
 * into RAB0.
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

/*
 * Store one block to (RIO), halves swapped (RCD0 first, then RAB0).
 * `max` is a register holding the index of the 8-byte key_table entry that
 * is XORed into RCD0 first.  `op` is `mov` (overwrite) or `xor` (XOR into
 * the existing contents of the destination).
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

/* Six rounds using subkeys i+7 .. i+2 in descending order. */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/* fls layer for decryption: key i+1 for RAB, key i for RCD. */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/* Same load as enc_inpack(), but XORs key_table[max] into RAB0. */
#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

/* Same store as enc_outunpack(mov, ...), but XORs key_table[0] into RCD0. */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

/*
 * __camellia_enc_blk - encrypt one 16-byte block.
 *
 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, flags.  %r12 is used as a temporary
 * but is saved in RR12 and restored before returning.  Uses no stack.
 */
SYM_TYPED_FUNC_START(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %r12, RR12;

	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/*
	 * key_length == 16 stops after three round groups; any other length
	 * runs one more fls + round group.  Compared as a 32-bit value, the
	 * same width camellia_dec_blk uses for this field.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	/* RIO was clobbered via RT0; movq leaves the flags from testb intact. */
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)

/*
 * camellia_dec_blk - decrypt one 16-byte block.
 *
 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, flags.  %r12 is used as a temporary
 * but is saved in RR12 and restored before returning.  Uses no stack.
 */
SYM_TYPED_FUNC_START(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; movl does not alter the flags. */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is overwritten by the rounds below, so test max right away. */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	/* RIO was clobbered via RT0; restore the destination pointer. */
	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2 - one round on two blocks (suffix 0 and suffix 1).
 *
 * Block 0 is processed exactly as in roundsm().  For block 1 the subkey is
 * XORed into cd##1 up front and all four xor2ror16() steps accumulate
 * directly into cd##1.  The extra-indented lines belong to block 1, except
 * for the `xorq RT2, cd ## 0` that is interleaved with them.
 * Clobbers RT0 (and therefore RIO), RT1, RT2 and the flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2, cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2 - the fls() layer applied to both blocks.  The four steps of fls()
 * are emitted alternately for block 0 and block 1 (extra-indented), with the
 * temporaries RT0/RT1/RT2 rotated between steps.
 * Clobbers RT0 (and therefore RIO), RT1, RT2 and the flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
		andl l ## 1d, RT2d; \
		roll $1, RT2d; \
		shlq $32, RT2; \
		xorq RT2, l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
		orq r ## 1, RT0; \
		shrq $32, RT0; \
		xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
		orq l ## 1, RT0; \
		shrq $32, RT0; \
		xorq RT0, l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
		andl r ## 1d, RT1d; \
		roll $1, RT1d; \
		shlq $32, RT1; \
		xorq RT1, r ## 1;

/* Six 2-way rounds using subkeys i+2 .. i+7 in ascending order. */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* 2-way fls layer for encryption: key i for RAB, key i+1 for RCD. */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * Load two consecutive 16-byte blocks from (RIO): block 0 from offsets 0/8,
 * block 1 from offsets 16/24.  Each 8-byte half is byte-swapped and rotated
 * by 32; key_table[0] is XORed into RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
		movq 8*2(RIO), RAB1; \
		bswapq RAB1; \
		rorq $32, RAB1; \
		movq 12*2(RIO), RCD1; \
		bswapq RCD1; \
		rolq $32, RCD1; \
		xorq key_table(CTX), RAB1;

/*
 * Store two blocks to (RIO) with `op` (`mov` or `xor`), halves swapped
 * (RCDn first, then RABn).  key_table[max] is XORed into RCD0 and RCD1
 * first; `max` is a register holding the 8-byte entry index.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
		xorq key_table(CTX, max, 8), RCD1; \
		rolq $32, RCD1; \
		bswapq RCD1; \
		op ## q RCD1, 8*2(RIO); \
		rorq $32, RAB1; \
		bswapq RAB1; \
		op ## q RAB1, 12*2(RIO);

/* Six 2-way rounds using subkeys i+7 .. i+2 in descending order. */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/* 2-way fls layer for decryption: key i+1 for RAB, key i for RCD. */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/* Same load as enc_inpack2(), but XORs key_table[max] into RAB0 and RAB1. */
#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
		movq 8*2(RIO), RAB1; \
		bswapq RAB1; \
		rorq $32, RAB1; \
		movq 12*2(RIO), RCD1; \
		bswapq RCD1; \
		rolq $32, RCD1; \
		xorq key_table(CTX, max, 8), RAB1;

/* Same store as enc_outunpack2(mov, ...), but XORs key_table[0] into RCDn. */
#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
		xorq key_table(CTX), RCD1; \
		rolq $32, RCD1; \
		bswapq RCD1; \
		movq RCD1, 8*2(RIO); \
		rorq $32, RAB1; \
		bswapq RAB1; \
		movq RAB1, 12*2(RIO);

/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbx (RAB1) is
 * preserved on the stack and %r12 (RT1) in RR12; RXOR is occupied by the xor
 * flag here, so it cannot double as the %rbx save slot.
 */
SYM_TYPED_FUNC_START(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %r12, RR12;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/*
	 * key_length == 16 stops after three round groups; any other length
	 * runs one more fls + round group.  Compared as a 32-bit value, the
	 * same width camellia_dec_blk_2way uses for this field.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	/* RIO was clobbered via RT0; movq leaves the flags from testb intact. */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)

/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  There is no xor flag
 * here, so RXOR is free and is used to hold the caller's %rbx; %r12 is held
 * in RR12.  Uses no stack.
 */
SYM_TYPED_FUNC_START(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; movl does not alter the flags. */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RXOR is done serving as scratch for the cmov; reuse it for %rbx. */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is overwritten by the rounds below, so test max right away. */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	/* RIO was clobbered via RT0; restore the destination pointer. */
	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)