/* Path: blob/master/arch/x86/crypto/twofish-avx-x86_64-asm_64.S */
/* 170899 views */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <[email protected]>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

/*
 * 16-byte constant listing the byte indices 15..0.
 * NOTE(review): nothing in this file references .Lbswap128_mask by name;
 * check whether a macro from glue_helper-asm-avx.S still needs it.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/*
 * structure of crypto context (byte offsets from CTX)
 *
 *   s0..s3: lookup tables spaced 1024 bytes apart. lookup_32bit indexes
 *           each one with a byte value scaled by 4, i.e. 256 entries of
 *           32 bits per table.
 *   w:      32 bytes read as two 16-byte vectors, w(CTX) and
 *           (w+4*4)(CTX); they are the wkey operands of inpack_blocks
 *           and outunpack_blocks.
 *   k:      32-bit words, two per round, read at k+4*(2*n) and
 *           k+4*(2*n+1).
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
#define CTX %rdi

/* the eight blocks: two groups (suffix 1 and 2) of four vectors each */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* per-group vectors that receive the G results in round_head_2 */
#define RX0 %xmm8
#define RY0 %xmm9

#define RX1 %xmm10
#define RY1 %xmm11

/* RK1/RK2: broadcast key words of the current round (also wkey holder) */
#define RK1 %xmm12
#define RK2 %xmm13

/* scratch: RT in the round tails, RR in rotate_1l */
#define RT %xmm14
#define RR %xmm15

/* table indices used by lookup_32bit */
#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

/*
 * G inputs. lookup_32bit pastes "bl"/"bh" onto these names, so each one
 * needs a low-byte and a second-byte alias (%dl/%dh, %cl/%ch, ...).
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* G outputs / accumulators; the "d" names are the 32-bit aliases */
#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d


/*
 * lookup_32bit - substitute the low 32 bits of src through four tables
 *
 *   t0..t3:        context offsets of the tables for byte 0..3 of src
 *   src:           name of a 64-bit register that has bl/bh aliases; it
 *                  is shifted right by 16 here, so the caller's value is
 *                  modified
 *   dst:           name whose 32-bit alias (dst ## d) receives
 *                  t0[byte0] ^ t1[byte1] ^ t2[byte2] ^ t3[byte3]
 *   interleave_op: macro expanded as interleave_op(il_reg) between the
 *                  lookups (shr_next to advance a register by another
 *                  16 bits, dummy to insert nothing)
 *
 * Clobbers RID1, RID2 and the flags.
 */
#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
	movzbl		src ## bl,        RID1d;     \
	movzbl		src ## bh,        RID2d;     \
	shrq $16,	src;                         \
	movl		t0(CTX, RID1, 4), dst ## d;  \
	movl		t1(CTX, RID2, 4), RID2d;     \
	movzbl		src ## bl,        RID1d;     \
	xorl		RID2d,            dst ## d;  \
	movzbl		src ## bh,        RID2d;     \
	interleave_op(il_reg);                       \
	xorl		t2(CTX, RID1, 4), dst ## d;  \
	xorl		t3(CTX, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

/* shr_next - drop the low 16 bits of reg */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * G - run lookup_32bit over both 32-bit halves of gi1 and of gi2
 *
 * On exit RGS2 holds the 64-bit result for gi1 and RGS3 the one for gi2,
 * each with the low-word result in bits 0-31 and the high-word result in
 * bits 32-63. gi1 and gi2 are shifted away in the process and RGS1 is
 * used as scratch. The x argument does not appear in the body.
 */
#define G(gi1, gi2, x, t0, t1, t2, t3) \
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
	\
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
	shlq $32,	RGS2;                                        \
	orq		RGS1, RGS2;                                  \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
	shlq $32,	RGS1;                                        \
	orq		RGS1, RGS3;

/*
 * round_head_2 - apply G to a and b of both block groups
 *
 *   x1 = G(a ## 1) and x2 = G(a ## 2) with tables s0, s1, s2, s3
 *   y1 = G(b ## 1) and y2 = G(b ## 2) with tables s1, s2, s3, s0
 *
 * RGI1/RGI2 must already hold the two quadwords of a ## 1 on entry
 * (see preload_rgi); the remaining operands are moved into RGI1..RGI4
 * between the G expansions.
 */
#define round_head_2(a, b, x1, y1, x2, y2) \
	vmovq		b ## 1, RGI3;           \
	vpextrq $1,	b ## 1, RGI4;           \
	\
	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
	vmovq		a ## 2, RGI1;           \
	vpextrq $1,	a ## 2, RGI2;           \
	vmovq		RGS2, x1;               \
	vpinsrq $1,	RGS3, x1, x1;           \
	\
	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
	vmovq		b ## 2, RGI3;           \
	vpextrq $1,	b ## 2, RGI4;           \
	vmovq		RGS2, y1;               \
	vpinsrq $1,	RGS3, y1, y1;           \
	\
	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
	vmovq		RGS2, x2;               \
	vpinsrq $1,	RGS3, x2, x2;           \
	\
	G(RGI3, RGI4, y2, s1, s2, s3, s0);      \
	vmovq		RGS2, y2;               \
	vpinsrq $1,	RGS3, y2, y2;

/*
 * encround_tail - combine the G results with the round keys (encrypt)
 *
 * Per 32-bit lane:
 *   x  = x + y
 *   RT = x + RK1
 *   prerotate(b)
 *   c  = ror32(c ^ RT, 1)
 *   y  = x + y + RK2        (x is the updated sum)
 *   d  = d ^ y
 *
 * d is not rotated here: it is expected to have been rotated left by one
 * beforehand, either by the function prologue or by prerotate(b) of the
 * preceding round (b becomes d once the cycle macros swap the roles).
 * The a argument does not appear in the body.
 */
#define encround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd			x, y,   x; \
	vpaddd			x, RK1, RT;\
	prerotate(b);			   \
	vpxor			RT, c,  c; \
	vpaddd			y, x,   y; \
	vpaddd			y, RK2, y; \
	vpsrld $1,		c, RT;     \
	vpslld $(32 - 1),	c, c;      \
	vpor			c, RT,  c; \
	vpxor			d, y,   d;

/*
 * decround_tail - combine the G results with the round keys (decrypt)
 *
 * Per 32-bit lane:
 *   x  = x + y
 *   RT = x + RK1
 *   prerotate(a)
 *   c  = c ^ RT
 *   y  = x + y + RK2        (x is the updated sum)
 *   d  = ror32(d ^ y, 1)    (y is reused as scratch for the rotate)
 *
 * c is not rotated here: it is expected to have been rotated left by one
 * beforehand, either by the function prologue or by prerotate(a) of the
 * preceding round (a becomes c once the cycle macros swap the roles).
 * The b argument does not appear in the body.
 */
#define decround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd			x, y,   x; \
	vpaddd			x, RK1, RT;\
	prerotate(a);			   \
	vpxor			RT, c,  c; \
	vpaddd			y, x,   y; \
	vpaddd			y, RK2, y; \
	vpxor			d, y,   d; \
	vpsrld $1,		d, y;      \
	vpslld $(32 - 1),	d, d;      \
	vpor			d, y,   d;

/* rotate_1l - rotate every 32-bit lane of x left by one; clobbers RR */
#define rotate_1l(x) \
	vpslld $1,		x, RR;     \
	vpsrld $(32 - 1),	x, x;      \
	vpor			x, RR,  x;

/* preload_rgi - copy the low/high quadword of c into RGI1/RGI2 */
#define preload_rgi(c) \
	vmovq			c, RGI1; \
	vpextrq $1,		c, RGI2;

/*
 * encrypt_round - one encryption round over both block groups
 *
 *   n:         round number; selects the key words at k+4*(2*n) and
 *              k+4*(2*n+1), each broadcast to all lanes of RK1 / RK2
 *   a..d:      register name prefixes (RA..RD); 1 and 2 are appended
 *   preload:   expanded as preload(c ## 1) between the two tails so that
 *              RGI1/RGI2 are ready for the next round_head_2
 *   prerotate: forwarded to encround_tail
 */
#define encrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
	round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1);                                         \
	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/* decrypt_round - same structure as encrypt_round, using decround_tail */
#define decrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
	round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
	preload(c ## 1);                                         \
	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/*
 * encrypt_cycle - rounds 2*n and 2*n+1; the second round runs with the
 * (A, B) and (C, D) roles exchanged
 */
#define encrypt_cycle(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

/*
 * encrypt_cycle_last - like encrypt_cycle, but the second round passes
 * dummy for both preload and prerotate since no round follows it
 */
#define encrypt_cycle_last(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);

/* decrypt_cycle - round 2*n+1 followed by round 2*n */
#define decrypt_cycle(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);

/*
 * decrypt_cycle_last - like decrypt_cycle, but the second round passes
 * dummy for both preload and prerotate since no round follows it
 */
#define decrypt_cycle_last(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);

/*
 * transpose_4x4 - transpose the 4x4 matrix of 32-bit elements held in
 * x0..x3 (one row per register); t0, t1 and t2 are scratch
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* inpack_blocks - XOR wkey into x0..x3, then transpose them */
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor		x0, wkey, x0; \
	vpxor		x1, wkey, x1; \
	vpxor		x2, wkey, x2; \
	vpxor		x3, wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* outunpack_blocks - transpose x0..x3 back, then XOR wkey into them */
#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor		x0, wkey, x0; \
	vpxor		x1, wkey, x1; \
	vpxor		x2, wkey, x2; \
	vpxor		x3, wkey, x3;

SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * preserved:
	 *	%r13, %rbx, %rcx (pushed and popped below)
	 * clobbered:
	 *	%rax, %rdx, %rsi, %r8-%r10, %xmm8-%xmm15, flags
	 */

	vmovdqu w(CTX), RK1;

	pushq %r13;
	pushq %rbx;
	pushq %rcx;

	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	/* the first round_head_2 expects RA1 in RGI1/RGI2 */
	preload_rgi(RA1);
	/* encround_tail expects its d operand already rotated left */
	rotate_1l(RD1);
	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
	rotate_1l(RD2);

	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle_last(7);

	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;
	popq %r13;

	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_enc_blk8)

SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 * preserved:
	 *	%r13, %rbx (pushed and popped below)
	 * clobbered:
	 *	%rax, %rcx, %rdx, %rsi, %r8-%r10, %xmm8-%xmm15, flags
	 *
	 * NOTE(review): unlike __twofish_enc_blk8 this routine does not
	 * save %rcx although RGI2 is %rcx; none of the callers in this
	 * file keeps a live value there.
	 */

	vmovdqu (w+4*4)(CTX), RK1;

	pushq %r13;
	pushq %rbx;

	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	/* the first round_head_2 expects RC1 in RGI1/RGI2 */
	preload_rgi(RC1);
	/* decround_tail expects its c operand already rotated left */
	rotate_1l(RA1);
	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
	rotate_1l(RA2);

	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle_last(0);

	vmovdqu (w)(CTX), RK1;

	popq %rbx;
	popq %r13;

	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

	RET;
SYM_FUNC_END(__twofish_dec_blk8)

SYM_FUNC_START(twofish_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * load_8way and store_8way come from glue_helper-asm-avx.S;
	 * presumably they move eight 16-byte blocks between memory and
	 * the listed registers (verify against that file).
	 */
	FRAME_BEGIN

	/* %rsi is RID2 and gets overwritten by the cipher core */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __twofish_enc_blk8;

	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_enc_8way)

SYM_FUNC_START(twofish_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* %rsi is RID2 and gets overwritten by the cipher core */
	movq %rsi, %r11;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(twofish_ecb_dec_8way)

SYM_FUNC_START(twofish_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * store_cbc_8way comes from glue_helper-asm-avx.S; presumably it
	 * combines the decrypted blocks with the source blocks before
	 * storing them (verify against that file).
	 */
	FRAME_BEGIN

	/* %r12 is callee-saved and is used to keep src across the call */
	pushq %r12;

	/* %rsi (RID2) and %rdx (RGI1) are overwritten by the cipher core */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	FRAME_END
	RET;
SYM_FUNC_END(twofish_cbc_dec_8way)