Path: blob/master/arch/x86/crypto/blowfish-x86_64-asm_64.S
26451 views
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
 */

/*
 * Syntax: AT&T (GAS), run through the C preprocessor. Inside the macros
 * below `;` is the statement separator, so a macro expanding to several
 * instructions still forms a single source line.
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context (byte offsets from the ctx pointer):
 *   p       16 + 2 round-key words of 4 bytes each
 *   s0..s3  four lookup tables of 256 4-byte entries each
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * CTX  - context pointer. %r12 is callee-saved, so every function below
 *        saves it on entry and restores it before returning.
 * RIO  - current source or destination pointer. It is the same register
 *        as RT1, so it is overwritten by F()/F4() and has to be reloaded
 *        before the result is written.
 */
#define CTX	%r12
#define RIO	%rsi

/* RX0..RX3: 64-bit cipher blocks (only RX0 is used by the 1-way code) */
#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

/* RT0..RT3: temporaries (table indices and the running F result) */
#define RT0	%rdi
#define RT1	%rsi
#define RT2	%r8
#define RT3	%r9

#define RT0d	%edi
#define RT1d	%esi
#define RT2d	%r8d
#define RT3d	%r9d

/* RKEY: preloaded pair of round keys for the 4-way code */
#define RKEY	%r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables, most
 * significant byte into s0 down to least significant byte into s3:
 *	RT0d = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * The halves of RX0 are then swapped (rolq $32) and RT0 is XORed in.
 * RT0d was written as a 32-bit register, so the upper half of RT0 is
 * zero and only the new low half of RX0 changes.
 *
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/* XOR the 8 bytes at p + 4*n (round keys n and n+1) into RX0. */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Key addition followed by two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load the 8 bytes at p + 4*(n-1) (round keys n-1 and n), swap the two
 * 32-bit keys with a 32-bit rotate and XOR the result into RX0.
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Key addition (reversed key order) followed by two F() steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load 8 bytes from (RIO) into RX0. After the rotate and byte swap the
 * first 32-bit word of the block, read as big-endian, sits in the low
 * half of RX0 and the second word in the high half.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-swap RX0 and store it to (RIO); the high half is written first. */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/*
 * blowfish_enc_blk: encrypt one 8-byte block.
 *
 * %r12 (CTX) is parked in %r11 instead of being pushed, and dst is
 * parked in %r10 because RIO (%rsi) is also RT1 and is overwritten by
 * F(). The stack is not used.
 *
 * Registers written: %rax, %rdi, %rsi, %r8, %r10, %r11, flags.
 */
SYM_FUNC_START(blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is no longer needed: restore %r12, then point RIO at dst */
	movq %r11, %r12;
	movq %r10, RIO;

	write_block();
	RET;
SYM_FUNC_END(blowfish_enc_blk)

/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * Same register usage as blowfish_enc_blk; the round keys are applied
 * from p[17] down to p[0].
 *
 * Registers written: %rax, %rdi, %rsi, %r8, %r10, %r11, flags.
 */
SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3: the byte registers are formed by pasting
 * "bh"/"bl" onto the macro name. All four table indices are extracted
 * first; the two 16-bit rotates together swap the halves of x.
 * Clobbers RT0, RT1, RT2, RT3 and the flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the key pair held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* Load the 8 bytes at p + 4*n (round keys n and n+1) into RKEY. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the preloaded key pair, then fetch the pair for the next round. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Key addition followed by two F4() steps on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load the 8 bytes at p + 4*(n-1) (round keys n-1 and n) into RKEY and
 * swap the two 32-bit keys.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the preloaded key pair, then fetch the pair for the next round. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Key addition followed by two F4() steps on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* read_block() for four consecutive 8-byte blocks at (RIO). */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* write_block() for four consecutive 8-byte blocks at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/*
 * XOR the byte-swapped blocks 0, 1 and 2 at (RIO) into RX1, RX2 and RX3
 * respectively. RX0 is left untouched.
 * Clobbers RT0, RT2, RT3 and the flags.
 */
#define xor_block4() \
	movq (RIO),		RT0; \
	bswapq			RT0; \
	xorq RT0,		RX1; \
	\
	movq 8(RIO),		RT2; \
	bswapq			RT2; \
	xorq RT2,		RX2; \
	\
	movq 16(RIO),		RT3; \
	bswapq			RT3; \
	xorq RT3,		RX3;

/*
 * blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * %r12 (CTX) and %rbx (RX1) are callee-saved and therefore pushed.
 * dst is parked in %r11 because RIO (%rsi) is also RT1.
 *
 * Registers written besides the saved ones: %rax, %rcx, %rdx, %rdi,
 * %rsi, %r8, %r9, %r10, %r11, flags.
 */
SYM_FUNC_START(blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	/* round_enc4(14) preloaded the last key pair (p[16], p[17]) */
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk_4way)

/*
 * __blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * If cbc is non-zero, source blocks 0..2 are XORed into output blocks
 * 1..3 before the result is stored. Output block 0 is not XORed here;
 * presumably the caller combines it with the IV or the previous
 * ciphertext block - that is not visible in this file.
 *
 * %rcx (cbc) and %rdx (src) are pushed because they double as RX2 and
 * RX3 and are overwritten by read_block4().
 *
 * Registers written besides the saved ones: %rax, %rcx, %rdx, %rdi,
 * %rsi, %r8, %r9, %r10, %r11, flags.
 */
SYM_FUNC_START(__blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: cbc (bool)
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;
	pushq %rdx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	/* round_dec4(3) preloaded the last key pair (p[1], p[0]) */
	add_preloaded_roundkey4();

	/*
	 * Pop src back into RIO and the cbc flag into %r12. CTX is dead
	 * from here on, so %r12 is free to use as scratch until the final
	 * popq restores the caller's value.
	 */
	popq RIO;
	popq %r12;
	testq %r12, %r12;
	jz .L_no_cbc_xor;

	xor_block4();

.L_no_cbc_xor:
	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)