/*
 *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <[email protected]>
 * Maxim Locktyukhin <[email protected]>
 * Ronen Zohar <[email protected]>
 * Chandramouli Narayanan <[email protected]>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * void sha1_transform_avx2(struct sha1_block_state *state,
 *			    const u8 *data, size_t nblocks);
 *
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * by cpp.  Arguments arrive in %rdi, %rsi, %rdx (see CTX/BUF/CNT below).
 */

#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define	BUF	%rsi	/* arg2 */
#define	CNT	%rdx	/* arg3 */

/*
 * Initial physical registers for the SHA-1 working variables.  The round
 * macros never name these directly; they use the assembler symbols
 * A, B, C, D, E, TA, TB, T1 (32-bit views) and RA..RE, RTA, RTB (64-bit
 * views of the same registers), which REGALLOC binds and ROTATE_STATE
 * permutes at assembly time.
 */
#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper

/* Selector values for RND_FUN / ROUND_FUNC */
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

/*
 * (Re)bind the working-variable symbols to their initial registers.
 * Must be invoked before any macro that uses A..E, TA, TB, T1, RA..RE,
 * RTA or RTB (ADD_IF_GE uses RTA).
 */
.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define	HASH_PTR	%r9	/* copy of CTX */
#define	BLOCKS_CTR	%r8	/* copy of CNT, decremented per block */
#define	BUFFER_PTR	%r10	/* input for the low 128-bit lane */
#define	BUFFER_PTR2	%r13	/* input for the high 128-bit lane */

#define	PRECALC_BUF	%r14	/* buffer PRECALC stores K+w[i] into */
#define	WK_BUF		%r15	/* buffer the rounds read K+w[i] from */

#define	W_TMP		%xmm0
#define	WY_TMP		%ymm0
#define	WY_TMP2		%ymm9

/* AVX2 variables */
#define	WY0		%ymm3
#define	WY4		%ymm5
#define	WY08		%ymm7
#define	WY12		%ymm8
#define	WY16		%ymm12
#define	WY20		%ymm13
#define	WY24		%ymm14
#define	WY28		%ymm15

#define	YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define	W_SIZE		(80*2*2 +16)

/*
 * WK(t): address in WK_BUF of the precomputed K+w value for round t,
 * where t is 0..79 for the first block of a pair and 80..159 for the
 * second.  Each 32-byte slot covers four rounds: the first block's
 * dwords occupy bytes 0..15 and the second block's bytes 16..31.
 *
 * PRECALC_WK(t): store address in PRECALC_BUF for PRECALC step t
 * (4 bytes per step).
 */
#define	WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define	PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)


/*
 * \val += \hash; \hash = \val
 * On exit both the register \val and the memory operand \hash hold the sum.
 */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/* Bind the WY_xx window symbols to their initial ymm registers. */
.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

/*
 * Shift the WY_xx window by one vector (assembly-time renaming only, no
 * instructions are emitted) and refresh the WY / WY_minus_xx aliases
 * used by the PRECALC_* macros.
 */
.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

/*
 * One step (selected by i & 7) of the vector iteration that loads 16
 * input bytes from each of BUFFER_PTR (low lane) and BUFFER_PTR2 (high
 * lane), byte-swaps them, adds K and stores the 32-byte result.
 */
.macro PRECALC_00_15
	.if (i == 0)	/* Initialize and rotate registers */
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we do equal:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * allows more efficient vectorization
	 * since w[i]=>w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

/*
 * Emit message-scheduling step \r (0..159) for the block pair being
 * precomputed.  Sets i = \r, picks the K_XMM_AR row (byte offset K_XMM)
 * for that step and dispatches to the matching PRECALC_* macro.
 * \s is forwarded unchanged; none of the PRECALC_* macros declares a
 * parameter, so it is expected to be empty.
 */
.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

/*
 * Rename the working variables for the next round (assembly-time only,
 * no instructions are emitted):
 *   E <- D, D <- C, C <- B, B <- TB, TB <- A, A <- old E
 * and likewise for the 64-bit views.
 */
.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

/*
 * Emit two consecutive rounds, \r and \r+1.
 *
 * Each ROUND_Fx macro computes F for the round that follows it, so
 * ROUND_FUNC is switched after the first round of the pair starting at
 * round_id 18, 38 and 58.
 */
.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1		/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm

/*
 * Conditionally advance a pointer without branching:
 *
 *	if (\b >= \c)		(signed comparison, cmovge)
 *		\a += \d;
 *
 * \a and \b are registers; \c and \d are immediates.
 * Clobbers RTA (used as the temporary) and the flags.
 */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 *
 * Expects HASH_PTR, BLOCKS_CTR, BUFFER_PTR, BUFFER_PTR2 and
 * YMM_SHUFB_BSWAP to be set up by the caller (see SHA1_VECTOR_ASM) and
 * two precalc buffers to be available at %rsp and
 * (2*4*80+32)(%rsp).  Blocks are consumed in pairs; the K+w values for
 * the next pair are computed while the rounds of the current pair run.
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	/* Precalc WK for first 2 blocks */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/*
	 * code loops through more than one block
	 * BLOCKS_CTR holds the number of blocks still to be hashed;
	 * leave the loop once it reaches zero.
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	/* Fold the working variables into the hash state (h1 is in TB here) */
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* No second block in this pair: .L_loop will branch to .L_end */
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/*
	 * Reset state for AVX2 reg permutation: move the values into the
	 * registers REGALLOC is about to bind A..E to again.
	 */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm

/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 *
 * Saves %rbx, %r12-%r15 and %rbp, aligns %rsp down to 32 bytes and
 * reserves RESERVE_STACK bytes for the two precalc buffers; %rbp keeps
 * the pre-alignment stack pointer so the epilogue can restore it.
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

	SYM_FUNC_END(\name)
.endm

.section .rodata

/* SHA-1 round constants */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* One 32-byte row per constant; PRECALC selects a row via K_XMM */
.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/* vpshufb control: reverse the bytes of each dword, in both lanes */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2