Path: blob/main/crypto/openssl/engines/asm/e_padlock-x86_64.pl
34876 views
#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 argument registers are selected for nasm/masm/mingw64 flavours or
# when the output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator, first next to this script, then
# under ../../crypto/perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order

# Fixed helper routines.  In padlock_capability each unpack("H*",...) literal
# is compared against a 32-bit CPUID register, so every literal must be
# exactly four characters long (including the padding spaces in the second
# vendor string).
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# generate_mode($mode, $opcode)
#
# Appends to $code the assembly for padlock_${mode}_encrypt.
#   $mode   - mode name used in the symbol and in local labels; it also
#             selects the mode-specific fragments below ("ctr32" counter
#             handling, %PADLOCK_PREFETCH work-arounds, IV refresh for
#             modes not matching /ecb|ctr/).
#   $opcode - last byte of the "rep xcrypt*" instruction encoding emitted
#             via .byte for this mode.
# Nothing is returned; the routine only grows the global $code string.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
					if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
					} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
					}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
# Replace every `...` span in the generated text with the result of
# evaluating it as a Perl expression.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe to the translator: a failed close means the output was
# not fully written or the translator exited with an error, so do not let
# it pass silently.
close STDOUT or die "error closing STDOUT: $!";