# Path: blob/main/crypto/openssl/engines/asm/e_padlock-x86.pl
# 34876 views
#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. Compared to original engine
# version relying on inline assembler and compiled with gcc 3.4.6 it
# was measured to provide ~100% improvement on misaligned data in ECB
# mode and ~75% in CBC mode. For aligned data improvement can be
# observed for short inputs only, e.g. 45% for 64-byte messages in
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
# These are approximately same factors as for hardware support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance in mixture of aligned and misaligned
# buffers, because a) if you choose to flip 'align' flag in control
# word on per-buffer basis, then you'd have to reload key context,
# which incurs penalty; b) if you choose to set 'align' flag
# permanently, it limits performance even for aligned data to ~1/2.
# All above mentioned results were collected on 1.5GHz C7. Nano on the
# other hand handles unaligned data more gracefully. Depending on
# algorithm and how unaligned data is, hardware can be up to 70% more
# efficient than below software alignment procedures, nor does 'align'
# flag have effect on aligned performance [if has any meaning at all].
# Therefore suggestion is to unconditionally set 'align' flag on Nano
# for optimal performance.

# Locate x86asm.pl relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

# The last command-line argument, if any, names the output file. Use the
# three-argument open so the name is never interpreted as a mode, and
# fail loudly rather than silently emitting the assembly to the terminal.
$output=pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

# Register assignment shared by the routines below.
$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

# padlock_capability: returns 0 in %eax when the EFLAGS ID bit can't be
# toggled, when the CPUID vendor string is neither "CentaurHauls" nor
# "  Shanghai  ", or when extended leaf 0xC0000001 is not reported.
# Otherwise returns %edx of leaf 0xC0000001 with bit#4 replaced by the
# "Nano" indication derived from leaf 1.
&function_begin_B("padlock_capability");
	&push	("ebx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);			# try to flip EFLAGS bit 21
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("noluck"));
	&cpuid	();
	&xor	("eax","eax");
	# Vendor string is compared one 32-bit register at a time, hence
	# every literal below must be exactly four bytes long.
	&cmp	("ebx","0x".unpack("H*",'tneC'));
	&jne	(&label("zhaoxin"));
	&cmp	("edx","0x".unpack("H*",'Hrua'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'slua'));
	&jne	(&label("noluck"));
	&jmp	(&label("zhaoxinEnd"));
&set_label("zhaoxin");
	&cmp	("ebx","0x".unpack("H*",'hS  '));
	&jne	(&label("noluck"));
	&cmp	("edx","0x".unpack("H*",'hgna'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'  ia'));
	&jne	(&label("noluck"));
&set_label("zhaoxinEnd");
	&mov	("eax",0xC0000000);
	&cpuid	();
	&mov	("edx","eax");
	&xor	("eax","eax");
	&cmp	("edx",0xC0000001);
	&jb	(&label("noluck"));
	&mov	("eax",1);
	&cpuid	();
	&or	("eax",0x0f);
	&xor	("ebx","ebx");
	&and	("eax",0x0fff);
	&cmp	("eax",0x06ff);			# check for Nano
	&sete	("bl");
	&mov	("eax",0xC0000001);
	&push	("ebx");			# cpuid clobbers %ebx
	&cpuid	();
	&pop	("ebx");
	&mov	("eax","edx");
	&shl	("ebx",4);			# bit#4 denotes Nano
	&and	("eax",0xffffffef);
	&or	("eax","ebx");
&set_label("noluck");
	&pop	("ebx");
	&ret	();
&function_end_B("padlock_capability");

# padlock_key_bswap: byte-swaps, in place, 4*(n+1) 32-bit words starting
# at the pointer argument, where n is the 32-bit value stored at byte
# offset 240 from that pointer (presumably the round count of an AES key
# schedule -- confirm against the caller).
&function_begin_B("padlock_key_bswap");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&DWP(240,"edx"));
	&inc	("ecx");
	&shl	("ecx",2);
&set_label("bswap_loop");
	&mov	("eax",&DWP(0,"edx"));
	&bswap	("eax");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("edx",&DWP(4,"edx"));
	&sub	("ecx",1);
	&jnz	(&label("bswap_loop"));
	&ret	();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing. At first one
# believes that one should use atomic swap instructions,
# but it's not actually necessary. Point is that if
# padlock_saved_context was changed by another thread
# after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon thread context switch
# and we are therefore set in either case...
&static_label("padlock_saved_context");

# padlock_verify_context: public wrapper that loads the address of
# padlock_saved_context (PIC-relative except on win32/coff), pushes
# EFLAGS and defers to _padlock_verify_ctx.
&function_begin_B("padlock_verify_context");
	&mov	($ctx,&wparam(0));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		&DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
	&pushf	();
	&call	("_padlock_verify_ctx");
&set_label("verify_pic_point");
	&lea	("esp",&DWP(4,"esp"));		# drop saved EFLAGS
	&ret	();
&function_end_B("padlock_verify_context");

# _padlock_verify_ctx: expects $ctx in %edx, [offset of]
# padlock_saved_context in %eax and the caller's EFLAGS at 4(%esp).
# Executes pushf/popf when EFLAGS bit 30 is set and the saved context
# differs from $ctx, then records $ctx as the saved context.
&function_begin_B("_padlock_verify_ctx");
	&add	("eax",&DWP(0,"esp"))	if(!($::win32 or $::coff));# &padlock_saved_context
	&bt	(&DWP(4,"esp"),30);		# eflags
	&jnc	(&label("verified"));
	&cmp	($ctx,&DWP(0,"eax"));
	&je	(&label("verified"));
	&pushf	();
	&popf	();
&set_label("verified");
	&mov	(&DWP(0,"eax"),$ctx);
	&ret	();
&function_end_B("_padlock_verify_ctx");

# padlock_reload_key: issues a bare pushf/popf pair.
&function_begin_B("padlock_reload_key");
	&pushf	();
	&popf	();
	&ret	();
&function_end_B("padlock_reload_key");

# padlock_aes_block(out, inp, ctx): processes a single 16-byte block
# with "rep xcryptecb"; control word is taken at ctx+16, key at ctx+32.
&function_begin_B("padlock_aes_block");
	&push	("edi");
	&push	("esi");
	&push	("ebx");
	&mov	($out,&wparam(0));		# must be 16-byte aligned
	&mov	($inp,&wparam(1));		# must be 16-byte aligned
	&mov	($ctx,&wparam(2));
	&mov	($len,1);
	&lea	("ebx",&DWP(32,$ctx));		# key
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
	&pop	("ebx");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_aes_block");

# generate_mode($mode,$opcode): emits padlock_${mode}_encrypt, where
# $opcode is the last byte of the "rep xcrypt*" instruction to use.
# Perl-level conditionals select which instructions are emitted for a
# given mode; everything else is emitted verbatim.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($ctx,&wparam(2));
	&mov	($len,&wparam(3));
	&test	($ctx,15);			# ctx must be 16-byte aligned
	&jnz	(&label("${mode}_abort"));
	&test	($len,15);			# len must be multiple of 16
	&jnz	(&label("${mode}_abort"));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		&DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
	&pushf	();
	&cld	();
	&call	("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&xor	("eax","eax");
if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));		# load [upper part of] counter
} else {
	&xor	("ebx","ebx");
	&test	(&DWP(0,$ctx),1<<5);		# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
	&setz	("al");				# !out_misaligned
	&test	($inp,0x0f);
	&setz	("bl");				# !inp_misaligned
	&test	("eax","ebx");
	&jnz	(&label("${mode}_aligned"));
	&neg	("eax");
}
	&mov	($chunk,$PADLOCK_CHUNK);
	&not	("eax");			# out_misaligned?-1:0
	&lea	("ebp",&DWP(-24,"esp"));
	&cmp	($len,$chunk);
	&cmovc	($chunk,$len);			# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	&and	("eax",$chunk);			# out_misaligned?chunk:0
	&mov	($chunk,$len);
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");			# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");		# original frame pointer value
if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);			# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);			# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);			# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
}
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
	&mov	(&DWP(0,"ebp"),$out);		# save parameters
	&mov	(&DWP(4,"ebp"),$inp);
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
if ($mode eq "ctr32") {
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
&set_label("${mode}_prepare");
	&mov	(&DWP(12,"esp",$out),"ecx");
	&bswap	("ecx");
	&movq	(&QWP(0,"esp",$out),"mm0");
	&inc	("ecx");
	&mov	(&DWP(8,"esp",$out),"eax");
	&bswap	("ecx");
	&lea	($out,&DWP(16,$out));
	&cmp	($out,$chunk);
	&jb	(&label("${mode}_prepare"));

	&mov	(&DWP(-4,$ctx),"ecx");
	&lea	($inp,&DWP(0,"esp"));
	&lea	($out,&DWP(0,"esp"));
	&mov	($len,$chunk);
} else {
	&test	($out,0x0f);			# out_misaligned
	&cmovnz	($out,"esp");
	&test	($inp,0x0f);			# inp_misaligned
	&jz	(&label("${mode}_inp_aligned"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
	&mov	($len,$chunk);
	&mov	($inp,$out);
&set_label("${mode}_inp_aligned");
}
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
if ($mode !~ /ecb|ctr/) {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
}
	&mov	($out,&DWP(0,"ebp"));		# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
if ($mode eq "ctr32") {
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
	&movups	("xmm1",&QWP(0,$inp,$len));
	&lea	($len,&DWP(16,$len));
	&pxor	("xmm1",&QWP(-16,"esp",$len));
	&movups	(&QWP(-16,$out,$len),"xmm1");
	&cmp	($len,$chunk);
	&jb	(&label("${mode}_xor"));
} else {
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
	&mov	($inp,&DWP(4,"ebp"));
}
	&mov	($len,&DWP(8,"ebp"));
	&add	($out,$chunk);
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
} else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");			# alloca
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
}
if ($mode ne "ctr32") {
	&cmp	("esp","ebp");
	&je	(&label("${mode}_done"));
}
	# Wipe the on-stack bounce buffer between %esp and %ebp.
	&pxor	("xmm0","xmm0");
	&lea	("eax",&DWP(0,"esp"));
&set_label("${mode}_bzero");
	&movaps	(&QWP(0,"eax"),"xmm0");
	&lea	("eax",&DWP(16,"eax"));
	&cmp	("ebp","eax");
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
	if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);			# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);			# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
	}
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
	if ($mode ne "ecb") {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
	}
	if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
	}
&set_label("${mode}_exit");
}
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));		# popf
	&emms	()				if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
				# because hardware CTR was introduced later
				# and even has errata on certain C7 stepping.
				# own implementation *always* works, though
				# ~15% slower than dedicated hardware...

# padlock_xstore(out, edx_value): issues "xstore" with %edi=out and
# %edx taken from the second argument.
&function_begin_B("padlock_xstore");
	&push	("edi");
	&mov	("edi",&wparam(0));
	&mov	("edx",&wparam(1));
	&data_byte(0x0f,0xa7,0xc0);		# xstore
	&pop	("edi");
	&ret	();
&function_end_B("padlock_xstore");

# _win32_segv_handler: structured exception handler installed by the
# *_oneshot routines on win32/coff. On an access violation it advances
# the dword at offset 184 of the context record by 4 and resumes.
&function_begin_B("_win32_segv_handler");
	&mov	("eax",1);			# ExceptionContinueSearch
	&mov	("edx",&wparam(0));		# *ExceptionRecord
	&mov	("ecx",&wparam(2));		# *ContextRecord
	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
	&jne	(&label("ret"));
	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
	&mov	("eax",0);			# ExceptionContinueExecution
&set_label("ret");
	&ret	();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")			if ($::win32);

# padlock_sha1_oneshot(ctx, inp, len): copies the 20-byte state at ctx
# into an aligned stack buffer, runs "rep xsha1" with %eax=0, and copies
# the state back.
&function_begin_B("padlock_sha1_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
}
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);			# 32 is enough but spec says 128
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
}
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_oneshot");

# padlock_sha1_blocks(ctx, inp, len): same as the oneshot variant but
# runs "rep xsha1" with %eax=-1 and installs no exception handler.
&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("edx","esp");			# put aside %esp
	&mov	("ecx",&wparam(2));
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_blocks");

# padlock_sha256_oneshot(ctx, inp, len): copies the 32-byte state at ctx
# into an aligned stack buffer, runs "rep xsha256" with %eax=0, and
# copies the state back.
&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
}
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
}
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_oneshot");

# padlock_sha256_blocks(ctx, inp, len): same as the oneshot variant but
# runs "rep xsha256" with %eax=-1 and installs no exception handler.
&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_blocks");

# padlock_sha512_blocks(ctx, inp, len): copies the 64-byte state at ctx
# into an aligned stack buffer, runs "rep xsha512", and copies the state
# back. Unlike the other *_blocks routines %eax is not loaded here.
&function_begin_B("padlock_sha512_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movups	("xmm2",&QWP(32,"edi"));
	&movups	("xmm3",&QWP(48,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&movaps	(&QWP(32,"esp"),"xmm2");
	&movaps	(&QWP(48,"esp"),"xmm3");
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&movaps	("xmm2",&QWP(32,"esp"));
	&movaps	("xmm3",&QWP(48,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&movups	(&QWP(32,"edi"),"xmm2");
	&movups	(&QWP(48,"edi"),"xmm3");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);

&dataseg();
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# few bogus key reloads [if any at all on single-CPU system],
# so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

# Buffered write errors only surface at close time; don't leave a
# truncated assembly file behind silently.
close STDOUT or die "error closing STDOUT: $!";