Path: blob/master/Utilities/cmliblzma/liblzma/check/crc64_x86.S
3153 views
/* SPDX-License-Identifier: 0BSD */

/*
 * Speed-optimized CRC64 using slicing-by-four algorithm
 *
 * This uses only i386 instructions, but it is optimized for i686 and later
 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
 *
 * Authors: Igor Pavlov (original CRC32 assembly code)
 *          Lasse Collin (CRC64 adaptation of the modified CRC32 code)
 *
 * This code needs lzma_crc64_table, which can be created using the
 * following C code:

uint64_t lzma_crc64_table[4][256];

void
init_table(void)
{
	// ECMA-182
	static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);

	for (size_t s = 0; s < 4; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly64;
				else
					r >>= 1;
			}

			lzma_crc64_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC64 function:
 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 */

/* When Intel CET is enabled, include <cet.h> in assembly code to mark
   Intel CET support. */
#ifdef __CET__
#	include <cet.h>
#else
#	define _CET_ENDBR
#endif

/*
 * On some systems, the functions need to be prefixed. The prefix is
 * usually an underscore.
 */
#ifndef __USER_LABEL_PREFIX__
#	define __USER_LABEL_PREFIX__
#endif
#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
#define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64)
#define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)

/*
 * Solaris assembler doesn't have .p2align, and Darwin uses .align
 * differently than GNU/Linux and Solaris.
 */
#if defined(__APPLE__) || defined(__MSDOS__)
#	define ALIGN(pow2, abs) .align pow2
#else
#	define ALIGN(pow2, abs) .align abs
#endif

	.text
	.globl	LZMA_CRC64

#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
		&& !defined(__MSDOS__)
	.type	LZMA_CRC64, @function
#endif

	ALIGN(4, 16)
LZMA_CRC64:
	_CET_ENDBR
	/*
	 * Register usage:
	 * %eax crc LSB
	 * %edx crc MSB
	 * %esi buf
	 * %edi size or buf + size
	 * %ebx lzma_crc64_table
	 * %ebp Table index
	 * %ecx Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp
	movl	0x14(%esp), %esi /* buf */
	movl	0x18(%esp), %edi /* size */
	movl	0x1C(%esp), %eax /* crc LSB */
	movl	0x20(%esp), %edx /* crc MSB */

	/*
	 * Store the address of lzma_crc64_table to %ebx. This is needed to
	 * get position-independent code (PIC).
	 *
	 * The PIC macro is defined by libtool, while __PIC__ is defined
	 * by GCC but only on some systems. Testing for both makes it simpler
	 * to test this code without libtool, and keeps the code working also
	 * when built with libtool but using something else than GCC.
	 *
	 * I understood that libtool may define PIC on Windows even though
	 * the code in Windows DLLs is not PIC in sense that it is in ELF
	 * binaries, so we need a separate check to always use the non-PIC
	 * code on Windows.
	 */
#if (!defined(PIC) && !defined(__PIC__)) \
		|| (defined(_WIN32) || defined(__CYGWIN__))
	/* Not PIC */
	movl	$ LZMA_CRC64_TABLE, %ebx
#elif defined(__APPLE__)
	/* Mach-O */
	call	.L_get_pc
.L_pic:
	leal	.L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx
	movl	(%ebx), %ebx
#else
	/* ELF */
	call	.L_get_pc
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	movl	LZMA_CRC64_TABLE@GOT(%ebx), %ebx
#endif

	/* Complement the initial value. */
	notl	%eax
	notl	%edx

.L_align:
	/*
	 * Check if there is enough input to use slicing-by-four.
	 * We need eight bytes, because the loop pre-reads four bytes.
	 */
	cmpl	$8, %edi
	jb	.L_rest

	/* Check if we have reached alignment of four bytes. */
	testl	$3, %esi
	jz	.L_slice

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_align

.L_slice:
	/*
	 * If we get here, there's at least eight bytes of aligned input
	 * available. Make %edi multiple of four bytes. Store the possible
	 * remainder over the "size" variable in the argument stack.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-4, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 4 while running the main loop. This way
	 * we can compare for equality to determine when exit the loop.
	 */
	addl	%esi, %edi
	subl	$4, %edi

	/* Read in the first four aligned bytes. */
	movl	(%esi), %ecx

.L_loop:
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Check for end of aligned input. */
	cmpl	%edi, %esi

	/*
	 * Copy the next input byte to %ecx. It is slightly faster to
	 * read it here than at the top of the loop.
	 */
	movl	(%esi), %ecx
	jb	.L_loop

	/*
	 * Process the remaining four bytes, which we have already
	 * copied to %ecx.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Copy the number of remaining bytes to %edi. */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax
	notl	%edx

	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

#if defined(PIC) || defined(__PIC__)
	ALIGN(4, 16)
.L_get_pc:
	movl	(%esp), %ebx
	ret
#endif

#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
	/* Mach-O PIC */
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
.L_lzma_crc64_table$non_lazy_ptr:
	.indirect_symbol LZMA_CRC64_TABLE
	.long 0

#elif defined(_WIN32) || defined(__CYGWIN__)
#	ifdef DLL_EXPORT
	/* This is equivalent of __declspec(dllexport). */
	.section .drectve
	.ascii " -export:lzma_crc64"
#	endif

#elif !defined(__MSDOS__)
	/* ELF */
	.size	LZMA_CRC64, .-LZMA_CRC64
#endif

/*
 * This is needed to support non-executable stack. It's ugly to
 * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
 * we are using GNU assembler.
 */
#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
	.section	.note.GNU-stack,"",@progbits
#endif