Path: blob/master/Utilities/cmliblzma/liblzma/check/crc32_fast.c
3153 views
// SPDX-License-Identifier: 0BSD12///////////////////////////////////////////////////////////////////////////////3//4/// \file crc32.c5/// \brief CRC32 calculation6//7// Authors: Lasse Collin8// Ilya Kurdyukov9// Hans Jansen10//11///////////////////////////////////////////////////////////////////////////////1213#include "check.h"14#include "crc_common.h"1516#if defined(CRC_X86_CLMUL)17# define BUILDING_CRC32_CLMUL18# include "crc_x86_clmul.h"19#elif defined(CRC32_ARM64)20# include "crc32_arm64.h"21#endif222324#ifdef CRC32_GENERIC2526///////////////////27// Generic CRC32 //28///////////////////2930static uint32_t31crc32_generic(const uint8_t *buf, size_t size, uint32_t crc)32{33crc = ~crc;3435#ifdef WORDS_BIGENDIAN36crc = byteswap32(crc);37#endif3839if (size > 8) {40// Fix the alignment, if needed. The if statement above41// ensures that this won't read past the end of buf[].42while ((uintptr_t)(buf) & 7) {43crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc);44--size;45}4647// Calculate the position where to stop.48const uint8_t *const limit = buf + (size & ~(size_t)(7));4950// Calculate how many bytes must be calculated separately51// before returning the result.52size &= (size_t)(7);5354// Calculate the CRC32 using the slice-by-eight algorithm.55while (buf < limit) {56crc ^= aligned_read32ne(buf);57buf += 4;5859crc = lzma_crc32_table[7][A(crc)]60^ lzma_crc32_table[6][B(crc)]61^ lzma_crc32_table[5][C(crc)]62^ lzma_crc32_table[4][D(crc)];6364const uint32_t tmp = aligned_read32ne(buf);65buf += 4;6667// At least with some compilers, it is critical for68// performance, that the crc variable is XORed69// between the two table-lookup pairs.70crc = lzma_crc32_table[3][A(tmp)]71^ lzma_crc32_table[2][B(tmp)]72^ crc73^ lzma_crc32_table[1][C(tmp)]74^ lzma_crc32_table[0][D(tmp)];75}76}7778while (size-- != 0)79crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc);8081#ifdef WORDS_BIGENDIAN82crc = byteswap32(crc);83#endif8485return ~crc;86}87#endif888990#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)9192//////////////////////////93// Function dispatching //94//////////////////////////9596// If both the generic and arch-optimized implementations are built, then97// the function to use is selected at runtime because the system running98// the binary might not have the arch-specific instruction set extension(s)99// available. The dispatch methods in order of priority:100//101// 1. Constructor. This method uses __attribute__((__constructor__)) to102// set crc32_func at load time. This avoids extra computation (and any103// unlikely threading bugs) on the first call to lzma_crc32() to decide104// which implementation should be used.105//106// 2. First Call Resolution. On the very first call to lzma_crc32(), the107// call will be directed to crc32_dispatch() instead. This will set the108// appropriate implementation function and will not be called again.109// This method does not use any kind of locking but is safe because if110// multiple threads run the dispatcher simultaneously then they will all111// set crc32_func to the same value.112113typedef uint32_t (*crc32_func_type)(114const uint8_t *buf, size_t size, uint32_t crc);115116// This resolver is shared between all dispatch methods.117static crc32_func_type118crc32_resolve(void)119{120return is_arch_extension_supported()121? &crc32_arch_optimized : &crc32_generic;122}123124125#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR126// Constructor method.127# define CRC32_SET_FUNC_ATTR __attribute__((__constructor__))128static crc32_func_type crc32_func;129#else130// First Call Resolution method.131# define CRC32_SET_FUNC_ATTR132static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc);133static crc32_func_type crc32_func = &crc32_dispatch;134#endif135136CRC32_SET_FUNC_ATTR137static void138crc32_set_func(void)139{140crc32_func = crc32_resolve();141return;142}143144#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR145static uint32_t146crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc)147{148// When __attribute__((__constructor__)) isn't supported, set the149// function pointer without any locking. If multiple threads run150// the detection code in parallel, they will all end up setting151// the pointer to the same value. This avoids the use of152// mythread_once() on every call to lzma_crc32() but this likely153// isn't strictly standards compliant. Let's change it if it breaks.154crc32_set_func();155return crc32_func(buf, size, crc);156}157158#endif159#endif160161162extern LZMA_API(uint32_t)163lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)164{165#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)166// On x86-64, if CLMUL is available, it is the best for non-tiny167// inputs, being over twice as fast as the generic slice-by-four168// version. However, for size <= 16 it's different. In the extreme169// case of size == 1 the generic version can be five times faster.170// At size >= 8 the CLMUL starts to become reasonable. It171// varies depending on the alignment of buf too.172//173// The above doesn't include the overhead of mythread_once().174// At least on x86-64 GNU/Linux, pthread_once() is very fast but175// it still makes lzma_crc32(buf, 1, crc) 50-100 % slower. When176// size reaches 12-16 bytes the overhead becomes negligible.177//178// So using the generic version for size <= 16 may give better179// performance with tiny inputs but if such inputs happen rarely180// it's not so obvious because then the lookup table of the181// generic version may not be in the processor cache.182#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS183if (size <= 16)184return crc32_generic(buf, size, crc);185#endif186187/*188#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR189// See crc32_dispatch(). This would be the alternative which uses190// locking and doesn't use crc32_dispatch(). Note that on Windows191// this method needs Vista threads.192mythread_once(crc64_set_func);193#endif194*/195return crc32_func(buf, size, crc);196197#elif defined(CRC32_ARCH_OPTIMIZED)198return crc32_arch_optimized(buf, size, crc);199200#else201return crc32_generic(buf, size, crc);202#endif203}204205206