// Path: blob/master/Utilities/cmliblzma/liblzma/check/crc_x86_clmul.h
// SPDX-License-Identifier: 0BSD12///////////////////////////////////////////////////////////////////////////////3//4/// \file crc_x86_clmul.h5/// \brief CRC32 and CRC64 implementations using CLMUL instructions.6///7/// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and8/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.9///10/// They were derived from11/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation12/// and the public domain code from https://github.com/rawrunprotected/crc13/// (URLs were checked on 2023-10-14).14///15/// While this file has both CRC32 and CRC64 implementations, only one16/// should be built at a time to ensure that crc_simd_body() is inlined17/// even with compilers with which lzma_always_inline expands to plain inline.18/// The version to build is selected by defining BUILDING_CRC32_CLMUL or19/// BUILDING_CRC64_CLMUL before including this file.20///21/// FIXME: Builds for 32-bit x86 use the assembly .S files by default22/// unless configured with --disable-assembler. 
Even then the lookup table23/// isn't omitted in crc64_table.c since it doesn't know that assembly24/// code has been disabled.25//26// Authors: Ilya Kurdyukov27// Hans Jansen28// Lasse Collin29// Jia Tan30//31///////////////////////////////////////////////////////////////////////////////3233// This file must not be included more than once.34#ifdef LZMA_CRC_X86_CLMUL_H35# error crc_x86_clmul.h was included twice.36#endif37#define LZMA_CRC_X86_CLMUL_H3839#include <immintrin.h>4041#if defined(_MSC_VER)42# include <intrin.h>43#elif defined(HAVE_CPUID_H)44# include <cpuid.h>45#endif464748// EDG-based compilers (Intel's classic compiler and compiler for E2K) can49// define __GNUC__ but the attribute must not be used with them.50// The new Clang-based ICX needs the attribute.51//52// NOTE: Build systems check for this too, keep them in sync with this.53#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)54# define crc_attr_target \55__attribute__((__target__("ssse3,sse4.1,pclmul")))56#else57# define crc_attr_target58#endif596061#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask)6263#define MASK_H(in, mask, r) \64r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign))6566#define MASK_LH(in, mask, low, high) \67MASK_L(in, mask, low); \68MASK_H(in, mask, high)697071crc_attr_target72crc_attr_no_sanitize_address73static lzma_always_inline void74crc_simd_body(const uint8_t *buf, const size_t size, __m128i *v0, __m128i *v1,75const __m128i vfold16, const __m128i initial_crc)76{77// Create a vector with 8-bit values 0 to 15. 
This is used to78// construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8.79const __m128i vramp = _mm_setr_epi32(800x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);8182// This is used to inverse the control mask of _mm_shuffle_epi883// so that bytes that wouldn't be picked with the original mask84// will be picked and vice versa.85const __m128i vsign = _mm_set1_epi8(-0x80);8687// Memory addresses A to D and the distances between them:88//89// A B C D90// [skip_start][size][skip_end]91// [ size2 ]92//93// A and D are 16-byte aligned. B and C are 1-byte aligned.94// skip_start and skip_end are 0-15 bytes. size is at least 1 byte.95//96// A = aligned_buf will initially point to this address.97// B = The address pointed by the caller-supplied buf.98// C = buf + size == aligned_buf + size299// D = buf + size + skip_end == aligned_buf + size2 + skip_end100const size_t skip_start = (size_t)((uintptr_t)buf & 15);101const size_t skip_end = (size_t)((0U - (uintptr_t)(buf + size)) & 15);102const __m128i *aligned_buf = (const __m128i *)(103(uintptr_t)buf & ~(uintptr_t)15);104105// If size2 <= 16 then the whole input fits into a single 16-byte106// vector. If size2 > 16 then at least two 16-byte vectors must107// be processed. If size2 > 16 && size <= 16 then there is only108// one 16-byte vector's worth of input but it is unaligned in memory.109//110// NOTE: There is no integer overflow here if the arguments111// are valid. If this overflowed, buf + size would too.112const size_t size2 = skip_start + size;113114// Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:115// The first skip_start or skip_end bytes in the vectors will have116// the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8117// will produce zeros for these positions. 
(Bitwise-xor of these118// masks with vsign will produce the opposite behavior.)119const __m128i mask_start120= _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_start));121const __m128i mask_end122= _mm_sub_epi8(vramp, _mm_set1_epi8((char)skip_end));123124// Get the first 1-16 bytes into data0. If loading less than 16125// bytes, the bytes are loaded to the high bits of the vector and126// the least significant positions are filled with zeros.127const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf),128_mm_setzero_si128(), mask_start);129aligned_buf++;130131__m128i v2, v3;132133#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS134if (size <= 16) {135// Right-shift initial_crc by 1-16 bytes based on "size"136// and store the result in v1 (high bytes) and v0 (low bytes).137//138// NOTE: The highest 8 bytes of initial_crc are zeros so139// v1 will be filled with zeros if size >= 8. The highest140// 8 bytes of v1 will always become zeros.141//142// [ v1 ][ v0 ]143// [ initial_crc ] size == 1144// [ initial_crc ] size == 2145// [ initial_crc ] size == 15146// [ initial_crc ] size == 16 (all in v0)147const __m128i mask_low = _mm_add_epi8(148vramp, _mm_set1_epi8((char)(size - 16)));149MASK_LH(initial_crc, mask_low, *v0, *v1);150151if (size2 <= 16) {152// There are 1-16 bytes of input and it is all153// in data0. Copy the input bytes to v3. If there154// are fewer than 16 bytes, the low bytes in v3155// will be filled with zeros. That is, the input156// bytes are stored to the same position as157// (part of) initial_crc is in v0.158MASK_L(data0, mask_end, v3);159} else {160// There are 2-16 bytes of input but not all bytes161// are in data0.162const __m128i data1 = _mm_load_si128(aligned_buf);163164// Collect the 2-16 input bytes from data0 and data1165// to v2 and v3, and bitwise-xor them with the166// low bits of initial_crc in v0. 
Note that the167// the second xor is below this else-block as it168// is shared with the other branch.169MASK_H(data0, mask_end, v2);170MASK_L(data1, mask_end, v3);171*v0 = _mm_xor_si128(*v0, v2);172}173174*v0 = _mm_xor_si128(*v0, v3);175*v1 = _mm_alignr_epi8(*v1, *v0, 8);176} else177#endif178{179// There is more than 16 bytes of input.180const __m128i data1 = _mm_load_si128(aligned_buf);181const __m128i *end = (const __m128i*)(182(const char *)aligned_buf - 16 + size2);183aligned_buf++;184185MASK_LH(initial_crc, mask_start, *v0, *v1);186*v0 = _mm_xor_si128(*v0, data0);187*v1 = _mm_xor_si128(*v1, data1);188189while (aligned_buf < end) {190*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(191*v0, vfold16, 0x00));192*v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(193*v0, vfold16, 0x11));194*v1 = _mm_load_si128(aligned_buf++);195}196197if (aligned_buf != end) {198MASK_H(*v0, mask_end, v2);199MASK_L(*v0, mask_end, *v0);200MASK_L(*v1, mask_end, v3);201*v1 = _mm_or_si128(v2, v3);202}203204*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(205*v0, vfold16, 0x00));206*v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(207*v0, vfold16, 0x11));208*v1 = _mm_srli_si128(*v0, 8);209}210}211212213/////////////////////214// x86 CLMUL CRC32 //215/////////////////////216217/*218// These functions were used to generate the constants219// at the top of crc32_arch_optimized().220static uint64_t221calc_lo(uint64_t p, uint64_t a, int n)222{223uint64_t b = 0; int i;224for (i = 0; i < n; i++) {225b = b >> 1 | (a & 1) << (n - 1);226a = (a >> 1) ^ ((0 - (a & 1)) & p);227}228return b;229}230231// same as ~crc(&a, sizeof(a), ~0)232static uint64_t233calc_hi(uint64_t p, uint64_t a, int n)234{235int i;236for (i = 0; i < n; i++)237a = (a >> 1) ^ ((0 - (a & 1)) & p);238return a;239}240*/241242#ifdef BUILDING_CRC32_CLMUL243244crc_attr_target245crc_attr_no_sanitize_address246static uint32_t247crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)248{249#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS250// The 
code assumes that there is at least one byte of input.251if (size == 0)252return crc;253#endif254255// uint32_t poly = 0xedb88320;256const int64_t p = 0x1db710640; // p << 1257const int64_t mu = 0x1f7011641; // calc_lo(p, p, 32) << 1 | 1258const int64_t k5 = 0x163cd6124; // calc_hi(p, p, 32) << 1259const int64_t k4 = 0x0ccaa009e; // calc_hi(p, p, 64) << 1260const int64_t k3 = 0x1751997d0; // calc_hi(p, p, 128) << 1261262const __m128i vfold4 = _mm_set_epi64x(mu, p);263const __m128i vfold8 = _mm_set_epi64x(0, k5);264const __m128i vfold16 = _mm_set_epi64x(k4, k3);265266__m128i v0, v1, v2;267268crc_simd_body(buf, size, &v0, &v1, vfold16,269_mm_cvtsi32_si128((int32_t)~crc));270271v1 = _mm_xor_si128(272_mm_clmulepi64_si128(v0, vfold16, 0x10), v1); // xxx0273v2 = _mm_shuffle_epi32(v1, 0xe7); // 0xx0274v0 = _mm_slli_epi64(v1, 32); // [0]275v0 = _mm_clmulepi64_si128(v0, vfold8, 0x00);276v0 = _mm_xor_si128(v0, v2); // [1] [2]277v2 = _mm_clmulepi64_si128(v0, vfold4, 0x10);278v2 = _mm_clmulepi64_si128(v2, vfold4, 0x00);279v0 = _mm_xor_si128(v0, v2); // [2]280return ~(uint32_t)_mm_extract_epi32(v0, 2);281}282#endif // BUILDING_CRC32_CLMUL283284285/////////////////////286// x86 CLMUL CRC64 //287/////////////////////288289/*290// These functions were used to generate the constants291// at the top of crc64_arch_optimized().292static uint64_t293calc_lo(uint64_t poly)294{295uint64_t a = poly;296uint64_t b = 0;297298for (unsigned i = 0; i < 64; ++i) {299b = (b >> 1) | (a << 63);300a = (a >> 1) ^ (a & 1 ? poly : 0);301}302303return b;304}305306static uint64_t307calc_hi(uint64_t poly, uint64_t a)308{309for (unsigned i = 0; i < 64; ++i)310a = (a >> 1) ^ (a & 1 ? poly : 0);311312return a;313}314*/315316#ifdef BUILDING_CRC64_CLMUL317318// MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC319// code when optimizations are enabled (release build). 
According to the bug320// report, the ebx register is corrupted and the calculated result is wrong.321// Trying to workaround the problem with "__asm mov ebx, ebx" didn't help.322// The following pragma works and performance is still good. x86-64 builds323// and CRC32 CLMUL aren't affected by this problem. The problem does not324// happen in crc_simd_body() either (which is shared with CRC32 CLMUL anyway).325//326// NOTE: Another pragma after crc64_arch_optimized() restores327// the optimizations. If the #if condition here is updated,328// the other one must be updated too.329#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \330&& defined(_M_IX86)331# pragma optimize("g", off)332#endif333334crc_attr_target335crc_attr_no_sanitize_address336static uint64_t337crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)338{339#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS340// The code assumes that there is at least one byte of input.341if (size == 0)342return crc;343#endif344345// const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial346const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1347const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1348const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1)349const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2)350351const __m128i vfold8 = _mm_set_epi64x((int64_t)p, (int64_t)mu);352const __m128i vfold16 = _mm_set_epi64x((int64_t)k2, (int64_t)k1);353354__m128i v0, v1, v2;355356#if defined(__i386__) || defined(_M_IX86)357crc_simd_body(buf, size, &v0, &v1, vfold16,358_mm_set_epi64x(0, (int64_t)~crc));359#else360// GCC and Clang would produce good code with _mm_set_epi64x361// but MSVC needs _mm_cvtsi64_si128 on x86-64.362crc_simd_body(buf, size, &v0, &v1, vfold16,363_mm_cvtsi64_si128((int64_t)~crc));364#endif365366v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1);367v0 = _mm_clmulepi64_si128(v1, vfold8, 0x00);368v2 = _mm_clmulepi64_si128(v0, vfold8, 
0x10);369v0 = _mm_xor_si128(_mm_xor_si128(v1, _mm_slli_si128(v0, 8)), v2);370371#if defined(__i386__) || defined(_M_IX86)372return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |373(uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));374#else375return ~(uint64_t)_mm_extract_epi64(v0, 1);376#endif377}378379#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \380&& defined(_M_IX86)381# pragma optimize("", on)382#endif383384#endif // BUILDING_CRC64_CLMUL385386387// Even though this is an inline function, compile it only when needed.388// This way it won't appear in E2K builds at all.389#if defined(CRC32_GENERIC) || defined(CRC64_GENERIC)390// Inlining this function duplicates the function body in crc32_resolve() and391// crc64_resolve(), but this is acceptable because this is a tiny function.392static inline bool393is_arch_extension_supported(void)394{395int success = 1;396uint32_t r[4]; // eax, ebx, ecx, edx397398#if defined(_MSC_VER)399// This needs <intrin.h> with MSVC. 
ICC has it as a built-in400// on all platforms.401__cpuid(r, 1);402#elif defined(HAVE_CPUID_H)403// Compared to just using __asm__ to run CPUID, this also checks404// that CPUID is supported and saves and restores ebx as that is405// needed with GCC < 5 with position-independent code (PIC).406success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);407#else408// Just a fallback that shouldn't be needed.409__asm__("cpuid\n\t"410: "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])411: "a"(1), "c"(0));412#endif413414// Returns true if these are supported:415// CLMUL (bit 1 in ecx)416// SSSE3 (bit 9 in ecx)417// SSE4.1 (bit 19 in ecx)418const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);419return success && (r[2] & ecx_mask) == ecx_mask;420421// Alternative methods that weren't used:422// - ICC's _may_i_use_cpu_feature: the other methods should work too.423// - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")424//425// CPUID decoding is needed with MSVC anyway and older GCC. This keeps426// the feature checks in the build system simpler too. The nice thing427// about __builtin_cpu_supports would be that it generates very short428// code as is it only reads a variable set at startup but a few bytes429// doesn't matter here.430}431#endif432433434