/*
 * xxHash - Extremely Fast Hash algorithm
 * Header File
 * Copyright (C) 2012-2020 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */
/*!
 * @mainpage xxHash
 *
 * @file xxhash.h
 * xxHash prototypes and implementation
 */
/* TODO: update */
/* Notice extracted from xxHash homepage:

xxHash is an extremely fast hash algorithm, running at RAM speed limits.
It also successfully passes all tests from the SMHasher suite.

Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)

Name            Speed       Q.Score   Author
xxHash          5.4 GB/s     10
CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
SpookyHash      2.0 GB/s     10       Bob Jenkins
SBox            1.4 GB/s      9       Bret Mulvey
Lookup3         1.2 GB/s      9       Bob Jenkins
SuperFastHash   1.2 GB/s      1       Paul Hsieh
CityHash64      1.05 GB/s    10       Pike & Alakuijala
FNV             0.55 GB/s     5       Fowler, Noll, Vo
CRC32           0.43 GB/s     9
MD5-32          0.33 GB/s    10       Ronald L. Rivest
SHA1-32         0.28 GB/s    10

Q.Score is a measure of quality of the hash function.
It depends on successfully passing the SMHasher test set.
10 is a perfect score.

Note: SMHasher's CRC32 implementation is not the fastest one.
Other speed-oriented implementations can be faster,
especially in combination with the PCLMUL instruction:
https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735

A 64-bit version, named XXH64, is available since r35.
It offers much better speed, but for 64-bit applications only.
Name     Speed on 64 bits    Speed on 32 bits
XXH64       13.8 GB/s            1.9 GB/s
XXH32        6.8 GB/s            6.0 GB/s
*/

#if defined (__cplusplus)
extern "C" {
#endif

/* ****************************
 *  INLINE mode
 ******************************/
/*!
 * XXH_INLINE_ALL (and XXH_PRIVATE_API)
 * Use these build macros to inline xxhash into the target unit.
 * Inlining improves performance on small inputs, especially when the length is
 * expressed as a compile-time constant:
 *
 *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
 *
 * It also keeps xxHash symbols private to the unit, so they are not exported.
 *
 * Usage:
 *     #define XXH_INLINE_ALL
 *     #include "xxhash.h"
 *
 * Do not compile and link xxhash.o as a separate object, as it is not useful.
 */
#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
    && !defined(XXH_INLINE_ALL_31684351384)
   /* this section should be traversed only once */
#  define XXH_INLINE_ALL_31684351384
   /* give access to the advanced API, required to compile implementations */
#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
#  define XXH_STATIC_LINKING_ONLY
   /* make all functions private */
#  undef XXH_PUBLIC_API
#  if defined(__GNUC__)
#    define XXH_PUBLIC_API static __inline __attribute__((unused))
#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#    define XXH_PUBLIC_API static inline
#  elif defined(_MSC_VER)
#    define XXH_PUBLIC_API static __inline
#  else
     /* note: this version may generate warnings for unused static functions */
#    define XXH_PUBLIC_API static
#  endif

  /*
   * This part deals with the special case where a unit wants to inline xxHash,
   * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
   * such as part of some previously included *.h header file.
   * Without further action, the new include would just be ignored,
   * and functions would effectively _not_ be inlined (silent failure).
   * The following macros solve this situation by prefixing all inlined names,
   * avoiding naming collision with previous inclusions.
   */
#  ifdef XXH_NAMESPACE
#    error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
     /*
      * Note: Alternative: #undef all symbols (it's a pretty large list).
      * Without #error: it compiles, but functions are actually not inlined.
      */
#  endif
#  define XXH_NAMESPACE XXH_INLINE_
  /*
   * Some identifiers (enums, type names) are not symbols, but they must
   * still be renamed to avoid redeclaration.
   * Alternative solution: do not redeclare them.
   * However, this requires some #ifdefs, and is a more dispersed action.
   * Meanwhile, renaming can be achieved in a single block.
   */
#  define XXH_IPREF(Id)   XXH_INLINE_ ## Id
#  define XXH_OK XXH_IPREF(XXH_OK)
#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
#  define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
#  define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
   /* Ensure the header is parsed again, even if it was previously included */
#  undef XXHASH_H_5627135585666179
#  undef XXHASH_H_STATIC_13879238742
#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */



/* ****************************************************************
 *  Stable API
 *****************************************************************/
#ifndef XXHASH_H_5627135585666179
#define XXHASH_H_5627135585666179 1


/*!
 * @defgroup public Public API
 * Contains details on the public xxHash functions.
 * @{
 */
/* specific declaration modes for Windows */
#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
#    ifdef XXH_EXPORT
#      define XXH_PUBLIC_API __declspec(dllexport)
#    elif XXH_IMPORT
#      define XXH_PUBLIC_API __declspec(dllimport)
#    endif
#  else
#    define XXH_PUBLIC_API   /* do nothing */
#  endif
#endif

#ifdef XXH_DOXYGEN
/*!
 * @brief Emulate a namespace by transparently prefixing all symbols.
 *
 * If you want to include _and expose_ xxHash functions from within your own
 * library, but also want to avoid symbol collisions with other libraries which
 * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
 * any public symbol from the xxhash library with the value of XXH_NAMESPACE
 * (therefore, avoid empty or numeric values).
 *
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: Regular symbol names will be automatically translated
 * by this header.
 */
#  define XXH_NAMESPACE /* YOUR NAME HERE */
#  undef XXH_NAMESPACE
#endif
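/*
 * Example (illustrative sketch, using a hypothetical prefix `mycorp_`):
 * every public symbol is then emitted as `mycorp_XXH32`, `mycorp_XXH64`, etc.,
 * while calling code keeps using the regular names:
 *
 * @code{.c}
 *   // build with: -DXXH_NAMESPACE=mycorp_   (or #define it before inclusion)
 *   #include "xxhash.h"
 *
 *   XXH32_hash_t h = XXH32("abc", 3, 0);   // actually links against mycorp_XXH32()
 * @endcode
 */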
#ifdef XXH_NAMESPACE
#  define XXH_CAT(A,B) A##B
#  define XXH_NAME2(A,B) XXH_CAT(A,B)
#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
/* XXH32 */
#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
/* XXH64 */
#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
/* XXH3_64bits */
#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
/* XXH3_128bits */
#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
#  define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
#endif


/* *************************************
 *  Version
 ***************************************/
#define XXH_VERSION_MAJOR    0
#define XXH_VERSION_MINOR    8
#define XXH_VERSION_RELEASE  0
#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)

/*!
 * @brief Obtains the xxHash version.
 *
 * This is only useful when xxHash is compiled as a shared library, as it is
 * independent of the version defined in the header.
 *
 * @return `XXH_VERSION_NUMBER` as of when the function was compiled.
 */
XXH_PUBLIC_API unsigned XXH_versionNumber (void);
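/*
 * Example (illustrative sketch; the helper name is hypothetical): when linking
 * against a shared libxxhash, the header version and the library version can be
 * compared at runtime:
 *
 * @code{.c}
 *   #include <stdio.h>
 *   #include <xxhash.h>
 *
 *   int check_xxhash_version(void)
 *   {
 *       unsigned const libVersion = XXH_versionNumber();   // version of the linked library
 *       if (libVersion != XXH_VERSION_NUMBER)              // version this unit was compiled with
 *           printf("xxhash: header %u, library %u\n", (unsigned)XXH_VERSION_NUMBER, libVersion);
 *       return libVersion == XXH_VERSION_NUMBER;
 *   }
 * @endcode
 */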
/* ****************************
 *  Definitions
 ******************************/
#include <stddef.h>   /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;


/*-**********************************************************************
 *  32-bit hash
 ************************************************************************/
#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
/*!
 * @brief An unsigned 32-bit integer.
 *
 * Not necessarily defined to `uint32_t` but functionally equivalent.
 */
typedef uint32_t XXH32_hash_t;
#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint32_t XXH32_hash_t;
#else
#  include <limits.h>
#  if UINT_MAX == 0xFFFFFFFFUL
     typedef unsigned int XXH32_hash_t;
#  else
#    if ULONG_MAX == 0xFFFFFFFFUL
       typedef unsigned long XXH32_hash_t;
#    else
#      error "unsupported platform: need a 32-bit type"
#    endif
#  endif
#endif

/*!
 * @}
 *
 * @defgroup xxh32_family XXH32 family
 * @ingroup public
 * Contains functions used in the classic 32-bit xxHash algorithm.
 *
 * @note
 *   XXH32 is considered rather weak by today's standards.
 *   The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
 *   systems, and offers true 64/128 bit hash results. It provides a superior
 *   level of dispersion, and greatly reduces the risks of collisions.
 *
 * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
 * @see @ref xxh32_impl for implementation details
 * @{
 */

/*!
 * @brief Calculates the 32-bit hash of @p input using xxHash32.
 *
 * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 32-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 32-bit hash value.
 *
 * @see
 *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
 *    Direct equivalents for the other variants of xxHash.
 * @see
 *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
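/*
 * Minimal one-shot usage sketch (the string input and the seed value are
 * arbitrary illustration choices; compare with the streaming example further down):
 *
 * @code{.c}
 *   #include <string.h>
 *   #include <xxhash.h>
 *
 *   XXH32_hash_t hashString(const char* str)
 *   {
 *       XXH32_hash_t const seed = 0;          // 0 selects the default behavior
 *       return XXH32(str, strlen(str), seed);
 *   }
 * @endcode
 */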
/*!
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * The function returns an error code, with 0 meaning OK, and any other value
 * meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bit hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 *
 * Example code for incrementally hashing a file:
 * @code{.c}
 *   #include <assert.h>
 *   #include <stdio.h>
 *   #include <xxhash.h>
 *   #define BUFFER_SIZE 256
 *
 *   // Note: XXH64 and XXH3 use the same interface.
 *   XXH32_hash_t
 *   hashFile(FILE* stream)
 *   {
 *       XXH32_state_t* state;
 *       unsigned char buf[BUFFER_SIZE];
 *       size_t amt;
 *       XXH32_hash_t hash;
 *
 *       state = XXH32_createState();        // Create a state
 *       assert(state != NULL);              // Error check here
 *       XXH32_reset(state, 0xbaad5eed);     // Reset state with our seed
 *       while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
 *           XXH32_update(state, buf, amt);  // Hash the file in chunks
 *       }
 *       hash = XXH32_digest(state);         // Finalize the hash
 *       XXH32_freeState(state);             // Clean up
 *       return hash;
 *   }
 * @endcode
 */

/*!
 * @typedef struct XXH32_state_s XXH32_state_t
 * @brief The opaque state struct for the XXH32 streaming API.
 *
 * @see XXH32_state_s for details.
 */
typedef struct XXH32_state_s XXH32_state_t;

/*!
 * @brief Allocates an @ref XXH32_state_t.
 *
 * Must be freed with XXH32_freeState().
 * @return An allocated XXH32_state_t on success, `NULL` on failure.
 */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
/*!
 * @brief Frees an @ref XXH32_state_t.
 *
 * Must be allocated with XXH32_createState().
 * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
 * @return XXH_OK.
 */
XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
/*!
 * @brief Copies one @ref XXH32_state_t to another.
 *
 * @param dst_state The state to copy to.
 * @param src_state The state to copy from.
 * @pre
 *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
 */
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);

/*!
 * @brief Resets an @ref XXH32_state_t to begin a new hash.
 *
 * This function resets and seeds a state. Call it before @ref XXH32_update().
 *
 * @param statePtr The state struct to reset.
 * @param seed The 32-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
 */
XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);

/*!
 * @brief Consumes a block of @p input to an @ref XXH32_state_t.
 *
 * Call this to incrementally consume blocks of data.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
 */
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);

/*!
 * @brief Returns the calculated hash value from an @ref XXH32_state_t.
 *
 * @note
 *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return The calculated xxHash32 value from that state.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);

/*******   Canonical representation   *******/

/*
 * The default return values from XXH functions are unsigned 32 and 64 bit
 * integers.
 * This is the simplest and fastest format for further post-processing.
 *
 * However, this leaves open the question of what is the order on the byte level,
 * since little and big endian conventions will store the same number differently.
 *
 * The canonical representation settles this issue by mandating big-endian
 * convention, the same convention as human-readable numbers (large digits first).
 *
 * When writing hash values to storage, sending them over a network, or printing
 * them, it's highly recommended to use the canonical representation to ensure
 * portability across a wider range of systems, present and future.
 *
 * The following functions allow transformation of hash values to and from
 * canonical format.
 */

/*!
 * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
 */
typedef struct {
    unsigned char digest[4]; /*!< Hash bytes, big endian */
} XXH32_canonical_t;

/*!
 * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
 *
 * @param dst The @ref XXH32_canonical_t pointer to be stored to.
 * @param hash The @ref XXH32_hash_t to be converted.
 *
 * @pre
 *   @p dst must not be `NULL`.
 */
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);

/*!
 * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
 *
 * @param src The @ref XXH32_canonical_t to convert.
 *
 * @pre
 *   @p src must not be `NULL`.
 *
 * @return The converted hash.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
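/*
 * Example (illustrative sketch; writeHash()/readHash() are hypothetical helpers):
 * writing a hash to disk in canonical (big endian) form, then reading it back
 * on any platform:
 *
 * @code{.c}
 *   #include <stdio.h>
 *   #include <xxhash.h>
 *
 *   void writeHash(FILE* out, XXH32_hash_t hash)
 *   {
 *       XXH32_canonical_t canonical;
 *       XXH32_canonicalFromHash(&canonical, hash);              // native -> big endian bytes
 *       fwrite(canonical.digest, 1, sizeof(canonical.digest), out);
 *   }
 *
 *   XXH32_hash_t readHash(FILE* in)
 *   {
 *       XXH32_canonical_t canonical;
 *       fread(canonical.digest, 1, sizeof(canonical.digest), in);  // error handling omitted
 *       return XXH32_hashFromCanonical(&canonical);             // big endian bytes -> native
 *   }
 * @endcode
 */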
/*!
 * @}
 * @ingroup public
 * @{
 */

#ifndef XXH_NO_LONG_LONG
/*-**********************************************************************
 *  64-bit hash
 ************************************************************************/
#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
/*!
 * @brief An unsigned 64-bit integer.
 *
 * Not necessarily defined to `uint64_t` but functionally equivalent.
 */
typedef uint64_t XXH64_hash_t;
#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint64_t XXH64_hash_t;
#else
#  include <limits.h>
#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
     /* LP64 ABI says uint64_t is unsigned long */
     typedef unsigned long XXH64_hash_t;
#  else
     /* the following type must have a width of 64-bit */
     typedef unsigned long long XXH64_hash_t;
#  endif
#endif

/*!
 * @}
 *
 * @defgroup xxh64_family XXH64 family
 * @ingroup public
 * @{
 * Contains functions used in the classic 64-bit xxHash algorithm.
 *
 * @note
 *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
 *   and offers true 64/128 bit hash results. It provides a superior level of
 *   dispersion, and greatly reduces the risks of collisions.
 */


/*!
 * @brief Calculates the 64-bit hash of @p input using xxHash64.
 *
 * This function usually runs faster on 64-bit systems, but slower on 32-bit
 * systems (see benchmark).
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 64-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 64-bit hash.
 *
 * @see
 *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
 *    Direct equivalents for the other variants of xxHash.
 * @see
 *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
 */
XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);

/*******   Streaming   *******/
/*!
 * @brief The opaque state struct for the XXH64 streaming API.
 *
 * @see XXH64_state_s for details.
 */
typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);

XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);

/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);

/*!
 * @}
 * ************************************************************************
 * @defgroup xxh3_family XXH3 family
 * @ingroup public
 * @{
 *
 * XXH3 is a more recent hash algorithm featuring:
 *  - Improved speed for both small and large inputs
 *  - True 64-bit and 128-bit outputs
 *  - SIMD acceleration
 *  - Improved 32-bit viability
 *
 * Speed analysis methodology is explained here:
 *
 *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
 *
 * Compared to XXH64, expect XXH3 to run approximately
 * ~2x faster on large inputs and >3x faster on small ones,
 * though exact differences vary depending on the platform.
 *
 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
 * but it does not require them.
 * Any 32-bit and 64-bit targets that can run XXH32 smoothly
 * can run XXH3 at competitive speeds, even without vector support.
 * Further details are explained in the implementation.
 *
 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
 * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
 *
 * XXH3 implementation is portable:
 * it has a generic C90 formulation that can be compiled on any platform,
 * and all implementations generate exactly the same hash value on all platforms.
 * Starting from v0.8.0, it's also labelled "stable", meaning that
 * any future version will also generate the same hash value.
 *
 * XXH3 offers 2 variants, _64bits and _128bits.
 *
 * When only 64 bits are needed, prefer invoking the _64bits variant, as it
 * reduces the amount of mixing, resulting in faster speed on small inputs.
 * It's also generally simpler to manipulate a scalar return type than a struct.
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 */

/*-**********************************************************************
 *  XXH3 64-bit variant
 ************************************************************************/

/* XXH3_64bits():
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);

/*
 * XXH3_64bits_withSeed():
 * This variant generates a custom secret on the fly
 * based on the default secret altered using the `seed` value.
 * While this operation is decently fast, note that it's not completely free.
 * Note: seed==0 produces the same results as XXH3_64bits().
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);

/*!
 * The bare minimum size for a custom secret.
 *
 * @see
 *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
 *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
 */
#define XXH3_SECRET_SIZE_MIN 136

/*
 * XXH3_64bits_withSecret():
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
 * However, the quality of produced hash values depends on the secret's entropy.
 * Technically, the secret must look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
 * Whenever unsure about the "randomness" of the blob of bytes,
 * consider relabelling it as a "custom seed" instead,
 * and employ "XXH3_generateSecret()" (see below)
 * to generate a high entropy secret derived from the custom seed.
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
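/*
 * Example (illustrative sketch): the three one-shot 64-bit entry points, using
 * arbitrary demonstration values; `buffer`, `size`, `secret` and `secretSize`
 * are assumed to be defined by the caller:
 *
 * @code{.c}
 *   #include <xxhash.h>
 *
 *   XXH64_hash_t h1 = XXH3_64bits(buffer, size);                  // default secret, seed 0
 *   XXH64_hash_t h2 = XXH3_64bits_withSeed(buffer, size, 12345);  // secret derived from seed
 *   XXH64_hash_t h3 = XXH3_64bits_withSecret(buffer, size,
 *                                            secret, secretSize); // secretSize >= XXH3_SECRET_SIZE_MIN
 * @endcode
 */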
/*******   Streaming   *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 */

/*!
 * @brief The state struct for the XXH3 streaming API.
 *
 * @see XXH3_state_s for details.
 */
typedef struct XXH3_state_s XXH3_state_t;
XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);

/*
 * XXH3_64bits_reset():
 * Initialize with default parameters.
 * digest will be equivalent to `XXH3_64bits()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
/*
 * XXH3_64bits_reset_withSeed():
 * Generate a custom secret from `seed`, and store it into `statePtr`.
 * digest will be equivalent to `XXH3_64bits_withSeed()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
/*
 * XXH3_64bits_reset_withSecret():
 * `secret` is referenced, it _must outlive_ the hash streaming session.
 * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
 * and the quality of produced hash values depends on the secret's entropy
 * (the secret's content should look like a bunch of random bytes).
 * When in doubt about the randomness of a candidate `secret`,
 * consider employing `XXH3_generateSecret()` instead (see below).
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);

/* note : canonical representation of XXH3 is the same as XXH64
 * since they both produce XXH64_hash_t values */


/*-**********************************************************************
 *  XXH3 128-bit variant
 ************************************************************************/

/*!
 * @brief The return value from 128-bit hashes.
 *
 * Stored in little endian order, although the fields themselves are in native
 * endianness.
 */
typedef struct {
    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
    XXH64_hash_t high64;  /*!< `value >> 64` */
} XXH128_hash_t;

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

/*******   Streaming   *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 *
 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
 * Use the already declared XXH3_createState() and XXH3_freeState().
 *
 * All reset and streaming functions have the same meaning as their 64-bit counterparts.
 */

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);

/* The following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */

/*!
 * XXH128_isEqual():
 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
 */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);

/*!
 * XXH128_cmp():
 *
 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
 *
 * return: >0 if *h128_1  > *h128_2
 *         =0 if *h128_1 == *h128_2
 *         <0 if *h128_1  < *h128_2
 */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
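/*
 * Example (illustrative sketch): sorting an array of XXH128_hash_t with the
 * comparator above; `hashes` and `count` are assumed to be defined by the caller:
 *
 * @code{.c}
 *   #include <stdlib.h>
 *   #include <xxhash.h>
 *
 *   // sorts hashes[0..count-1] using the total order defined by XXH128_cmp()
 *   qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
 * @endcode
 */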
/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);


#endif  /* XXH_NO_LONG_LONG */

/*!
 * @}
 */
#endif /* XXHASH_H_5627135585666179 */



#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
#define XXHASH_H_STATIC_13879238742
/* ****************************************************************************
 * This section contains declarations which are not guaranteed to remain stable.
 * They may change in future versions, becoming incompatible with a different
 * version of the library.
 * These declarations should only be used with static linking.
 * Never use them in association with dynamic linking!
 ***************************************************************************** */

/*
 * These definitions are only present to allow static allocation
 * of XXH states, on stack or in a struct, for example.
 * Never **ever** access their members directly.
 */

/*!
 * @internal
 * @brief Structure for XXH32 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH32_state_t.
 * Do not access the members of this struct directly.
 * @see XXH64_state_s, XXH3_state_s
 */
struct XXH32_state_s {
   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
   XXH32_hash_t v1;           /*!< First accumulator lane */
   XXH32_hash_t v2;           /*!< Second accumulator lane */
   XXH32_hash_t v3;           /*!< Third accumulator lane */
   XXH32_hash_t v4;           /*!< Fourth accumulator lane */
   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it, it may be removed. */
};   /* typedef'd to XXH32_state_t */
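/*
 * Example (illustrative sketch; hashBuffer() is a hypothetical helper):
 * with XXH_STATIC_LINKING_ONLY defined, the state can live on the stack
 * instead of going through XXH32_createState():
 *
 * @code{.c}
 *   #define XXH_STATIC_LINKING_ONLY   // before inclusion: exposes the struct definitions
 *   #include <xxhash.h>
 *
 *   XXH32_hash_t hashBuffer(const void* data, size_t size)
 *   {
 *       XXH32_state_t state;                  // stack allocation, no malloc()
 *       XXH32_reset(&state, 0);               // reset fully initializes the state
 *       XXH32_update(&state, data, size);
 *       return XXH32_digest(&state);          // no freeState() needed
 *   }
 * @endcode
 */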
#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */

/*!
 * @internal
 * @brief Structure for XXH64 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH64_state_t.
 * Do not access the members of this struct directly.
 * @see XXH32_state_s, XXH3_state_s
 */
struct XXH64_state_s {
   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
   XXH64_hash_t v1;           /*!< First accumulator lane */
   XXH64_hash_t v2;           /*!< Second accumulator lane */
   XXH64_hash_t v3;           /*!< Third accumulator lane */
   XXH64_hash_t v4;           /*!< Fourth accumulator lane */
   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway */
   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it, it may be removed. */
};   /* typedef'd to XXH64_state_t */

#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
#  include <stdalign.h>
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__GNUC__)
#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#  define XXH_ALIGN(n)      __declspec(align(n))
#else
#  define XXH_ALIGN(n)   /* disabled */
#endif

/* Old GCC versions only accept the attribute after the type in structures. */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
    && defined(__GNUC__)
#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif

/*!
 * @brief The size of the internal XXH3 buffer.
 *
 * This is the optimal update size for incremental hashing.
 *
 * @see XXH3_64b_update(), XXH3_128b_update().
 */
#define XXH3_INTERNALBUFFER_SIZE 256

/*!
 * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
 *
 * This is the size used in @ref XXH3_kSecret and the seeded functions.
 *
 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
 */
#define XXH3_SECRET_DEFAULT_SIZE 192

/*!
 * @internal
 * @brief Structure for XXH3 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * @note **This structure has a strict alignment requirement of 64 bytes.** Do
 * not allocate this with `malloc()` or `new`, it will not be sufficiently
 * aligned. Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack
 * allocation.
 *
 * Typedef'd to @ref XXH3_state_t.
 * Do not access the members of this struct directly.
 *
 * @see XXH3_INITSTATE() for stack initialization.
 * @see XXH3_createState(), XXH3_freeState().
 * @see XXH32_state_s, XXH64_state_s
 */
struct XXH3_state_s {
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
       /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
       /*!< The internal buffer. @see XXH32_state_s::mem32 */
   XXH32_hash_t bufferedSize;
       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
   XXH32_hash_t reserved32;
       /*!< Reserved field. Needed for padding on 64-bit. */
   size_t nbStripesSoFar;
       /*!< Number of stripes processed. */
   XXH64_hash_t totalLen;
       /*!< Total length hashed. 64-bit even on 32-bit targets. */
   size_t nbStripesPerBlock;
       /*!< Number of stripes per block. */
   size_t secretLimit;
       /*!< Size of @ref customSecret or @ref extSecret */
   XXH64_hash_t seed;
       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
   XXH64_hash_t reserved64;
       /*!< Reserved field. */
   const unsigned char* extSecret;
       /*!< Reference to an external secret for the _withSecret variants, NULL
        *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
}; /* typedef'd to XXH3_state_t */

#undef XXH_ALIGN_MEMBER

/*!
 * @brief Initializes a stack-allocated `XXH3_state_s`.
 *
 * When the @ref XXH3_state_t structure is merely emplaced on stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
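/*
 * Example (illustrative sketch; hashWithSeed() is a hypothetical helper):
 * stack-allocating an XXH3 state; the seed field is zeroed with XXH3_INITSTATE()
 * before the first _withSeed reset, as described above:
 *
 * @code{.c}
 *   #define XXH_STATIC_LINKING_ONLY
 *   #include <xxhash.h>
 *
 *   XXH64_hash_t hashWithSeed(const void* data, size_t size, XXH64_hash_t seed)
 *   {
 *       XXH3_state_t state;                        // 64-byte aligned thanks to the struct's alignment
 *       XXH3_INITSTATE(&state);                    // required before reset_withSeed()
 *       XXH3_64bits_reset_withSeed(&state, seed);
 *       XXH3_64bits_update(&state, data, size);
 *       return XXH3_64bits_digest(&state);
 *   }
 * @endcode
 */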
/* ===   Experimental API   === */
/* Symbols defined below must be considered tied to a specific library version. */

/*
 * XXH3_generateSecret():
 *
 * Derive a high-entropy secret from any user-defined content, named customSeed.
 * The generated secret can be used in combination with `*_withSecret()` functions.
 * The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
 * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
 *
 * The function accepts as input a custom seed of any length and any content,
 * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
 * into an already allocated buffer secretBuffer.
 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
 *
 * The generated secret can then be used with any `*_withSecret()` variant.
 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
 * are part of this list. They all accept a `secret` parameter
 * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
 * _and_ feature very high entropy (consist of random-looking bytes).
 * These conditions can be a high bar to meet, so
 * this function can be used to generate a secret of proper quality.
 *
 * customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even a stupidly "low entropy" source such as a bunch of zeroes.
 * The resulting `secret` will nonetheless provide all expected qualities.
 *
 * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
 * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
 */
XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
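/*
 * Example (illustrative sketch; hashWithCustomSeed() is a hypothetical helper):
 * deriving a proper secret from low-entropy application data, then using it
 * with a `_withSecret()` variant:
 *
 * @code{.c}
 *   #define XXH_STATIC_LINKING_ONLY   // exposes XXH3_SECRET_DEFAULT_SIZE
 *   #include <xxhash.h>
 *
 *   XXH64_hash_t hashWithCustomSeed(const void* data, size_t size,
 *                                   const void* customSeed, size_t customSeedSize)
 *   {
 *       unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];       // 192 bytes, always filled entirely
 *       XXH3_generateSecret(secret, customSeed, customSeedSize);
 *       return XXH3_64bits_withSecret(data, size, secret, sizeof(secret));
 *   }
 * @endcode
 */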
/* simple short-cut to pre-selected XXH3_128bits variant */
XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);


#endif  /* XXH_NO_LONG_LONG */
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#  define XXH_IMPLEMENTATION
#endif

#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */


/* ======================================================================== */
/* ======================================================================== */
/* ======================================================================== */


/*-**********************************************************************
 * xxHash implementation
 *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c.
 *
 * However, inlining requires the implementation to be visible to the compiler,
 * hence be included alongside the header.
 * Previously, the implementation was hosted inside xxhash.c,
 * which was then #included when inlining was activated.
 * This construction created issues with a few build and install systems,
 * as it required xxhash.c to be stored in the /include directory.
 *
 * The xxHash implementation is now directly integrated within xxhash.h.
 * As a consequence, xxhash.c is no longer needed in /include.
 *
 * xxhash.c is still available and is still useful.
 * In a "normal" setup, when xxhash is not inlined,
 * xxhash.h only exposes the prototypes and public symbols,
 * while xxhash.c can be built into an object file xxhash.o
 * which can then be linked into the final binary.
 ************************************************************************/

#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
#  define XXH_IMPLEM_13a8737387

/* *************************************
 *  Tuning parameters
 ***************************************/

/*!
 * @defgroup tuning Tuning parameters
 * @{
 *
 * Various macros to control xxHash's behavior.
 */
#ifdef XXH_DOXYGEN
/*!
 * @brief Define this to disable 64-bit code.
 *
 * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
 */
#  define XXH_NO_LONG_LONG
#  undef XXH_NO_LONG_LONG /* don't actually */
/*!
 * @brief Controls how unaligned memory is accessed.
 *
 * By default, access to unaligned memory is controlled by `memcpy()`, which is
 * safe and portable.
 *
 * Unfortunately, on some target/compiler combinations, the generated assembly
 * is sub-optimal.
 *
 * The switch below allows selection of a different access method
 * in the search for improved performance.
 *
 * @par Possible options:
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
 *   @par
 *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
 *     eliminate the function call and treat it as an unaligned access.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
 *   @par
 *     Depends on compiler extensions and is therefore not portable.
 *     This method is safe if your compiler supports it, and *generally* as
 *     fast or faster than `memcpy`.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
 *   @par
 *     Casts directly and dereferences. This method doesn't depend on the
 *     compiler, but it violates the C standard as it directly dereferences an
 *     unaligned pointer. It can generate buggy code on targets which do not
 *     support unaligned memory accesses, but in some circumstances, it's the
 *     only known way to get the most performance (example: GCC + ARMv6).
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
 *   @par
 *     Also portable. This can generate the best code on old compilers which don't
 *     inline small `memcpy()` calls, and it might also be faster on big-endian
 *     systems which lack a native byteswap instruction. However, some compilers
 *     will emit literal byteshifts even if the target supports unaligned access.
 *  .
 *
 * @warning
 *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
 *   care, as what works on one compiler/platform/optimization level may cause
 *   another to read garbage data or even crash.
 *
 * See https://stackoverflow.com/a/32095106/646947 for details.
 *
 * Prefer these methods in priority order (0 > 3 > 1 > 2).
 */
#  define XXH_FORCE_MEMORY_ACCESS 0
/*!
 * @def XXH_ACCEPT_NULL_INPUT_POINTER
 * @brief Whether to add explicit `NULL` checks.
 *
 * If the input pointer is `NULL` and the length is non-zero, xxHash's default
 * behavior is to dereference it, triggering a segfault.
 *
 * When this macro is enabled, xxHash actively checks the input for a null pointer.
 * If it is `NULL`, the result for null input pointers is the same as a zero-length input.
 */
#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
/*!
 * @def XXH_FORCE_ALIGN_CHECK
 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
 * and XXH64() only).
 *
 * This is an important performance trick for architectures without decent
 * unaligned memory access performance.
 *
 * It checks for input alignment, and when conditions are met, uses a "fast
 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
 * faster_ read speed.
 *
 * The check costs one initial branch per hash, which is generally negligible,
 * but not zero.
 *
 * Moreover, it's not useful to generate an additional code path if memory
 * access uses the same instruction for both aligned and unaligned
 * addresses (e.g. x86 and aarch64).
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * The align check is automatically disabled on x86, x64 & arm64,
 * which are platforms known to offer good unaligned memory access performance.
 *
 * This option does not affect XXH3 (only XXH32 and XXH64).
 */
#  define XXH_FORCE_ALIGN_CHECK 0

/*!
 * @def XXH_NO_INLINE_HINTS
 * @brief When non-zero, sets all functions to `static`.
 *
 * By default, xxHash tries to force the compiler to inline almost all internal
 * functions.
 *
 * This can usually improve performance due to reduced jumping and improved
 * constant folding, but significantly increases the size of the binary, which
 * might not be favorable.
 *
 * Additionally, sometimes the forced inlining can be detrimental to performance,
 * depending on the architecture.
 *
 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
 * compiler full control over whether to inline them or not.
 *
 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
 * -fno-inline with GCC or Clang, this will automatically be defined.
 */
#  define XXH_NO_INLINE_HINTS 0

/*!
 * @def XXH_REROLL
 * @brief Whether to reroll `XXH32_finalize` and `XXH64_finalize`.
 *
 * For performance, `XXH32_finalize` and `XXH64_finalize` use an unrolled loop
 * in the form of a switch statement.
 *
 * This is not always desirable, as it generates larger code, and depending on
 * the architecture, may even be slower.
 *
 * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
 */
#  define XXH_REROLL 0

/*!
 * @internal
 * @brief Redefines old internal names.
 *
 * For compatibility with code that uses xxHash's internals before the names
 * were changed to improve namespacing. There is no other reason to use this.
 */
#  define XXH_OLD_NAMES
#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
#endif /* XXH_DOXYGEN */
/*!
 * @}
 */
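/*
 * Example (illustrative): the tuning macros documented above can be set on the
 * compiler command line (e.g. -DXXH_FORCE_ALIGN_CHECK=0), or defined before
 * inclusion in the unit that builds the implementation:
 *
 * @code{.c}
 *   #define XXH_FORCE_ALIGN_CHECK 0   // skip the aligned-input fast path
 *   #define XXH_NO_INLINE_HINTS   1   // let the compiler decide about inlining
 *   #define XXH_IMPLEMENTATION        // this unit also builds the implementation
 *   #include "xxhash.h"
 * @endcode
 */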
#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
#    define XXH_FORCE_MEMORY_ACCESS 2
#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
    (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
#    define XXH_FORCE_MEMORY_ACCESS 1
#  endif
#endif

#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
#endif

#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
#    define XXH_FORCE_ALIGN_CHECK 0
#  else
#    define XXH_FORCE_ALIGN_CHECK 1
#  endif
#endif

#ifndef XXH_NO_INLINE_HINTS
#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
#    define XXH_NO_INLINE_HINTS 1
#  else
#    define XXH_NO_INLINE_HINTS 0
#  endif
#endif

#ifndef XXH_REROLL
#  if defined(__OPTIMIZE_SIZE__)
#    define XXH_REROLL 1
#  else
#    define XXH_REROLL 0
#  endif
#endif

/*!
 * @defgroup impl Implementation
 * @{
 */


/* *************************************
 *  Includes & Memory related functions
 ***************************************/
/*
 * Modify the local functions below should you wish to use
 * different memory routines for malloc() and free()
 */
#include <stdlib.h>

/*!
 * @internal
 * @brief Modify this function to use a different routine than malloc().
 */
static void* XXH_malloc(size_t s) { return malloc(s); }

/*!
 * @internal
 * @brief Modify this function to use a different routine than free().
 */
static void XXH_free(void* p) { free(p); }

#include <string.h>

/*!
 * @internal
 * @brief Modify this function to use a different routine than memcpy().
 */
static void* XXH_memcpy(void* dest, const void* src, size_t size)
{
    return memcpy(dest,src,size);
}

#include <limits.h>   /* ULLONG_MAX */


/* *************************************
 *  Compiler Specific Options
 ***************************************/
#ifdef _MSC_VER /* Visual Studio warning fix */
#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
#endif

#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
#  if defined(__GNUC__)
#    define XXH_FORCE_INLINE static __attribute__((unused))
#  else
#    define XXH_FORCE_INLINE static
#  endif
#  define XXH_NO_INLINE static
/* enable inlining hints */
#elif defined(_MSC_VER)  /* Visual Studio */
#  define XXH_FORCE_INLINE static __forceinline
#  define XXH_NO_INLINE static __declspec(noinline)
#elif defined(__GNUC__)
#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
#  define XXH_NO_INLINE static __attribute__((noinline))
#elif defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
#  define XXH_FORCE_INLINE static inline
#  define XXH_NO_INLINE static
#else
#  define XXH_FORCE_INLINE static
#  define XXH_NO_INLINE static
#endif
/* *************************************
 *  Debug
 ***************************************/
/*!
 * @ingroup tuning
 * @def XXH_DEBUGLEVEL
 * @brief Sets the debugging level.
 *
 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
 * compiler's command line options. The value must be a number.
 */
#ifndef XXH_DEBUGLEVEL
#  ifdef DEBUGLEVEL /* backwards compat */
#    define XXH_DEBUGLEVEL DEBUGLEVEL
#  else
#    define XXH_DEBUGLEVEL 0
#  endif
#endif

#if (XXH_DEBUGLEVEL>=1)
#  include <assert.h>   /* note: can still be disabled with NDEBUG */
#  define XXH_ASSERT(c)   assert(c)
#else
#  define XXH_ASSERT(c)   ((void)0)
#endif

/* note: use after variable declarations */
#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)


/* *************************************
 *  Basic Types
 ***************************************/
#if !defined (__VMS) \
 && (defined (__cplusplus) \
 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint8_t xxh_u8;
#else
   typedef unsigned char xxh_u8;
#endif
typedef XXH32_hash_t xxh_u32;

#ifdef XXH_OLD_NAMES
#  define BYTE xxh_u8
#  define U8   xxh_u8
#  define U32  xxh_u32
#endif

/* ***   Memory access   *** */

/*!
 * @internal
 * @fn xxh_u32 XXH_read32(const void* ptr)
 * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 *
 * @param ptr The pointer to read from.
 * @return The 32-bit native endian integer from the bytes at @p ptr.
 */

/*!
 * @internal
 * @fn xxh_u32 XXH_readLE32(const void* ptr)
 * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 *
 * @param ptr The pointer to read from.
 * @return The 32-bit little endian integer from the bytes at @p ptr.
 */

/*!
 * @internal
 * @fn xxh_u32 XXH_readBE32(const void* ptr)
 * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 *
 * @param ptr The pointer to read from.
 * @return The 32-bit big endian integer from the bytes at @p ptr.
 */

/*!
 * @internal
 * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
 * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
 * always @ref XXH_alignment::XXH_unaligned.
 *
 * @param ptr The pointer to read from.
 * @param align Whether @p ptr is aligned.
 * @pre
 *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
 *   aligned.
 * @return The 32-bit little endian integer from the bytes at @p ptr.
 */

#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
/*
 * Manual byteshift. Best for old compilers which don't inline memcpy.
 * We actually directly use XXH_readLE32 and XXH_readBE32.
 */
#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))

/*
 * Force direct memory access. Only works on CPUs which support unaligned memory
 * access in hardware.
 */
static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }

#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))

/*
 * __pack instructions are safer but compiler specific, hence potentially
 * problematic for some compilers.
 *
 * Currently only defined for GCC and ICC.
 */
#ifdef XXH_OLD_NAMES
typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
#endif
static xxh_u32 XXH_read32(const void* ptr)
{
    typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
    return ((const xxh_unalign*)ptr)->u32;
}

#else

/*
 * Portable and safe solution. Generally efficient.
 * see: https://stackoverflow.com/a/32095106/646947
 */
static xxh_u32 XXH_read32(const void* memPtr)
{
    xxh_u32 val;
    memcpy(&val, memPtr, sizeof(val));
    return val;
}

#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */


/* ***   Endianness   *** */
typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;

/*!
 * @ingroup tuning
 * @def XXH_CPU_LITTLE_ENDIAN
 * @brief Whether the target is little endian.
 *
 * Defined to 1 if the target is little endian, or 0 if it is big endian.
 * It can be defined externally, for example on the compiler command line.
 *
 * If it is not defined, a runtime check (which is usually constant folded)
 * is used instead.
 *
 * @note
 *   This is not necessarily defined to an integer constant.
 *
 * @see XXH_isLittleEndian() for the runtime check.
 */
#ifndef XXH_CPU_LITTLE_ENDIAN
/*
 * Try to detect endianness automatically, to avoid the nonstandard behavior
 * in `XXH_isLittleEndian()`
 */
#  if defined(_WIN32) /* Windows is always little endian */ \
     || defined(__LITTLE_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#    define XXH_CPU_LITTLE_ENDIAN 1
#  elif defined(__BIG_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXH_CPU_LITTLE_ENDIAN 0
#  else
/*!
 * @internal
 * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
 *
 * Most compilers will constant fold this.
 */
static int XXH_isLittleEndian(void)
{
    /*
     * Portable and well-defined behavior.
     * Don't use static: it is detrimental to performance.
     */
    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
    return one.c[0];
}
#    define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
#  endif
#endif




/* ****************************************
 *  Compiler-specific Functions and Macros
 ******************************************/
#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)

#ifdef __has_builtin
#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
#else
#  define XXH_HAS_BUILTIN(x) 0
#endif

/*!
 * @internal
 * @def XXH_rotl32(x,r)
 * @brief 32-bit rotate left.
 *
 * @param x The 32-bit integer to be rotated.
 * @param r The number of bits to rotate.
 * @pre
 *   @p r > 0 && @p r < 32
 * @note
 *   @p x and @p r may be evaluated multiple times.
 * @return The rotated result.
 */
#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
#  define XXH_rotl32 __builtin_rotateleft32
#  define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for minGW (GCC under Windows), performance seems poor */
performance seems poor */1598#elif defined(_MSC_VER)1599# define XXH_rotl32(x,r) _rotl(x,r)1600# define XXH_rotl64(x,r) _rotl64(x,r)1601#else1602# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))1603# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))1604#endif16051606/*!1607* @internal1608* @fn xxh_u32 XXH_swap32(xxh_u32 x)1609* @brief A 32-bit byteswap.1610*1611* @param x The 32-bit integer to byteswap.1612* @return @p x, byteswapped.1613*/1614#if defined(_MSC_VER) /* Visual Studio */1615# define XXH_swap32 _byteswap_ulong1616#elif XXH_GCC_VERSION >= 4031617# define XXH_swap32 __builtin_bswap321618#else1619static xxh_u32 XXH_swap32 (xxh_u32 x)1620{1621return ((x << 24) & 0xff000000 ) |1622((x << 8) & 0x00ff0000 ) |1623((x >> 8) & 0x0000ff00 ) |1624((x >> 24) & 0x000000ff );1625}1626#endif162716281629/* ***************************1630* Memory reads1631*****************************/16321633/*!1634* @internal1635* @brief Enum to indicate whether a pointer is aligned.1636*/1637typedef enum {1638XXH_aligned, /*!< Aligned */1639XXH_unaligned /*!< Possibly unaligned */1640} XXH_alignment;16411642/*1643* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.1644*1645* This is ideal for older compilers which don't inline memcpy.1646*/1647#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))16481649XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)1650{1651const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;1652return bytePtr[0]1653| ((xxh_u32)bytePtr[1] << 8)1654| ((xxh_u32)bytePtr[2] << 16)1655| ((xxh_u32)bytePtr[3] << 24);1656}16571658XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)1659{1660const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;1661return bytePtr[3]1662| ((xxh_u32)bytePtr[2] << 8)1663| ((xxh_u32)bytePtr[1] << 16)1664| ((xxh_u32)bytePtr[0] << 24);1665}16661667#else1668XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)1669{1670return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));1671}16721673static xxh_u32 XXH_readBE32(const void* ptr)1674{1675return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);1676}1677#endif16781679XXH_FORCE_INLINE xxh_u321680XXH_readLE32_align(const void* ptr, XXH_alignment align)1681{1682if (align==XXH_unaligned) {1683return XXH_readLE32(ptr);1684} else {1685return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);1686}1687}168816891690/* *************************************1691* Misc1692***************************************/1693/*! 
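 * Added illustration (not part of the original documentation): a runtime
 * check that the linked library matches the header it was built against:
 *
 *     if (XXH_versionNumber() != XXH_VERSION_NUMBER) {
 *         // header and library disagree; handle as needed
 *     }
 *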
@ingroup public */1694XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }169516961697/* *******************************************************************1698* 32-bit hash functions1699*********************************************************************/1700/*!1701* @}1702* @defgroup xxh32_impl XXH32 implementation1703* @ingroup impl1704* @{1705*/1706static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U; /*!< 0b10011110001101110111100110110001 */1707static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U; /*!< 0b10000101111010111100101001110111 */1708static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU; /*!< 0b11000010101100101010111000111101 */1709static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU; /*!< 0b00100111110101001110101100101111 */1710static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U; /*!< 0b00010110010101100110011110110001 */17111712#ifdef XXH_OLD_NAMES1713# define PRIME32_1 XXH_PRIME32_11714# define PRIME32_2 XXH_PRIME32_21715# define PRIME32_3 XXH_PRIME32_31716# define PRIME32_4 XXH_PRIME32_41717# define PRIME32_5 XXH_PRIME32_51718#endif17191720/*!1721* @internal1722* @brief Normal stripe processing routine.1723*1724* This shuffles the bits so that any bit from @p input impacts several bits in1725* @p acc.1726*1727* @param acc The accumulator lane.1728* @param input The stripe of input to mix.1729* @return The mixed accumulator lane.1730*/1731static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)1732{1733acc += input * XXH_PRIME32_2;1734acc = XXH_rotl32(acc, 13);1735acc *= XXH_PRIME32_1;1736#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)1737/*1738* UGLY HACK:1739* This inline assembly hack forces acc into a normal register. This is the1740* only thing that prevents GCC and Clang from autovectorizing the XXH321741* loop (pragmas and attributes don't work for some resason) without globally1742* disabling SSE4.1.1743*1744* The reason we want to avoid vectorization is because despite working on1745* 4 integers at a time, there are multiple factors slowing XXH32 down on1746* SSE4:1747* - There's a ridiculous amount of lag from pmulld (10 cycles of latency on1748* newer chips!) making it slightly slower to multiply four integers at1749* once compared to four integers independently. Even when pmulld was1750* fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE1751* just to multiply unless doing a long operation.1752*1753* - Four instructions are required to rotate,1754* movqda tmp, v // not required with VEX encoding1755* pslld tmp, 13 // tmp <<= 131756* psrld v, 19 // x >>= 191757* por v, tmp // x |= tmp1758* compared to one for scalar:1759* roll v, 13 // reliably fast across the board1760* shldl v, v, 13 // Sandy Bridge and later prefer this for some reason1761*1762* - Instruction level parallelism is actually more beneficial here because1763* the SIMD actually serializes this operation: While v1 is rotating, v21764* can load data, while v3 can multiply. SSE forces them to operate1765* together.1766*1767* How this hack works:1768* __asm__("" // Declare an assembly block but don't declare any instructions1769* : // However, as an Input/Output Operand,1770* "+r" // constrain a read/write operand (+) as a general purpose register (r).1771* (acc) // and set acc as the operand1772* );1773*1774* Because of the 'r', the compiler has promised that seed will be in a1775* general purpose register and the '+' says that it will be 'read/write',1776* so it has to assume it has changed. 
It is like volatile without all the1777* loads and stores.1778*1779* Since the argument has to be in a normal register (not an SSE register),1780* each time XXH32_round is called, it is impossible to vectorize.1781*/1782__asm__("" : "+r" (acc));1783#endif1784return acc;1785}17861787/*!1788* @internal1789* @brief Mixes all bits to finalize the hash.1790*1791* The final mix ensures that all input bits have a chance to impact any bit in1792* the output digest, resulting in an unbiased distribution.1793*1794* @param h32 The hash to avalanche.1795* @return The avalanched hash.1796*/1797static xxh_u32 XXH32_avalanche(xxh_u32 h32)1798{1799h32 ^= h32 >> 15;1800h32 *= XXH_PRIME32_2;1801h32 ^= h32 >> 13;1802h32 *= XXH_PRIME32_3;1803h32 ^= h32 >> 16;1804return(h32);1805}18061807#define XXH_get32bits(p) XXH_readLE32_align(p, align)18081809/*!1810* @internal1811* @brief Processes the last 0-15 bytes of @p ptr.1812*1813* There may be up to 15 bytes remaining to consume from the input.1814* This final stage will digest them to ensure that all input bytes are present1815* in the final mix.1816*1817* @param h32 The hash to finalize.1818* @param ptr The pointer to the remaining input.1819* @param len The remaining length, modulo 16.1820* @param align Whether @p ptr is aligned.1821* @return The finalized hash.1822*/1823static xxh_u321824XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)1825{1826#define XXH_PROCESS1 do { \1827h32 += (*ptr++) * XXH_PRIME32_5; \1828h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \1829} while (0)18301831#define XXH_PROCESS4 do { \1832h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \1833ptr += 4; \1834h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \1835} while (0)18361837/* Compact rerolled version */1838if (XXH_REROLL) {1839len &= 15;1840while (len >= 4) {1841XXH_PROCESS4;1842len -= 4;1843}1844while (len > 0) {1845XXH_PROCESS1;1846--len;1847}1848return XXH32_avalanche(h32);1849} else {1850switch(len&15) /* or switch(bEnd - p) */ {1851case 12: XXH_PROCESS4;1852/* fallthrough */1853case 8: XXH_PROCESS4;1854/* fallthrough */1855case 4: XXH_PROCESS4;1856return XXH32_avalanche(h32);18571858case 13: XXH_PROCESS4;1859/* fallthrough */1860case 9: XXH_PROCESS4;1861/* fallthrough */1862case 5: XXH_PROCESS4;1863XXH_PROCESS1;1864return XXH32_avalanche(h32);18651866case 14: XXH_PROCESS4;1867/* fallthrough */1868case 10: XXH_PROCESS4;1869/* fallthrough */1870case 6: XXH_PROCESS4;1871XXH_PROCESS1;1872XXH_PROCESS1;1873return XXH32_avalanche(h32);18741875case 15: XXH_PROCESS4;1876/* fallthrough */1877case 11: XXH_PROCESS4;1878/* fallthrough */1879case 7: XXH_PROCESS4;1880/* fallthrough */1881case 3: XXH_PROCESS1;1882/* fallthrough */1883case 2: XXH_PROCESS1;1884/* fallthrough */1885case 1: XXH_PROCESS1;1886/* fallthrough */1887case 0: return XXH32_avalanche(h32);1888}1889XXH_ASSERT(0);1890return h32; /* reaching this point is deemed impossible */1891}1892}18931894#ifdef XXH_OLD_NAMES1895# define PROCESS1 XXH_PROCESS11896# define PROCESS4 XXH_PROCESS41897#else1898# undef XXH_PROCESS11899# undef XXH_PROCESS41900#endif19011902/*!1903* @internal1904* @brief The implementation for @ref XXH32().1905*1906* @param input, len, seed Directly passed from @ref XXH32().1907* @param align Whether @p input is aligned.1908* @return The calculated hash.1909*/1910XXH_FORCE_INLINE xxh_u321911XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)1912{1913const xxh_u8* bEnd = input + len;1914xxh_u32 h32;19151916#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && 
(XXH_ACCEPT_NULL_INPUT_POINTER>=1)1917if (input==NULL) {1918len=0;1919bEnd=input=(const xxh_u8*)(size_t)16;1920}1921#endif19221923if (len>=16) {1924const xxh_u8* const limit = bEnd - 15;1925xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;1926xxh_u32 v2 = seed + XXH_PRIME32_2;1927xxh_u32 v3 = seed + 0;1928xxh_u32 v4 = seed - XXH_PRIME32_1;19291930do {1931v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;1932v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;1933v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;1934v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;1935} while (input < limit);19361937h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)1938+ XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);1939} else {1940h32 = seed + XXH_PRIME32_5;1941}19421943h32 += (xxh_u32)len;19441945return XXH32_finalize(h32, input, len&15, align);1946}19471948/*! @ingroup xxh32_family */1949XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)1950{1951#if 01952/* Simple version, good for code maintenance, but unfortunately slow for small inputs */1953XXH32_state_t state;1954XXH32_reset(&state, seed);1955XXH32_update(&state, (const xxh_u8*)input, len);1956return XXH32_digest(&state);1957#else1958if (XXH_FORCE_ALIGN_CHECK) {1959if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */1960return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);1961} }19621963return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);1964#endif1965}1966196719681969/******* Hash streaming *******/1970/*!1971* @ingroup xxh32_family1972*/1973XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)1974{1975return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));1976}1977/*! @ingroup xxh32_family */1978XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)1979{1980XXH_free(statePtr);1981return XXH_OK;1982}19831984/*! @ingroup xxh32_family */1985XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)1986{1987memcpy(dstState, srcState, sizeof(*dstState));1988}19891990/*! @ingroup xxh32_family */1991XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)1992{1993XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */1994memset(&state, 0, sizeof(state));1995state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;1996state.v2 = seed + XXH_PRIME32_2;1997state.v3 = seed + 0;1998state.v4 = seed - XXH_PRIME32_1;1999/* do not write into reserved, planned to be removed in a future version */2000memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));2001return XXH_OK;2002}200320042005/*! 
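 * Added streaming sketch (illustrative only; `part1`, `len1`, `part2` and
 * `len2` are hypothetical inputs, and error checks are elided):
 *
 *     XXH32_state_t* const state = XXH32_createState();
 *     XXH32_reset(state, 0);                  // seed = 0
 *     XXH32_update(state, part1, len1);
 *     XXH32_update(state, part2, len2);
 *     {   XXH32_hash_t const h = XXH32_digest(state);
 *         (void)h;                            // use the hash
 *     }
 *     XXH32_freeState(state);
 *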
@ingroup xxh32_family */2006XXH_PUBLIC_API XXH_errorcode2007XXH32_update(XXH32_state_t* state, const void* input, size_t len)2008{2009if (input==NULL)2010#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)2011return XXH_OK;2012#else2013return XXH_ERROR;2014#endif20152016{ const xxh_u8* p = (const xxh_u8*)input;2017const xxh_u8* const bEnd = p + len;20182019state->total_len_32 += (XXH32_hash_t)len;2020state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));20212022if (state->memsize + len < 16) { /* fill in tmp buffer */2023XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);2024state->memsize += (XXH32_hash_t)len;2025return XXH_OK;2026}20272028if (state->memsize) { /* some data left from previous update */2029XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);2030{ const xxh_u32* p32 = state->mem32;2031state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;2032state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;2033state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;2034state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));2035}2036p += 16-state->memsize;2037state->memsize = 0;2038}20392040if (p <= bEnd-16) {2041const xxh_u8* const limit = bEnd - 16;2042xxh_u32 v1 = state->v1;2043xxh_u32 v2 = state->v2;2044xxh_u32 v3 = state->v3;2045xxh_u32 v4 = state->v4;20462047do {2048v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;2049v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;2050v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;2051v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;2052} while (p<=limit);20532054state->v1 = v1;2055state->v2 = v2;2056state->v3 = v3;2057state->v4 = v4;2058}20592060if (p < bEnd) {2061XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));2062state->memsize = (unsigned)(bEnd-p);2063}2064}20652066return XXH_OK;2067}206820692070/*! @ingroup xxh32_family */2071XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)2072{2073xxh_u32 h32;20742075if (state->large_len) {2076h32 = XXH_rotl32(state->v1, 1)2077+ XXH_rotl32(state->v2, 7)2078+ XXH_rotl32(state->v3, 12)2079+ XXH_rotl32(state->v4, 18);2080} else {2081h32 = state->v3 /* == seed */ + XXH_PRIME32_5;2082}20832084h32 += state->total_len_32;20852086return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);2087}208820892090/******* Canonical representation *******/20912092/*!2093* @ingroup xxh32_family2094* The default return values from XXH functions are unsigned 32 and 64 bit2095* integers.2096*2097* The canonical representation uses big endian convention, the same convention2098* as human-readable numbers (large digits first).2099*2100* This way, hash values can be written into a file or buffer, remaining2101* comparable across different systems.2102*2103* The following functions allow transformation of hash values to and from their2104* canonical format.2105*/2106XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)2107{2108XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));2109if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);2110memcpy(dst, &hash, sizeof(*dst));2111}2112/*! 
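 * Added round-trip illustration (`h` is a hypothetical, previously computed
 * hash value):
 *
 *     XXH32_canonical_t canon;
 *     XXH32_canonicalFromHash(&canon, h);     // big endian byte array
 *     XXH32_hash_t const back = XXH32_hashFromCanonical(&canon);
 *     // back == h on any platform, regardless of endianness
 *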
@ingroup xxh32_family */2113XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)2114{2115return XXH_readBE32(src);2116}211721182119#ifndef XXH_NO_LONG_LONG21202121/* *******************************************************************2122* 64-bit hash functions2123*********************************************************************/2124/*!2125* @}2126* @ingroup impl2127* @{2128*/2129/******* Memory access *******/21302131typedef XXH64_hash_t xxh_u64;21322133#ifdef XXH_OLD_NAMES2134# define U64 xxh_u642135#endif21362137/*!2138* XXH_REROLL_XXH64:2139* Whether to reroll the XXH64_finalize() loop.2140*2141* Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a2142* performance gain on 64-bit hosts, as only one jump is required.2143*2144* However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit2145* registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial2146* to unroll. The code becomes ridiculously large (the largest function in the2147* binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is2148* also slightly faster because it fits into cache better and is more likely2149* to be inlined by the compiler.2150*2151* If XXH_REROLL is defined, this is ignored and the loop is always rerolled.2152*/2153#ifndef XXH_REROLL_XXH642154# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \2155|| !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \2156|| defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \2157|| defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \2158|| defined(__mips64__) || defined(__mips64)) /* mips64 */ \2159|| (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */2160# define XXH_REROLL_XXH64 12161# else2162# define XXH_REROLL_XXH64 02163# endif2164#endif /* !defined(XXH_REROLL_XXH64) */21652166#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))2167/*2168* Manual byteshift. Best for old compilers which don't inline memcpy.2169* We actually directly use XXH_readLE64 and XXH_readBE64.2170*/2171#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))21722173/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */2174static xxh_u64 XXH_read64(const void* memPtr)2175{2176return *(const xxh_u64*) memPtr;2177}21782179#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))21802181/*2182* __pack instructions are safer, but compiler specific, hence potentially2183* problematic for some compilers.2184*2185* Currently only defined for GCC and ICC.2186*/2187#ifdef XXH_OLD_NAMES2188typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;2189#endif2190static xxh_u64 XXH_read64(const void* ptr)2191{2192typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;2193return ((const xxh_unalign64*)ptr)->u64;2194}21952196#else21972198/*2199* Portable and safe solution. 
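 * (Added observation: because the copy size is a compile-time constant, most
 * optimizing compilers are expected to fold this memcpy() into a single load
 * rather than emit a library call.)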
Generally efficient.2200* see: https://stackoverflow.com/a/32095106/6469472201*/2202static xxh_u64 XXH_read64(const void* memPtr)2203{2204xxh_u64 val;2205memcpy(&val, memPtr, sizeof(val));2206return val;2207}22082209#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */22102211#if defined(_MSC_VER) /* Visual Studio */2212# define XXH_swap64 _byteswap_uint642213#elif XXH_GCC_VERSION >= 4032214# define XXH_swap64 __builtin_bswap642215#else2216static xxh_u64 XXH_swap64(xxh_u64 x)2217{2218return ((x << 56) & 0xff00000000000000ULL) |2219((x << 40) & 0x00ff000000000000ULL) |2220((x << 24) & 0x0000ff0000000000ULL) |2221((x << 8) & 0x000000ff00000000ULL) |2222((x >> 8) & 0x00000000ff000000ULL) |2223((x >> 24) & 0x0000000000ff0000ULL) |2224((x >> 40) & 0x000000000000ff00ULL) |2225((x >> 56) & 0x00000000000000ffULL);2226}2227#endif222822292230/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */2231#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))22322233XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)2234{2235const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2236return bytePtr[0]2237| ((xxh_u64)bytePtr[1] << 8)2238| ((xxh_u64)bytePtr[2] << 16)2239| ((xxh_u64)bytePtr[3] << 24)2240| ((xxh_u64)bytePtr[4] << 32)2241| ((xxh_u64)bytePtr[5] << 40)2242| ((xxh_u64)bytePtr[6] << 48)2243| ((xxh_u64)bytePtr[7] << 56);2244}22452246XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)2247{2248const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2249return bytePtr[7]2250| ((xxh_u64)bytePtr[6] << 8)2251| ((xxh_u64)bytePtr[5] << 16)2252| ((xxh_u64)bytePtr[4] << 24)2253| ((xxh_u64)bytePtr[3] << 32)2254| ((xxh_u64)bytePtr[2] << 40)2255| ((xxh_u64)bytePtr[1] << 48)2256| ((xxh_u64)bytePtr[0] << 56);2257}22582259#else2260XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)2261{2262return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));2263}22642265static xxh_u64 XXH_readBE64(const void* ptr)2266{2267return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);2268}2269#endif22702271XXH_FORCE_INLINE xxh_u642272XXH_readLE64_align(const void* ptr, XXH_alignment align)2273{2274if (align==XXH_unaligned)2275return XXH_readLE64(ptr);2276else2277return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);2278}227922802281/******* xxh64 *******/2282/*!2283* @}2284* @defgroup xxh64_impl XXH64 implementation2285* @ingroup impl2286* @{2287*/2288static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */2289static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */2290static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */2291static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */2292static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */22932294#ifdef XXH_OLD_NAMES2295# define PRIME64_1 XXH_PRIME64_12296# define PRIME64_2 XXH_PRIME64_22297# define PRIME64_3 XXH_PRIME64_32298# define PRIME64_4 XXH_PRIME64_42299# define PRIME64_5 XXH_PRIME64_52300#endif23012302static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)2303{2304acc += input * XXH_PRIME64_2;2305acc = XXH_rotl64(acc, 31);2306acc *= XXH_PRIME64_1;2307return acc;2308}23092310static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)2311{2312val = XXH64_round(0, val);2313acc ^= val;2314acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;2315return acc;2316}23172318static xxh_u64 XXH64_avalanche(xxh_u64 h64)2319{2320h64 ^= h64 >> 33;2321h64 *= XXH_PRIME64_2;2322h64 ^= h64 >> 29;2323h64 *= XXH_PRIME64_3;2324h64 ^= h64 >> 32;2325return h64;2326}232723282329#define XXH_get64bits(p) XXH_readLE64_align(p, align)23302331static xxh_u642332XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)2333{2334#define XXH_PROCESS1_64 do { \2335h64 ^= (*ptr++) * XXH_PRIME64_5; \2336h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \2337} while (0)23382339#define XXH_PROCESS4_64 do { \2340h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \2341ptr += 4; \2342h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \2343} while (0)23442345#define XXH_PROCESS8_64 do { \2346xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \2347ptr += 8; \2348h64 ^= k1; \2349h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; \2350} while (0)23512352/* Rerolled version for 32-bit targets is faster and much smaller. 
*/2353if (XXH_REROLL || XXH_REROLL_XXH64) {2354len &= 31;2355while (len >= 8) {2356XXH_PROCESS8_64;2357len -= 8;2358}2359if (len >= 4) {2360XXH_PROCESS4_64;2361len -= 4;2362}2363while (len > 0) {2364XXH_PROCESS1_64;2365--len;2366}2367return XXH64_avalanche(h64);2368} else {2369switch(len & 31) {2370case 24: XXH_PROCESS8_64;2371/* fallthrough */2372case 16: XXH_PROCESS8_64;2373/* fallthrough */2374case 8: XXH_PROCESS8_64;2375return XXH64_avalanche(h64);23762377case 28: XXH_PROCESS8_64;2378/* fallthrough */2379case 20: XXH_PROCESS8_64;2380/* fallthrough */2381case 12: XXH_PROCESS8_64;2382/* fallthrough */2383case 4: XXH_PROCESS4_64;2384return XXH64_avalanche(h64);23852386case 25: XXH_PROCESS8_64;2387/* fallthrough */2388case 17: XXH_PROCESS8_64;2389/* fallthrough */2390case 9: XXH_PROCESS8_64;2391XXH_PROCESS1_64;2392return XXH64_avalanche(h64);23932394case 29: XXH_PROCESS8_64;2395/* fallthrough */2396case 21: XXH_PROCESS8_64;2397/* fallthrough */2398case 13: XXH_PROCESS8_64;2399/* fallthrough */2400case 5: XXH_PROCESS4_64;2401XXH_PROCESS1_64;2402return XXH64_avalanche(h64);24032404case 26: XXH_PROCESS8_64;2405/* fallthrough */2406case 18: XXH_PROCESS8_64;2407/* fallthrough */2408case 10: XXH_PROCESS8_64;2409XXH_PROCESS1_64;2410XXH_PROCESS1_64;2411return XXH64_avalanche(h64);24122413case 30: XXH_PROCESS8_64;2414/* fallthrough */2415case 22: XXH_PROCESS8_64;2416/* fallthrough */2417case 14: XXH_PROCESS8_64;2418/* fallthrough */2419case 6: XXH_PROCESS4_64;2420XXH_PROCESS1_64;2421XXH_PROCESS1_64;2422return XXH64_avalanche(h64);24232424case 27: XXH_PROCESS8_64;2425/* fallthrough */2426case 19: XXH_PROCESS8_64;2427/* fallthrough */2428case 11: XXH_PROCESS8_64;2429XXH_PROCESS1_64;2430XXH_PROCESS1_64;2431XXH_PROCESS1_64;2432return XXH64_avalanche(h64);24332434case 31: XXH_PROCESS8_64;2435/* fallthrough */2436case 23: XXH_PROCESS8_64;2437/* fallthrough */2438case 15: XXH_PROCESS8_64;2439/* fallthrough */2440case 7: XXH_PROCESS4_64;2441/* fallthrough */2442case 3: XXH_PROCESS1_64;2443/* fallthrough */2444case 2: XXH_PROCESS1_64;2445/* fallthrough */2446case 1: XXH_PROCESS1_64;2447/* fallthrough */2448case 0: return XXH64_avalanche(h64);2449}2450}2451/* impossible to reach */2452XXH_ASSERT(0);2453return 0; /* unreachable, but some compilers complain without it */2454}24552456#ifdef XXH_OLD_NAMES2457# define PROCESS1_64 XXH_PROCESS1_642458# define PROCESS4_64 XXH_PROCESS4_642459# define PROCESS8_64 XXH_PROCESS8_642460#else2461# undef XXH_PROCESS1_642462# undef XXH_PROCESS4_642463# undef XXH_PROCESS8_642464#endif24652466XXH_FORCE_INLINE xxh_u642467XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)2468{2469const xxh_u8* bEnd = input + len;2470xxh_u64 h64;24712472#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)2473if (input==NULL) {2474len=0;2475bEnd=input=(const xxh_u8*)(size_t)32;2476}2477#endif24782479if (len>=32) {2480const xxh_u8* const limit = bEnd - 32;2481xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;2482xxh_u64 v2 = seed + XXH_PRIME64_2;2483xxh_u64 v3 = seed + 0;2484xxh_u64 v4 = seed - XXH_PRIME64_1;24852486do {2487v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;2488v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;2489v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;2490v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;2491} while (input<=limit);24922493h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);2494h64 = XXH64_mergeRound(h64, v1);2495h64 = XXH64_mergeRound(h64, 
v2);2496h64 = XXH64_mergeRound(h64, v3);2497h64 = XXH64_mergeRound(h64, v4);24982499} else {2500h64 = seed + XXH_PRIME64_5;2501}25022503h64 += (xxh_u64) len;25042505return XXH64_finalize(h64, input, len, align);2506}250725082509/*! @ingroup xxh64_family */2510XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)2511{2512#if 02513/* Simple version, good for code maintenance, but unfortunately slow for small inputs */2514XXH64_state_t state;2515XXH64_reset(&state, seed);2516XXH64_update(&state, (const xxh_u8*)input, len);2517return XXH64_digest(&state);2518#else2519if (XXH_FORCE_ALIGN_CHECK) {2520if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */2521return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);2522} }25232524return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);25252526#endif2527}25282529/******* Hash Streaming *******/25302531/*! @ingroup xxh64_family*/2532XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)2533{2534return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));2535}2536/*! @ingroup xxh64_family */2537XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)2538{2539XXH_free(statePtr);2540return XXH_OK;2541}25422543/*! @ingroup xxh64_family */2544XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)2545{2546memcpy(dstState, srcState, sizeof(*dstState));2547}25482549/*! @ingroup xxh64_family */2550XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)2551{2552XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */2553memset(&state, 0, sizeof(state));2554state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;2555state.v2 = seed + XXH_PRIME64_2;2556state.v3 = seed + 0;2557state.v4 = seed - XXH_PRIME64_1;2558/* do not write into reserved64, might be removed in a future version */2559memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));2560return XXH_OK;2561}25622563/*! 
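 * Added illustration of the one-shot/streaming equivalence (hypothetical
 * `state`, `data`, `len`, `split` and `seed`, with `split <= len`): splitting
 * the input across several updates does not change the digest;
 *
 *     XXH64_reset(state, seed);
 *     XXH64_update(state, data, split);
 *     XXH64_update(state, (const char*)data + split, len - split);
 *     // XXH64_digest(state) == XXH64(data, len, seed)
 *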
@ingroup xxh64_family */2564XXH_PUBLIC_API XXH_errorcode2565XXH64_update (XXH64_state_t* state, const void* input, size_t len)2566{2567if (input==NULL)2568#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)2569return XXH_OK;2570#else2571return XXH_ERROR;2572#endif25732574{ const xxh_u8* p = (const xxh_u8*)input;2575const xxh_u8* const bEnd = p + len;25762577state->total_len += len;25782579if (state->memsize + len < 32) { /* fill in tmp buffer */2580XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);2581state->memsize += (xxh_u32)len;2582return XXH_OK;2583}25842585if (state->memsize) { /* tmp buffer is full */2586XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);2587state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));2588state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));2589state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));2590state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));2591p += 32 - state->memsize;2592state->memsize = 0;2593}25942595if (p+32 <= bEnd) {2596const xxh_u8* const limit = bEnd - 32;2597xxh_u64 v1 = state->v1;2598xxh_u64 v2 = state->v2;2599xxh_u64 v3 = state->v3;2600xxh_u64 v4 = state->v4;26012602do {2603v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;2604v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;2605v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;2606v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;2607} while (p<=limit);26082609state->v1 = v1;2610state->v2 = v2;2611state->v3 = v3;2612state->v4 = v4;2613}26142615if (p < bEnd) {2616XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));2617state->memsize = (unsigned)(bEnd-p);2618}2619}26202621return XXH_OK;2622}262326242625/*! @ingroup xxh64_family */2626XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)2627{2628xxh_u64 h64;26292630if (state->total_len >= 32) {2631xxh_u64 const v1 = state->v1;2632xxh_u64 const v2 = state->v2;2633xxh_u64 const v3 = state->v3;2634xxh_u64 const v4 = state->v4;26352636h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);2637h64 = XXH64_mergeRound(h64, v1);2638h64 = XXH64_mergeRound(h64, v2);2639h64 = XXH64_mergeRound(h64, v3);2640h64 = XXH64_mergeRound(h64, v4);2641} else {2642h64 = state->v3 /*seed*/ + XXH_PRIME64_5;2643}26442645h64 += (xxh_u64) state->total_len;26462647return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);2648}264926502651/******* Canonical representation *******/26522653/*! @ingroup xxh64_family */2654XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)2655{2656XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));2657if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);2658memcpy(dst, &hash, sizeof(*dst));2659}26602661/*! 
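 * Added note: the canonical form is a plain big endian byte array, so it can
 * be written to storage or sent over a network as-is, for example
 * (hypothetical `FILE* f` and hash value `h64`):
 *
 *     XXH64_canonical_t canon;
 *     XXH64_canonicalFromHash(&canon, h64);
 *     fwrite(&canon, sizeof(canon), 1, f);
 *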
@ingroup xxh64_family */2662XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)2663{2664return XXH_readBE64(src);2665}2666266726682669/* *********************************************************************2670* XXH32671* New generation hash designed for speed on small keys and vectorization2672************************************************************************ */2673/*!2674* @}2675* @defgroup xxh3_impl XXH3 implementation2676* @ingroup impl2677* @{2678*/26792680/* === Compiler specifics === */26812682#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */2683# define XXH_RESTRICT restrict2684#else2685/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */2686# define XXH_RESTRICT /* disable */2687#endif26882689#if (defined(__GNUC__) && (__GNUC__ >= 3)) \2690|| (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \2691|| defined(__clang__)2692# define XXH_likely(x) __builtin_expect(x, 1)2693# define XXH_unlikely(x) __builtin_expect(x, 0)2694#else2695# define XXH_likely(x) (x)2696# define XXH_unlikely(x) (x)2697#endif26982699#if defined(__GNUC__)2700# if defined(__AVX2__)2701# include <immintrin.h>2702# elif defined(__SSE2__)2703# include <emmintrin.h>2704# elif defined(__ARM_NEON__) || defined(__ARM_NEON)2705# define inline __inline__ /* circumvent a clang bug */2706# include <arm_neon.h>2707# undef inline2708# endif2709#elif defined(_MSC_VER)2710# include <intrin.h>2711#endif27122713/*2714* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while2715* remaining a true 64-bit/128-bit hash function.2716*2717* This is done by prioritizing a subset of 64-bit operations that can be2718* emulated without too many steps on the average 32-bit machine.2719*2720* For example, these two lines seem similar, and run equally fast on 64-bit:2721*2722* xxh_u64 x;2723* x ^= (x >> 47); // good2724* x ^= (x >> 13); // bad2725*2726* However, to a 32-bit machine, there is a major difference.2727*2728* x ^= (x >> 47) looks like this:2729*2730* x.lo ^= (x.hi >> (47 - 32));2731*2732* while x ^= (x >> 13) looks like this:2733*2734* // note: funnel shifts are not usually cheap.2735* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));2736* x.hi ^= (x.hi >> 13);2737*2738* The first one is significantly faster than the second, simply because the2739* shift is larger than 32. This means:2740* - All the bits we need are in the upper 32 bits, so we can ignore the lower2741* 32 bits in the shift.2742* - The shift result will always fit in the lower 32 bits, and therefore,2743* we can ignore the upper 32 bits in the xor.2744*2745* Thanks to this optimization, XXH3 only requires these features to be efficient:2746*2747* - Usable unaligned access2748* - A 32-bit or 64-bit ALU2749* - If 32-bit, a decent ADC instruction2750* - A 32 or 64-bit multiply with a 64-bit result2751* - For the 128-bit variant, a decent byteswap helps short inputs.2752*2753* The first two are already required by XXH32, and almost all 32-bit and 64-bit2754* platforms which can run XXH32 can run XXH3 efficiently.2755*2756* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one2757* notable exception.2758*2759* First of all, Thumb-1 lacks support for the UMULL instruction which2760* performs the important long multiply. 
This means numerous __aeabi_lmul2761* calls.2762*2763* Second of all, the 8 functional registers are just not enough.2764* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need2765* Lo registers, and this shuffling results in thousands more MOVs than A32.2766*2767* A32 and T32 don't have this limitation. They can access all 14 registers,2768* do a 32->64 multiply with UMULL, and the flexible operand allowing free2769* shifts is helpful, too.2770*2771* Therefore, we do a quick sanity check.2772*2773* If compiling Thumb-1 for a target which supports ARM instructions, we will2774* emit a warning, as it is not a "sane" platform to compile for.2775*2776* Usually, if this happens, it is because of an accident and you probably need2777* to specify -march, as you likely meant to compile for a newer architecture.2778*2779* Credit: large sections of the vectorial and asm source code paths2780* have been contributed by @easyaspi3142781*/2782#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)2783# warning "XXH3 is highly inefficient without ARM or Thumb-2."2784#endif27852786/* ==========================================2787* Vectorization detection2788* ========================================== */27892790#ifdef XXH_DOXYGEN2791/*!2792* @ingroup tuning2793* @brief Overrides the vectorization implementation chosen for XXH3.2794*2795* Can be defined to 0 to disable SIMD or any of the values mentioned in2796* @ref XXH_VECTOR_TYPE.2797*2798* If this is not defined, it uses predefined macros to determine the best2799* implementation.2800*/2801# define XXH_VECTOR XXH_SCALAR2802/*!2803* @ingroup tuning2804* @brief Possible values for @ref XXH_VECTOR.2805*2806* Note that these are actually implemented as macros.2807*2808* If this is not defined, it is detected automatically.2809* @ref XXH_X86DISPATCH overrides this.2810*/2811enum XXH_VECTOR_TYPE /* fake enum */ {2812XXH_SCALAR = 0, /*!< Portable scalar version */2813XXH_SSE2 = 1, /*!<2814* SSE2 for Pentium 4, Opteron, all x86_64.2815*2816* @note SSE2 is also guaranteed on Windows 10, macOS, and2817* Android x86.2818*/2819XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */2820XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */2821XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */2822XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */2823};2824/*!2825* @ingroup tuning2826* @brief Selects the minimum alignment for XXH3's accumulators.2827*2828* When using SIMD, this should match the alignment reqired for said vector2829* type, so, for example, 32 for AVX2.2830*2831* Default: Auto detected.2832*/2833# define XXH_ACC_ALIGN 82834#endif28352836/* Actual definition */2837#ifndef XXH_DOXYGEN2838# define XXH_SCALAR 02839# define XXH_SSE2 12840# define XXH_AVX2 22841# define XXH_AVX512 32842# define XXH_NEON 42843# define XXH_VSX 52844#endif28452846#ifndef XXH_VECTOR /* can be defined on command line */2847# if defined(__AVX512F__)2848# define XXH_VECTOR XXH_AVX5122849# elif defined(__AVX2__)2850# define XXH_VECTOR XXH_AVX22851# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))2852# define XXH_VECTOR XXH_SSE22853# elif defined(__GNUC__) /* msvc support maybe later */ \2854&& (defined(__ARM_NEON__) || defined(__ARM_NEON)) \2855&& (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \2856|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))2857# define XXH_VECTOR XXH_NEON2858# elif (defined(__PPC64__) && 
defined(__POWER8_VECTOR__)) \2859|| (defined(__s390x__) && defined(__VEC__)) \2860&& defined(__GNUC__) /* TODO: IBM XL */2861# define XXH_VECTOR XXH_VSX2862# else2863# define XXH_VECTOR XXH_SCALAR2864# endif2865#endif28662867/*2868* Controls the alignment of the accumulator,2869* for compatibility with aligned vector loads, which are usually faster.2870*/2871#ifndef XXH_ACC_ALIGN2872# if defined(XXH_X86DISPATCH)2873# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */2874# elif XXH_VECTOR == XXH_SCALAR /* scalar */2875# define XXH_ACC_ALIGN 82876# elif XXH_VECTOR == XXH_SSE2 /* sse2 */2877# define XXH_ACC_ALIGN 162878# elif XXH_VECTOR == XXH_AVX2 /* avx2 */2879# define XXH_ACC_ALIGN 322880# elif XXH_VECTOR == XXH_NEON /* neon */2881# define XXH_ACC_ALIGN 162882# elif XXH_VECTOR == XXH_VSX /* vsx */2883# define XXH_ACC_ALIGN 162884# elif XXH_VECTOR == XXH_AVX512 /* avx512 */2885# define XXH_ACC_ALIGN 642886# endif2887#endif28882889#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \2890|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX5122891# define XXH_SEC_ALIGN XXH_ACC_ALIGN2892#else2893# define XXH_SEC_ALIGN 82894#endif28952896/*2897* UGLY HACK:2898* GCC usually generates the best code with -O3 for xxHash.2899*2900* However, when targeting AVX2, it is overzealous in its unrolling resulting2901* in code roughly 3/4 the speed of Clang.2902*2903* There are other issues, such as GCC splitting _mm256_loadu_si256 into2904* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which2905* only applies to Sandy and Ivy Bridge... which don't even support AVX2.2906*2907* That is why when compiling the AVX2 version, it is recommended to use either2908* -O2 -mavx2 -march=haswell2909* or2910* -O2 -mavx2 -mno-avx256-split-unaligned-load2911* for decent performance, or to use Clang instead.2912*2913* Fortunately, we can control the first one with a pragma that forces GCC into2914* -O2, but the other one we can't control without "failed to inline always2915* inline function due to target mismatch" warnings.2916*/2917#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \2918&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \2919&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */2920# pragma GCC push_options2921# pragma GCC optimize("-O2")2922#endif292329242925#if XXH_VECTOR == XXH_NEON2926/*2927* NEON's setup for vmlal_u32 is a little more complicated than it is on2928* SSE2, AVX2, and VSX.2929*2930* While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.2931*2932* To do the same operation, the 128-bit 'Q' register needs to be split into2933* two 64-bit 'D' registers, performing this operation::2934*2935* [ a | b ]2936* | '---------. .--------' |2937* | x |2938* | .---------' '--------. |2939* [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]2940*2941* Due to significant changes in aarch64, the fastest method for aarch64 is2942* completely different than the fastest method for ARMv7-A.2943*2944* ARMv7-A treats D registers as unions overlaying Q registers, so modifying2945* D11 will modify the high half of Q5. 
This is similar to how modifying AH2946* will only affect bits 8-15 of AX on x86.2947*2948* VZIP takes two registers, and puts even lanes in one register and odd lanes2949* in the other.2950*2951* On ARMv7-A, this strangely modifies both parameters in place instead of2952* taking the usual 3-operand form.2953*2954* Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the2955* lower and upper halves of the Q register to end up with the high and low2956* halves where we want - all in one instruction.2957*2958* vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }2959*2960* Unfortunately we need inline assembly for this: Instructions modifying two2961* registers at once is not possible in GCC or Clang's IR, and they have to2962* create a copy.2963*2964* aarch64 requires a different approach.2965*2966* In order to make it easier to write a decent compiler for aarch64, many2967* quirks were removed, such as conditional execution.2968*2969* NEON was also affected by this.2970*2971* aarch64 cannot access the high bits of a Q-form register, and writes to a2972* D-form register zero the high bits, similar to how writes to W-form scalar2973* registers (or DWORD registers on x86_64) work.2974*2975* The formerly free vget_high intrinsics now require a vext (with a few2976* exceptions)2977*2978* Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent2979* of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one2980* operand.2981*2982* The equivalent of the VZIP.32 on the lower and upper halves would be this2983* mess:2984*2985* ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }2986* zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }2987* zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }2988*2989* Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):2990*2991* shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);2992* xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);2993*2994* This is available on ARMv7-A, but is less efficient than a single VZIP.32.2995*/29962997/*!2998* Function-like macro:2999* void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)3000* {3001* outLo = (uint32x2_t)(in & 0xFFFFFFFF);3002* outHi = (uint32x2_t)(in >> 32);3003* in = UNDEFINED;3004* }3005*/3006# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \3007&& defined(__GNUC__) \3008&& !defined(__aarch64__) && !defined(__arm64__)3009# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \3010do { \3011/* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \3012/* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \3013/* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \3014__asm__("vzip.32 %e0, %f0" : "+w" (in)); \3015(outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \3016(outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \3017} while (0)3018# else3019# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \3020do { \3021(outLo) = vmovn_u64 (in); \3022(outHi) = vshrn_n_u64 ((in), 32); \3023} while (0)3024# endif3025#endif /* XXH_VECTOR == XXH_NEON */30263027/*3028* VSX and Z Vector helpers.3029*3030* This is very messy, and any pull requests to clean this up are welcome.3031*3032* There are a lot of problems with supporting VSX and s390x, due to3033* inconsistent intrinsics, spotty coverage, and multiple endiannesses.3034*/3035#if XXH_VECTOR == XXH_VSX3036# if defined(__s390x__)3037# include 
<s390intrin.h>3038# else3039/* gcc's altivec.h can have the unwanted consequence to unconditionally3040* #define bool, vector, and pixel keywords,3041* with bad consequences for programs already using these keywords for other purposes.3042* The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.3043* __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,3044* but it seems that, in some cases, it isn't.3045* Force the build macro to be defined, so that keywords are not altered.3046*/3047# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)3048# define __APPLE_ALTIVEC__3049# endif3050# include <altivec.h>3051# endif30523053typedef __vector unsigned long long xxh_u64x2;3054typedef __vector unsigned char xxh_u8x16;3055typedef __vector unsigned xxh_u32x4;30563057# ifndef XXH_VSX_BE3058# if defined(__BIG_ENDIAN__) \3059|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)3060# define XXH_VSX_BE 13061# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__3062# warning "-maltivec=be is not recommended. Please use native endianness."3063# define XXH_VSX_BE 13064# else3065# define XXH_VSX_BE 03066# endif3067# endif /* !defined(XXH_VSX_BE) */30683069# if XXH_VSX_BE3070# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))3071# define XXH_vec_revb vec_revb3072# else3073/*!3074* A polyfill for POWER9's vec_revb().3075*/3076XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)3077{3078xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,30790x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };3080return vec_perm(val, val, vByteSwap);3081}3082# endif3083# endif /* XXH_VSX_BE */30843085/*!3086* Performs an unaligned vector load and byte swaps it on big endian.3087*/3088XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)3089{3090xxh_u64x2 ret;3091memcpy(&ret, ptr, sizeof(xxh_u64x2));3092# if XXH_VSX_BE3093ret = XXH_vec_revb(ret);3094# endif3095return ret;3096}30973098/*3099* vec_mulo and vec_mule are very problematic intrinsics on PowerPC3100*3101* These intrinsics weren't added until GCC 8, despite existing for a while,3102* and they are endian dependent. Also, their meaning swap depending on version.3103* */3104# if defined(__s390x__)3105/* s390x is always big endian, no issue on this platform */3106# define XXH_vec_mulo vec_mulo3107# define XXH_vec_mule vec_mule3108# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)3109/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */3110# define XXH_vec_mulo __builtin_altivec_vmulouw3111# define XXH_vec_mule __builtin_altivec_vmuleuw3112# else3113/* gcc needs inline assembly */3114/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/3115XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)3116{3117xxh_u64x2 result;3118__asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));3119return result;3120}3121XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)3122{3123xxh_u64x2 result;3124__asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));3125return result;3126}3127# endif /* XXH_vec_mulo, XXH_vec_mule */3128#endif /* XXH_VECTOR == XXH_VSX */312931303131/* prefetch3132* can be disabled, by declaring XXH_NO_PREFETCH build macro */3133#if defined(XXH_NO_PREFETCH)3134# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */3135#else3136# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */3137# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */3138# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)3139# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )3140# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)3141# else3142# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */3143# endif3144#endif /* XXH_NO_PREFETCH */314531463147/* ==========================================3148* XXH3 default settings3149* ========================================== */31503151#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */31523153#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)3154# error "default keyset is not large enough"3155#endif31563157/*! Pseudorandom secret taken directly from FARSH. */3158XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {31590xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,31600xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,31610xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,31620xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,31630x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,31640x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,31650xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,31660x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,31670xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,31680x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,31690x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,31700x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,3171};317231733174#ifdef XXH_OLD_NAMES3175# define kSecret XXH3_kSecret3176#endif31773178#ifdef XXH_DOXYGEN3179/*!3180* @brief Calculates a 32-bit to 64-bit long multiply.3181*3182* Implemented as a macro.3183*3184* Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't3185* need to (but it shouldn't need to anyways, it is about 7 instructions to do3186* a 64x64 multiply...). 
Since we know that this will _always_ emit `MULL`, we3187* use that instead of the normal method.3188*3189* If you are compiling for platforms like Thumb-1 and don't have a better option,3190* you may also want to write your own long multiply routine here.3191*3192* @param x, y Numbers to be multiplied3193* @return 64-bit product of the low 32 bits of @p x and @p y.3194*/3195XXH_FORCE_INLINE xxh_u643196XXH_mult32to64(xxh_u64 x, xxh_u64 y)3197{3198return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);3199}3200#elif defined(_MSC_VER) && defined(_M_IX86)3201# include <intrin.h>3202# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))3203#else3204/*3205* Downcast + upcast is usually better than masking on older compilers like3206* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.3207*3208* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands3209* and perform a full 64x64 multiply -- entirely redundant on 32-bit.3210*/3211# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))3212#endif32133214/*!3215* @brief Calculates a 64->128-bit long multiply.3216*3217* Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar3218* version.3219*3220* @param lhs, rhs The 64-bit integers to be multiplied3221* @return The 128-bit result represented in an @ref XXH128_hash_t.3222*/3223static XXH128_hash_t3224XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)3225{3226/*3227* GCC/Clang __uint128_t method.3228*3229* On most 64-bit targets, GCC and Clang define a __uint128_t type.3230* This is usually the best way as it usually uses a native long 64-bit3231* multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.3232*3233* Usually.3234*3235* Despite being a 32-bit platform, Clang (and emscripten) define this type3236* despite not having the arithmetic for it. This results in a laggy3237* compiler builtin call which calculates a full 128-bit multiply.3238* In that case it is best to use the portable one.3239* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-5155756773240*/3241#if defined(__GNUC__) && !defined(__wasm__) \3242&& defined(__SIZEOF_INT128__) \3243|| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)32443245__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;3246XXH128_hash_t r128;3247r128.low64 = (xxh_u64)(product);3248r128.high64 = (xxh_u64)(product >> 64);3249return r128;32503251/*3252* MSVC for x64's _umul128 method.3253*3254* xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);3255*3256* This compiles to single operand MUL on x64.3257*/3258#elif defined(_M_X64) || defined(_M_IA64)32593260#ifndef _MSC_VER3261# pragma intrinsic(_umul128)3262#endif3263xxh_u64 product_high;3264xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);3265XXH128_hash_t r128;3266r128.low64 = product_low;3267r128.high64 = product_high;3268return r128;32693270#else3271/*3272* Portable scalar method. 
Optimized for 32-bit and 64-bit ALUs.3273*3274* This is a fast and simple grade school multiply, which is shown below3275* with base 10 arithmetic instead of base 0x100000000.3276*3277* 9 3 // D2 lhs = 933278* x 7 5 // D2 rhs = 753279* ----------3280* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 153281* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 453282* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 213283* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 633284* ---------3285* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 273286* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 673287* ---------3288* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 69753289*3290* The reasons for adding the products like this are:3291* 1. It avoids manual carry tracking. Just like how3292* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.3293* This avoids a lot of complexity.3294*3295* 2. It hints for, and on Clang, compiles to, the powerful UMAAL3296* instruction available in ARM's Digital Signal Processing extension3297* in 32-bit ARMv6 and later, which is shown below:3298*3299* void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)3300* {3301* xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;3302* *RdLo = (xxh_u32)(product & 0xFFFFFFFF);3303* *RdHi = (xxh_u32)(product >> 32);3304* }3305*3306* This instruction was designed for efficient long multiplication, and3307* allows this to be calculated in only 4 instructions at speeds3308* comparable to some 64-bit ALUs.3309*3310* 3. It isn't terrible on other platforms. Usually this will be a couple3311* of 32-bit ADD/ADCs.3312*/33133314/* First calculate all of the cross products. */3315xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);3316xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);3317xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);3318xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);33193320/* Now add the products together. These will never overflow. */3321xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;3322xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;3323xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);33243325XXH128_hash_t r128;3326r128.low64 = lower;3327r128.high64 = upper;3328return r128;3329#endif3330}33313332/*!3333* @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.3334*3335* The reason for the separate function is to prevent passing too many structs3336* around by value. This will hopefully inline the multiply, but we don't force it.3337*3338* @param lhs, rhs The 64-bit integers to multiply3339* @return The low 64 bits of the product XOR'd by the high 64 bits.3340* @see XXH_mult64to128()3341*/3342static xxh_u643343XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)3344{3345XXH128_hash_t product = XXH_mult64to128(lhs, rhs);3346return product.low64 ^ product.high64;3347}33483349/*! Seems to produce slightly better code on GCC for some reason. 
*/3350XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)3351{3352XXH_ASSERT(0 <= shift && shift < 64);3353return v64 ^ (v64 >> shift);3354}33553356/*3357* This is a fast avalanche stage,3358* suitable when input bits are already partially mixed3359*/3360static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)3361{3362h64 = XXH_xorshift64(h64, 37);3363h64 *= 0x165667919E3779F9ULL;3364h64 = XXH_xorshift64(h64, 32);3365return h64;3366}33673368/*3369* This is a stronger avalanche,3370* inspired by Pelle Evensen's rrmxmx3371* preferable when input has not been previously mixed3372*/3373static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)3374{3375/* this mix is inspired by Pelle Evensen's rrmxmx */3376h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);3377h64 *= 0x9FB21C651E98DF25ULL;3378h64 ^= (h64 >> 35) + len ;3379h64 *= 0x9FB21C651E98DF25ULL;3380return XXH_xorshift64(h64, 28);3381}338233833384/* ==========================================3385* Short keys3386* ==========================================3387* One of the shortcomings of XXH32 and XXH64 was that their performance was3388* sub-optimal on short lengths. It used an iterative algorithm which strongly3389* favored lengths that were a multiple of 4 or 8.3390*3391* Instead of iterating over individual inputs, we use a set of single shot3392* functions which piece together a range of lengths and operate in constant time.3393*3394* Additionally, the number of multiplies has been significantly reduced. This3395* reduces latency, especially when emulating 64-bit multiplies on 32-bit.3396*3397* Depending on the platform, this may or may not be faster than XXH32, but it3398* is almost guaranteed to be faster than XXH64.3399*/34003401/*3402* At very short lengths, there isn't enough input to fully hide secrets, or use3403* the entire secret.3404*3405* There is also only a limited amount of mixing we can do before significantly3406* impacting performance.3407*3408* Therefore, we use different sections of the secret and always mix two secret3409* samples with an XOR. 
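 * (For instance, XXH3_len_1to3_64b() below derives
 * bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed
 * and returns XXH64_avalanche(combined ^ bitflip).)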
This should have no effect on performance on the3410* seedless or withSeed variants because everything _should_ be constant folded3411* by modern compilers.3412*3413* The XOR mixing hides individual parts of the secret and increases entropy.3414*3415* This adds an extra layer of strength for custom secrets.3416*/3417XXH_FORCE_INLINE XXH64_hash_t3418XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3419{3420XXH_ASSERT(input != NULL);3421XXH_ASSERT(1 <= len && len <= 3);3422XXH_ASSERT(secret != NULL);3423/*3424* len = 1: combined = { input[0], 0x01, input[0], input[0] }3425* len = 2: combined = { input[1], 0x02, input[0], input[1] }3426* len = 3: combined = { input[2], 0x03, input[0], input[1] }3427*/3428{ xxh_u8 const c1 = input[0];3429xxh_u8 const c2 = input[len >> 1];3430xxh_u8 const c3 = input[len - 1];3431xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)3432| ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);3433xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;3434xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;3435return XXH64_avalanche(keyed);3436}3437}34383439XXH_FORCE_INLINE XXH64_hash_t3440XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3441{3442XXH_ASSERT(input != NULL);3443XXH_ASSERT(secret != NULL);3444XXH_ASSERT(4 <= len && len < 8);3445seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;3446{ xxh_u32 const input1 = XXH_readLE32(input);3447xxh_u32 const input2 = XXH_readLE32(input + len - 4);3448xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;3449xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);3450xxh_u64 const keyed = input64 ^ bitflip;3451return XXH3_rrmxmx(keyed, len);3452}3453}34543455XXH_FORCE_INLINE XXH64_hash_t3456XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3457{3458XXH_ASSERT(input != NULL);3459XXH_ASSERT(secret != NULL);3460XXH_ASSERT(8 <= len && len <= 16);3461{ xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;3462xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;3463xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;3464xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;3465xxh_u64 const acc = len3466+ XXH_swap64(input_lo) + input_hi3467+ XXH3_mul128_fold64(input_lo, input_hi);3468return XXH3_avalanche(acc);3469}3470}34713472XXH_FORCE_INLINE XXH64_hash_t3473XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3474{3475XXH_ASSERT(len <= 16);3476{ if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);3477if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);3478if (len) return XXH3_len_1to3_64b(input, len, secret, seed);3479return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));3480}3481}34823483/*3484* DISCLAIMER: There are known *seed-dependent* multicollisions here due to3485* multiplication by zero, affecting hashes of lengths 17 to 240.3486*3487* However, they are very unlikely.3488*3489* Keep this in mind when using the unseeded XXH3_64bits() variant: As with all3490* unseeded non-cryptographic hashes, it does not attempt to defend itself3491* against specially crafted inputs, only random inputs.3492*3493* Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes3494* cancelling out the secret is taken an arbitrary number of times 
(addressed3495* in XXH3_accumulate_512), this collision is very unlikely with random inputs3496* and/or proper seeding:3497*3498* This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a3499* function that is only called up to 16 times per hash with up to 240 bytes of3500* input.3501*3502* This is not too bad for a non-cryptographic hash function, especially with3503* only 64 bit outputs.3504*3505* The 128-bit variant (which trades some speed for strength) is NOT affected3506* by this, although it is always a good idea to use a proper seed if you care3507* about strength.3508*/3509XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,3510const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)3511{3512#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \3513&& defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \3514&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */3515/*3516* UGLY HACK:3517* GCC for x86 tends to autovectorize the 128-bit multiply, resulting in3518* slower code.3519*3520* By forcing seed64 into a register, we disrupt the cost model and3521* cause it to scalarize. See `XXH32_round()`3522*3523* FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,3524* XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on3525* GCC 9.2, despite both emitting scalar code.3526*3527* GCC generates much better scalar code than Clang for the rest of XXH3,3528* which is why finding a more optimal codepath is an interest.3529*/3530__asm__ ("" : "+r" (seed64));3531#endif3532{ xxh_u64 const input_lo = XXH_readLE64(input);3533xxh_u64 const input_hi = XXH_readLE64(input+8);3534return XXH3_mul128_fold64(3535input_lo ^ (XXH_readLE64(secret) + seed64),3536input_hi ^ (XXH_readLE64(secret+8) - seed64)3537);3538}3539}35403541/* For mid range keys, XXH3 uses a Mum-hash variant. 
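 * Each 16-byte chunk is handled by XXH3_mix16B() above: both 64-bit halves are
 * XORed with a (secret +/- seed) sample, multiplied together 64x64->128, and
 * the product is folded to 64 bits before being accumulated and avalanched.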
*/3542XXH_FORCE_INLINE XXH64_hash_t3543XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,3544const xxh_u8* XXH_RESTRICT secret, size_t secretSize,3545XXH64_hash_t seed)3546{3547XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;3548XXH_ASSERT(16 < len && len <= 128);35493550{ xxh_u64 acc = len * XXH_PRIME64_1;3551if (len > 32) {3552if (len > 64) {3553if (len > 96) {3554acc += XXH3_mix16B(input+48, secret+96, seed);3555acc += XXH3_mix16B(input+len-64, secret+112, seed);3556}3557acc += XXH3_mix16B(input+32, secret+64, seed);3558acc += XXH3_mix16B(input+len-48, secret+80, seed);3559}3560acc += XXH3_mix16B(input+16, secret+32, seed);3561acc += XXH3_mix16B(input+len-32, secret+48, seed);3562}3563acc += XXH3_mix16B(input+0, secret+0, seed);3564acc += XXH3_mix16B(input+len-16, secret+16, seed);35653566return XXH3_avalanche(acc);3567}3568}35693570#define XXH3_MIDSIZE_MAX 24035713572XXH_NO_INLINE XXH64_hash_t3573XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,3574const xxh_u8* XXH_RESTRICT secret, size_t secretSize,3575XXH64_hash_t seed)3576{3577XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;3578XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);35793580#define XXH3_MIDSIZE_STARTOFFSET 33581#define XXH3_MIDSIZE_LASTOFFSET 1735823583{ xxh_u64 acc = len * XXH_PRIME64_1;3584int const nbRounds = (int)len / 16;3585int i;3586for (i=0; i<8; i++) {3587acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);3588}3589acc = XXH3_avalanche(acc);3590XXH_ASSERT(nbRounds >= 8);3591#if defined(__clang__) /* Clang */ \3592&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \3593&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */3594/*3595* UGLY HACK:3596* Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.3597* In everywhere else, it uses scalar code.3598*3599* For 64->128-bit multiplies, even if the NEON was 100% optimal, it3600* would still be slower than UMAAL (see XXH_mult64to128).3601*3602* Unfortunately, Clang doesn't handle the long multiplies properly and3603* converts them to the nonexistent "vmulq_u64" intrinsic, which is then3604* scalarized into an ugly mess of VMOV.32 instructions.3605*3606* This mess is difficult to avoid without turning autovectorization3607* off completely, but they are usually relatively minor and/or not3608* worth it to fix.3609*3610* This loop is the easiest to fix, as unlike XXH32, this pragma3611* _actually works_ because it is a loop vectorization instead of an3612* SLP vectorization.3613*/3614#pragma clang loop vectorize(disable)3615#endif3616for (i=8 ; i < nbRounds; i++) {3617acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);3618}3619/* last bytes */3620acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);3621return XXH3_avalanche(acc);3622}3623}362436253626/* ======= Long Keys ======= */36273628#define XXH_STRIPE_LEN 643629#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */3630#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))36313632#ifdef XXH_OLD_NAMES3633# define STRIPE_LEN XXH_STRIPE_LEN3634# define ACC_NB XXH_ACC_NB3635#endif36363637XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)3638{3639if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);3640memcpy(dst, &v64, sizeof(v64));3641}36423643/* Several intrinsic functions below are supposed to accept __int64 as argument,3644* as documented in 
https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .3645* However, several environments do not define __int64 type,3646* requiring a workaround.3647*/3648#if !defined (__VMS) \3649&& (defined (__cplusplus) \3650|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )3651typedef int64_t xxh_i64;3652#else3653/* the following type must have a width of 64-bit */3654typedef long long xxh_i64;3655#endif36563657/*3658* XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.3659*3660* It is a hardened version of UMAC, based off of FARSH's implementation.3661*3662* This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD3663* implementations, and it is ridiculously fast.3664*3665* We harden it by mixing the original input to the accumulators as well as the product.3666*3667* This means that in the (relatively likely) case of a multiply by zero, the3668* original input is preserved.3669*3670* On 128-bit inputs, we swap 64-bit pairs when we add the input to improve3671* cross-pollination, as otherwise the upper and lower halves would be3672* essentially independent.3673*3674* This doesn't matter on 64-bit hashes since they all get merged together in3675* the end, so we skip the extra step.3676*3677* Both XXH3_64bits and XXH3_128bits use this subroutine.3678*/36793680#if (XXH_VECTOR == XXH_AVX512) \3681|| (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)36823683#ifndef XXH_TARGET_AVX5123684# define XXH_TARGET_AVX512 /* disable attribute target */3685#endif36863687XXH_FORCE_INLINE XXH_TARGET_AVX512 void3688XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,3689const void* XXH_RESTRICT input,3690const void* XXH_RESTRICT secret)3691{3692XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;3693XXH_ASSERT((((size_t)acc) & 63) == 0);3694XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));36953696{3697/* data_vec = input[0]; */3698__m512i const data_vec = _mm512_loadu_si512 (input);3699/* key_vec = secret[0]; */3700__m512i const key_vec = _mm512_loadu_si512 (secret);3701/* data_key = data_vec ^ key_vec; */3702__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);3703/* data_key_lo = data_key >> 32; */3704__m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));3705/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3706__m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);3707/* xacc[0] += swap(data_vec); */3708__m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));3709__m512i const sum = _mm512_add_epi64(*xacc, data_swap);3710/* xacc[0] += product; */3711*xacc = _mm512_add_epi64(product, sum);3712}3713}37143715/*3716* XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.3717*3718* Multiplication isn't perfect, as explained by Google in HighwayHash:3719*3720* // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to3721* // varying degrees. 
In descending order of goodness, bytes3722* // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.3723* // As expected, the upper and lower bytes are much worse.3724*3725* Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L2913726*3727* Since our algorithm uses a pseudorandom secret to add some variance into the3728* mix, we don't need to (or want to) mix as often or as much as HighwayHash does.3729*3730* This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid3731* extraction.3732*3733* Both XXH3_64bits and XXH3_128bits use this subroutine.3734*/37353736XXH_FORCE_INLINE XXH_TARGET_AVX512 void3737XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3738{3739XXH_ASSERT((((size_t)acc) & 63) == 0);3740XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));3741{ XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;3742const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);37433744/* xacc[0] ^= (xacc[0] >> 47) */3745__m512i const acc_vec = *xacc;3746__m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);3747__m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);3748/* xacc[0] ^= secret; */3749__m512i const key_vec = _mm512_loadu_si512 (secret);3750__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);37513752/* xacc[0] *= XXH_PRIME32_1; */3753__m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));3754__m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);3755__m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);3756*xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));3757}3758}37593760XXH_FORCE_INLINE XXH_TARGET_AVX512 void3761XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3762{3763XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);3764XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);3765XXH_ASSERT(((size_t)customSecret & 63) == 0);3766(void)(&XXH_writeLE64);3767{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);3768__m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);37693770XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret;3771XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret;3772int i;3773for (i=0; i < nbRounds; ++i) {3774/* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',3775* this will warn "discards ‘const’ qualifier". */3776union {3777XXH_ALIGN(64) const __m512i* cp;3778XXH_ALIGN(64) void* p;3779} remote_const_void;3780remote_const_void.cp = src + i;3781dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);3782} }3783}37843785#endif37863787#if (XXH_VECTOR == XXH_AVX2) \3788|| (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)37893790#ifndef XXH_TARGET_AVX23791# define XXH_TARGET_AVX2 /* disable attribute target */3792#endif37933794XXH_FORCE_INLINE XXH_TARGET_AVX2 void3795XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,3796const void* XXH_RESTRICT input,3797const void* XXH_RESTRICT secret)3798{3799XXH_ASSERT((((size_t)acc) & 31) == 0);3800{ XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;3801/* Unaligned. This is mainly for pointer arithmetic, and because3802* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3803const __m256i* const xinput = (const __m256i *) input;3804/* Unaligned. This is mainly for pointer arithmetic, and because3805* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
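 * Note: _MM_SHUFFLE(0, 3, 0, 1) below only needs to move the high 32 bits of
 * each 64-bit lane into the even dword positions, since _mm256_mul_epu32()
 * reads nothing but those even elements.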
*/3806const __m256i* const xsecret = (const __m256i *) secret;38073808size_t i;3809for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {3810/* data_vec = xinput[i]; */3811__m256i const data_vec = _mm256_loadu_si256 (xinput+i);3812/* key_vec = xsecret[i]; */3813__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);3814/* data_key = data_vec ^ key_vec; */3815__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);3816/* data_key_lo = data_key >> 32; */3817__m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3818/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3819__m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);3820/* xacc[i] += swap(data_vec); */3821__m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));3822__m256i const sum = _mm256_add_epi64(xacc[i], data_swap);3823/* xacc[i] += product; */3824xacc[i] = _mm256_add_epi64(product, sum);3825} }3826}38273828XXH_FORCE_INLINE XXH_TARGET_AVX2 void3829XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3830{3831XXH_ASSERT((((size_t)acc) & 31) == 0);3832{ XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;3833/* Unaligned. This is mainly for pointer arithmetic, and because3834* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3835const __m256i* const xsecret = (const __m256i *) secret;3836const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);38373838size_t i;3839for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {3840/* xacc[i] ^= (xacc[i] >> 47) */3841__m256i const acc_vec = xacc[i];3842__m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);3843__m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);3844/* xacc[i] ^= xsecret; */3845__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);3846__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);38473848/* xacc[i] *= XXH_PRIME32_1; */3849__m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3850__m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);3851__m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);3852xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));3853}3854}3855}38563857XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3858{3859XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);3860XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);3861XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);3862(void)(&XXH_writeLE64);3863XXH_PREFETCH(customSecret);3864{ __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64);38653866XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret;3867XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret;38683869# if defined(__GNUC__) || defined(__clang__)3870/*3871* On GCC & Clang, marking 'dest' as modified will cause the compiler:3872* - do not extract the secret from sse registers in the internal loop3873* - use less common registers, and avoid pushing these reg into stack3874* The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with3875* customSecret, and on aarch64, this prevented LDP from merging two3876* loads together for free. 
Putting the loads together before the stores3877* properly generates LDP.3878*/3879__asm__("" : "+r" (dest));3880# endif38813882/* GCC -O2 need unroll loop manually */3883dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);3884dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);3885dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);3886dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);3887dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);3888dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);3889}3890}38913892#endif38933894/* x86dispatch always generates SSE2 */3895#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)38963897#ifndef XXH_TARGET_SSE23898# define XXH_TARGET_SSE2 /* disable attribute target */3899#endif39003901XXH_FORCE_INLINE XXH_TARGET_SSE2 void3902XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,3903const void* XXH_RESTRICT input,3904const void* XXH_RESTRICT secret)3905{3906/* SSE2 is just a half-scale version of the AVX2 version. */3907XXH_ASSERT((((size_t)acc) & 15) == 0);3908{ XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;3909/* Unaligned. This is mainly for pointer arithmetic, and because3910* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */3911const __m128i* const xinput = (const __m128i *) input;3912/* Unaligned. This is mainly for pointer arithmetic, and because3913* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */3914const __m128i* const xsecret = (const __m128i *) secret;39153916size_t i;3917for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {3918/* data_vec = xinput[i]; */3919__m128i const data_vec = _mm_loadu_si128 (xinput+i);3920/* key_vec = xsecret[i]; */3921__m128i const key_vec = _mm_loadu_si128 (xsecret+i);3922/* data_key = data_vec ^ key_vec; */3923__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);3924/* data_key_lo = data_key >> 32; */3925__m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3926/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3927__m128i const product = _mm_mul_epu32 (data_key, data_key_lo);3928/* xacc[i] += swap(data_vec); */3929__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));3930__m128i const sum = _mm_add_epi64(xacc[i], data_swap);3931/* xacc[i] += product; */3932xacc[i] = _mm_add_epi64(product, sum);3933} }3934}39353936XXH_FORCE_INLINE XXH_TARGET_SSE2 void3937XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3938{3939XXH_ASSERT((((size_t)acc) & 15) == 0);3940{ XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;3941/* Unaligned. This is mainly for pointer arithmetic, and because3942* _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
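 * Per 64-bit lane, the scramble below computes
 * acc = ((acc ^ (acc >> 47)) ^ key) * XXH_PRIME32_1; since SSE2 has no 64-bit
 * multiply, the product is assembled from two 32x32 _mm_mul_epu32 halves
 * (prod_lo + (prod_hi << 32)).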
*/3943const __m128i* const xsecret = (const __m128i *) secret;3944const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);39453946size_t i;3947for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {3948/* xacc[i] ^= (xacc[i] >> 47) */3949__m128i const acc_vec = xacc[i];3950__m128i const shifted = _mm_srli_epi64 (acc_vec, 47);3951__m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);3952/* xacc[i] ^= xsecret[i]; */3953__m128i const key_vec = _mm_loadu_si128 (xsecret+i);3954__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);39553956/* xacc[i] *= XXH_PRIME32_1; */3957__m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3958__m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);3959__m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);3960xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));3961}3962}3963}39643965XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3966{3967XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);3968(void)(&XXH_writeLE64);3969{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);39703971# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 19003972// MSVC 32bit mode does not support _mm_set_epi64x before 20153973XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 };3974__m128i const seed = _mm_load_si128((__m128i const*)seed64x2);3975# else3976__m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);3977# endif3978int i;39793980XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret;3981XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret;3982# if defined(__GNUC__) || defined(__clang__)3983/*3984* On GCC & Clang, marking 'dest' as modified will cause the compiler:3985* - do not extract the secret from sse registers in the internal loop3986* - use less common registers, and avoid pushing these reg into stack3987*/3988__asm__("" : "+r" (dest));3989# endif39903991for (i=0; i < nbRounds; ++i) {3992dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);3993} }3994}39953996#endif39973998#if (XXH_VECTOR == XXH_NEON)39994000XXH_FORCE_INLINE void4001XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,4002const void* XXH_RESTRICT input,4003const void* XXH_RESTRICT secret)4004{4005XXH_ASSERT((((size_t)acc) & 15) == 0);4006{4007XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;4008/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
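 * Per 64-bit lane, this mirrors the scalar path: acc += swap(data), then
 * acc += (xxh_u64)lo32(data ^ key) * hi32(data ^ key), with vmlal_u32
 * performing the widening multiply-accumulate in a single instruction.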
*/4009uint8_t const* const xinput = (const uint8_t *) input;4010uint8_t const* const xsecret = (const uint8_t *) secret;40114012size_t i;4013for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {4014/* data_vec = xinput[i]; */4015uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));4016/* key_vec = xsecret[i]; */4017uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));4018uint64x2_t data_key;4019uint32x2_t data_key_lo, data_key_hi;4020/* xacc[i] += swap(data_vec); */4021uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);4022uint64x2_t const swapped = vextq_u64(data64, data64, 1);4023xacc[i] = vaddq_u64 (xacc[i], swapped);4024/* data_key = data_vec ^ key_vec; */4025data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));4026/* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);4027* data_key_hi = (uint32x2_t) (data_key >> 32);4028* data_key = UNDEFINED; */4029XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);4030/* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */4031xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);40324033}4034}4035}40364037XXH_FORCE_INLINE void4038XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4039{4040XXH_ASSERT((((size_t)acc) & 15) == 0);40414042{ uint64x2_t* xacc = (uint64x2_t*) acc;4043uint8_t const* xsecret = (uint8_t const*) secret;4044uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);40454046size_t i;4047for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {4048/* xacc[i] ^= (xacc[i] >> 47); */4049uint64x2_t acc_vec = xacc[i];4050uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);4051uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);40524053/* xacc[i] ^= xsecret[i]; */4054uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));4055uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));40564057/* xacc[i] *= XXH_PRIME32_1 */4058uint32x2_t data_key_lo, data_key_hi;4059/* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);4060* data_key_hi = (uint32x2_t) (xacc[i] >> 32);4061* xacc[i] = UNDEFINED; */4062XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);4063{ /*4064* prod_hi = (data_key >> 32) * XXH_PRIME32_1;4065*4066* Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will4067* incorrectly "optimize" this:4068* tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));4069* shifted = vshll_n_u32(tmp, 32);4070* to this:4071* tmp = "vmulq_u64"(a, b); // no such thing!4072* shifted = vshlq_n_u64(tmp, 32);4073*4074* However, unlike SSE, Clang lacks a 64-bit multiply routine4075* for NEON, and it scalarizes two 64-bit multiplies instead.4076*4077* vmull_u32 has the same timing as vmul_u32, and it avoids4078* this bug completely.4079* See https://bugs.llvm.org/show_bug.cgi?id=399674080*/4081uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);4082/* xacc[i] = prod_hi << 32; */4083xacc[i] = vshlq_n_u64(prod_hi, 32);4084/* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */4085xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);4086}4087} }4088}40894090#endif40914092#if (XXH_VECTOR == XXH_VSX)40934094XXH_FORCE_INLINE void4095XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,4096const void* XXH_RESTRICT input,4097const void* XXH_RESTRICT secret)4098{4099xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */4100xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */4101xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */4102xxh_u64x2 const v32 = { 32, 32 };4103size_t i;4104for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {4105/* 
data_vec = xinput[i]; */4106xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);4107/* key_vec = xsecret[i]; */4108xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);4109xxh_u64x2 const data_key = data_vec ^ key_vec;4110/* shuffled = (data_key << 32) | (data_key >> 32); */4111xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);4112/* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */4113xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);4114xacc[i] += product;41154116/* swap high and low halves */4117#ifdef __s390x__4118xacc[i] += vec_permi(data_vec, data_vec, 2);4119#else4120xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);4121#endif4122}4123}41244125XXH_FORCE_INLINE void4126XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4127{4128XXH_ASSERT((((size_t)acc) & 15) == 0);41294130{ xxh_u64x2* const xacc = (xxh_u64x2*) acc;4131const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;4132/* constants */4133xxh_u64x2 const v32 = { 32, 32 };4134xxh_u64x2 const v47 = { 47, 47 };4135xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };4136size_t i;4137for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {4138/* xacc[i] ^= (xacc[i] >> 47); */4139xxh_u64x2 const acc_vec = xacc[i];4140xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);41414142/* xacc[i] ^= xsecret[i]; */4143xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);4144xxh_u64x2 const data_key = data_vec ^ key_vec;41454146/* xacc[i] *= XXH_PRIME32_1 */4147/* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */4148xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);4149/* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */4150xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);4151xacc[i] = prod_odd + (prod_even << v32);4152} }4153}41544155#endif41564157/* scalar variants - universal */41584159XXH_FORCE_INLINE void4160XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,4161const void* XXH_RESTRICT input,4162const void* XXH_RESTRICT secret)4163{4164XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */4165const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */4166const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */4167size_t i;4168XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);4169for (i=0; i < XXH_ACC_NB; i++) {4170xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);4171xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);4172xacc[i ^ 1] += data_val; /* swap adjacent lanes */4173xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);4174}4175}41764177XXH_FORCE_INLINE void4178XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4179{4180XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */4181const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */4182size_t i;4183XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);4184for (i=0; i < XXH_ACC_NB; i++) {4185xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);4186xxh_u64 acc64 = xacc[i];4187acc64 = XXH_xorshift64(acc64, 47);4188acc64 ^= key64;4189acc64 *= XXH_PRIME32_1;4190xacc[i] = acc64;4191}4192}41934194XXH_FORCE_INLINE void4195XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)4196{4197/*4198* We need a separate pointer for the hack below,4199* which requires a 
non-const pointer.4200* Any decent compiler will optimize this out otherwise.4201*/4202const xxh_u8* kSecretPtr = XXH3_kSecret;4203XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);42044205#if defined(__clang__) && defined(__aarch64__)4206/*4207* UGLY HACK:4208* Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are4209* placed sequentially, in order, at the top of the unrolled loop.4210*4211* While MOVK is great for generating constants (2 cycles for a 64-bit4212* constant compared to 4 cycles for LDR), long MOVK chains stall the4213* integer pipelines:4214* I L S4215* MOVK4216* MOVK4217* MOVK4218* MOVK4219* ADD4220* SUB STR4221* STR4222* By forcing loads from memory (as the asm line causes Clang to assume4223* that XXH3_kSecretPtr has been changed), the pipelines are used more4224* efficiently:4225* I L S4226* LDR4227* ADD LDR4228* SUB STR4229* STR4230* XXH3_64bits_withSeed, len == 256, Snapdragon 8354231* without hack: 2654.4 MB/s4232* with hack: 3202.9 MB/s4233*/4234__asm__("" : "+r" (kSecretPtr));4235#endif4236/*4237* Note: in debug mode, this overrides the asm optimization4238* and Clang will emit MOVK chains again.4239*/4240XXH_ASSERT(kSecretPtr == XXH3_kSecret);42414242{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;4243int i;4244for (i=0; i < nbRounds; i++) {4245/*4246* The asm hack causes Clang to assume that kSecretPtr aliases with4247* customSecret, and on aarch64, this prevented LDP from merging two4248* loads together for free. Putting the loads together before the stores4249* properly generates LDP.4250*/4251xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;4252xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;4253XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo);4254XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);4255} }4256}425742584259typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);4260typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);4261typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);426242634264#if (XXH_VECTOR == XXH_AVX512)42654266#define XXH3_accumulate_512 XXH3_accumulate_512_avx5124267#define XXH3_scrambleAcc XXH3_scrambleAcc_avx5124268#define XXH3_initCustomSecret XXH3_initCustomSecret_avx51242694270#elif (XXH_VECTOR == XXH_AVX2)42714272#define XXH3_accumulate_512 XXH3_accumulate_512_avx24273#define XXH3_scrambleAcc XXH3_scrambleAcc_avx24274#define XXH3_initCustomSecret XXH3_initCustomSecret_avx242754276#elif (XXH_VECTOR == XXH_SSE2)42774278#define XXH3_accumulate_512 XXH3_accumulate_512_sse24279#define XXH3_scrambleAcc XXH3_scrambleAcc_sse24280#define XXH3_initCustomSecret XXH3_initCustomSecret_sse242814282#elif (XXH_VECTOR == XXH_NEON)42834284#define XXH3_accumulate_512 XXH3_accumulate_512_neon4285#define XXH3_scrambleAcc XXH3_scrambleAcc_neon4286#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar42874288#elif (XXH_VECTOR == XXH_VSX)42894290#define XXH3_accumulate_512 XXH3_accumulate_512_vsx4291#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx4292#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar42934294#else /* scalar */42954296#define XXH3_accumulate_512 XXH3_accumulate_512_scalar4297#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar4298#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar42994300#endif4301430243034304#ifndef XXH_PREFETCH_DIST4305# ifdef __clang__4306# define XXH_PREFETCH_DIST 3204307# else4308# if (XXH_VECTOR == XXH_AVX512)4309# define XXH_PREFETCH_DIST 5124310# else4311# define XXH_PREFETCH_DIST 3844312# endif4313# 
endif /* __clang__ */4314#endif /* XXH_PREFETCH_DIST */43154316/*4317* XXH3_accumulate()4318* Loops over XXH3_accumulate_512().4319* Assumption: nbStripes will not overflow the secret size4320*/4321XXH_FORCE_INLINE void4322XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,4323const xxh_u8* XXH_RESTRICT input,4324const xxh_u8* XXH_RESTRICT secret,4325size_t nbStripes,4326XXH3_f_accumulate_512 f_acc512)4327{4328size_t n;4329for (n = 0; n < nbStripes; n++ ) {4330const xxh_u8* const in = input + n*XXH_STRIPE_LEN;4331XXH_PREFETCH(in + XXH_PREFETCH_DIST);4332f_acc512(acc,4333in,4334secret + n*XXH_SECRET_CONSUME_RATE);4335}4336}43374338XXH_FORCE_INLINE void4339XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,4340const xxh_u8* XXH_RESTRICT input, size_t len,4341const xxh_u8* XXH_RESTRICT secret, size_t secretSize,4342XXH3_f_accumulate_512 f_acc512,4343XXH3_f_scrambleAcc f_scramble)4344{4345size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;4346size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;4347size_t const nb_blocks = (len - 1) / block_len;43484349size_t n;43504351XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);43524353for (n = 0; n < nb_blocks; n++) {4354XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);4355f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);4356}43574358/* last partial block */4359XXH_ASSERT(len > XXH_STRIPE_LEN);4360{ size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;4361XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));4362XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);43634364/* last stripe */4365{ const xxh_u8* const p = input + len - XXH_STRIPE_LEN;4366#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */4367f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);4368} }4369}43704371XXH_FORCE_INLINE xxh_u644372XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)4373{4374return XXH3_mul128_fold64(4375acc[0] ^ XXH_readLE64(secret),4376acc[1] ^ XXH_readLE64(secret+8) );4377}43784379static XXH64_hash_t4380XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)4381{4382xxh_u64 result64 = start;4383size_t i = 0;43844385for (i = 0; i < 4; i++) {4386result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);4387#if defined(__clang__) /* Clang */ \4388&& (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \4389&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \4390&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */4391/*4392* UGLY HACK:4393* Prevent autovectorization on Clang ARMv7-a. Exact same problem as4394* the one in XXH3_len_129to240_64b. 
Speeds up shorter keys > 240b.4395* XXH3_64bits, len == 256, Snapdragon 835:4396* without hack: 2063.7 MB/s4397* with hack: 2560.7 MB/s4398*/4399__asm__("" : "+r" (result64));4400#endif4401}44024403return XXH3_avalanche(result64);4404}44054406#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \4407XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }44084409XXH_FORCE_INLINE XXH64_hash_t4410XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,4411const void* XXH_RESTRICT secret, size_t secretSize,4412XXH3_f_accumulate_512 f_acc512,4413XXH3_f_scrambleAcc f_scramble)4414{4415XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;44164417XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);44184419/* converge into final hash */4420XXH_STATIC_ASSERT(sizeof(acc) == 64);4421/* do not align on 8, so that the secret is different from the accumulator */4422#define XXH_SECRET_MERGEACCS_START 114423XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);4424return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);4425}44264427/*4428* It's important for performance that XXH3_hashLong is not inlined.4429*/4430XXH_NO_INLINE XXH64_hash_t4431XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,4432XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)4433{4434(void)seed64;4435return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);4436}44374438/*4439* It's important for performance that XXH3_hashLong is not inlined.4440* Since the function is not inlined, the compiler may not be able to understand that,4441* in some scenarios, its `secret` argument is actually a compile time constant.4442* This variant enforces that the compiler can detect that,4443* and uses this opportunity to streamline the generated code for better performance.4444*/4445XXH_NO_INLINE XXH64_hash_t4446XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,4447XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)4448{4449(void)seed64; (void)secret; (void)secretLen;4450return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);4451}44524453/*4454* XXH3_hashLong_64b_withSeed():4455* Generate a custom key based on alteration of default XXH3_kSecret with the seed,4456* and then use this key for long mode hashing.4457*4458* This operation is decently fast but nonetheless costs a little bit of time.4459* Try to avoid it whenever possible (typically when seed==0).4460*4461* It's important for performance that XXH3_hashLong is not inlined. 
Not sure4462* why (uop cache maybe?), but the difference is large and easily measurable.4463*/4464XXH_FORCE_INLINE XXH64_hash_t4465XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,4466XXH64_hash_t seed,4467XXH3_f_accumulate_512 f_acc512,4468XXH3_f_scrambleAcc f_scramble,4469XXH3_f_initCustomSecret f_initSec)4470{4471if (seed == 0)4472return XXH3_hashLong_64b_internal(input, len,4473XXH3_kSecret, sizeof(XXH3_kSecret),4474f_acc512, f_scramble);4475{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];4476f_initSec(secret, seed);4477return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),4478f_acc512, f_scramble);4479}4480}44814482/*4483* It's important for performance that XXH3_hashLong is not inlined.4484*/4485XXH_NO_INLINE XXH64_hash_t4486XXH3_hashLong_64b_withSeed(const void* input, size_t len,4487XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)4488{4489(void)secret; (void)secretLen;4490return XXH3_hashLong_64b_withSeed_internal(input, len, seed,4491XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);4492}449344944495typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,4496XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);44974498XXH_FORCE_INLINE XXH64_hash_t4499XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,4500XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,4501XXH3_hashLong64_f f_hashLong)4502{4503XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);4504/*4505* If an action is to be taken if `secretLen` condition is not respected,4506* it should be done here.4507* For now, it's a contract pre-condition.4508* Adding a check and a branch here would cost performance at every hash.4509* Also, note that function signature doesn't offer room to return an error.4510*/4511if (len <= 16)4512return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);4513if (len <= 128)4514return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);4515if (len <= XXH3_MIDSIZE_MAX)4516return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);4517return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);4518}451945204521/* === Public entry point === */45224523/*! @ingroup xxh3_family */4524XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)4525{4526return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);4527}45284529/*! @ingroup xxh3_family */4530XXH_PUBLIC_API XXH64_hash_t4531XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)4532{4533return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);4534}45354536/*! @ingroup xxh3_family */4537XXH_PUBLIC_API XXH64_hash_t4538XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)4539{4540return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);4541}454245434544/* === XXH3 streaming === */45454546/*4547* Malloc's a pointer that is always aligned to align.4548*4549* This must be freed with `XXH_alignedFree()`.4550*4551* malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte4552* alignment on 32-bit. 
This isn't enough for the 32 byte aligned loads in AVX24553* or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.4554*4555* This underalignment previously caused a rather obvious crash which went4556* completely unnoticed due to XXH3_createState() not actually being tested.4557* Credit to RedSpah for noticing this bug.4558*4559* The alignment is done manually: Functions like posix_memalign or _mm_malloc4560* are avoided: To maintain portability, we would have to write a fallback4561* like this anyways, and besides, testing for the existence of library4562* functions without relying on external build tools is impossible.4563*4564* The method is simple: Overallocate, manually align, and store the offset4565* to the original behind the returned pointer.4566*4567* Align must be a power of 2 and 8 <= align <= 128.4568*/4569static void* XXH_alignedMalloc(size_t s, size_t align)4570{4571XXH_ASSERT(align <= 128 && align >= 8); /* range check */4572XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */4573XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */4574{ /* Overallocate to make room for manual realignment and an offset byte */4575xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);4576if (base != NULL) {4577/*4578* Get the offset needed to align this pointer.4579*4580* Even if the returned pointer is aligned, there will always be4581* at least one byte to store the offset to the original pointer.4582*/4583size_t offset = align - ((size_t)base & (align - 1)); /* base % align */4584/* Add the offset for the now-aligned pointer */4585xxh_u8* ptr = base + offset;45864587XXH_ASSERT((size_t)ptr % align == 0);45884589/* Store the offset immediately before the returned pointer. */4590ptr[-1] = (xxh_u8)offset;4591return ptr;4592}4593return NULL;4594}4595}4596/*4597* Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass4598* normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.4599*/4600static void XXH_alignedFree(void* p)4601{4602if (p != NULL) {4603xxh_u8* ptr = (xxh_u8*)p;4604/* Get the offset byte we added in XXH_malloc. */4605xxh_u8 offset = ptr[-1];4606/* Free the original malloc'd pointer */4607xxh_u8* base = ptr - offset;4608XXH_free(base);4609}4610}4611/*! @ingroup xxh3_family */4612XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)4613{4614XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);4615if (state==NULL) return NULL;4616XXH3_INITSTATE(state);4617return state;4618}46194620/*! @ingroup xxh3_family */4621XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)4622{4623XXH_alignedFree(statePtr);4624return XXH_OK;4625}46264627/*! 
@ingroup xxh3_family */4628XXH_PUBLIC_API void4629XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)4630{4631memcpy(dst_state, src_state, sizeof(*dst_state));4632}46334634static void4635XXH3_reset_internal(XXH3_state_t* statePtr,4636XXH64_hash_t seed,4637const void* secret, size_t secretSize)4638{4639size_t const initStart = offsetof(XXH3_state_t, bufferedSize);4640size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;4641XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);4642XXH_ASSERT(statePtr != NULL);4643/* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */4644memset((char*)statePtr + initStart, 0, initLength);4645statePtr->acc[0] = XXH_PRIME32_3;4646statePtr->acc[1] = XXH_PRIME64_1;4647statePtr->acc[2] = XXH_PRIME64_2;4648statePtr->acc[3] = XXH_PRIME64_3;4649statePtr->acc[4] = XXH_PRIME64_4;4650statePtr->acc[5] = XXH_PRIME32_2;4651statePtr->acc[6] = XXH_PRIME64_5;4652statePtr->acc[7] = XXH_PRIME32_1;4653statePtr->seed = seed;4654statePtr->extSecret = (const unsigned char*)secret;4655XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);4656statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;4657statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;4658}46594660/*! @ingroup xxh3_family */4661XXH_PUBLIC_API XXH_errorcode4662XXH3_64bits_reset(XXH3_state_t* statePtr)4663{4664if (statePtr == NULL) return XXH_ERROR;4665XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);4666return XXH_OK;4667}46684669/*! @ingroup xxh3_family */4670XXH_PUBLIC_API XXH_errorcode4671XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)4672{4673if (statePtr == NULL) return XXH_ERROR;4674XXH3_reset_internal(statePtr, 0, secret, secretSize);4675if (secret == NULL) return XXH_ERROR;4676if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;4677return XXH_OK;4678}46794680/*! 
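 * Typical streaming usage, sketched here for illustration only (error checks
 * omitted; `seed`, `chunk` and `chunkSize` stand for the caller's own values):
 *
 *   XXH3_state_t* const state = XXH3_createState();
 *   XXH3_64bits_reset_withSeed(state, seed);
 *   XXH3_64bits_update(state, chunk, chunkSize);   (repeat for each chunk)
 *   XXH64_hash_t const hash = XXH3_64bits_digest(state);
 *   XXH3_freeState(state);
 *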
@ingroup xxh3_family */4681XXH_PUBLIC_API XXH_errorcode4682XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)4683{4684if (statePtr == NULL) return XXH_ERROR;4685if (seed==0) return XXH3_64bits_reset(statePtr);4686if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);4687XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);4688return XXH_OK;4689}46904691/* Note : when XXH3_consumeStripes() is invoked,4692* there must be a guarantee that at least one more byte must be consumed from input4693* so that the function can blindly consume all stripes using the "normal" secret segment */4694XXH_FORCE_INLINE void4695XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,4696size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,4697const xxh_u8* XXH_RESTRICT input, size_t nbStripes,4698const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,4699XXH3_f_accumulate_512 f_acc512,4700XXH3_f_scrambleAcc f_scramble)4701{4702XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */4703XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);4704if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {4705/* need a scrambling operation */4706size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;4707size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;4708XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);4709f_scramble(acc, secret + secretLimit);4710XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);4711*nbStripesSoFarPtr = nbStripesAfterBlock;4712} else {4713XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);4714*nbStripesSoFarPtr += nbStripes;4715}4716}47174718/*4719* Both XXH3_64bits_update and XXH3_128bits_update use this routine.4720*/4721XXH_FORCE_INLINE XXH_errorcode4722XXH3_update(XXH3_state_t* state,4723const xxh_u8* input, size_t len,4724XXH3_f_accumulate_512 f_acc512,4725XXH3_f_scrambleAcc f_scramble)4726{4727if (input==NULL)4728#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)4729return XXH_OK;4730#else4731return XXH_ERROR;4732#endif47334734{ const xxh_u8* const bEnd = input + len;4735const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret;47364737state->totalLen += len;4738XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);47394740if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */4741XXH_memcpy(state->buffer + state->bufferedSize, input, len);4742state->bufferedSize += (XXH32_hash_t)len;4743return XXH_OK;4744}4745/* total input is now > XXH3_INTERNALBUFFER_SIZE */47464747#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)4748XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */47494750/*4751* Internal buffer is partially filled (always, except at beginning)4752* Complete it, then consume it.4753*/4754if (state->bufferedSize) {4755size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;4756XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);4757input += loadSize;4758XXH3_consumeStripes(state->acc,4759&state->nbStripesSoFar, state->nbStripesPerBlock,4760state->buffer, XXH3_INTERNALBUFFER_STRIPES,4761secret, state->secretLimit,4762f_acc512, f_scramble);4763state->bufferedSize = 0;4764}4765XXH_ASSERT(input < bEnd);47664767/* Consume input by a multiple of internal buffer size */4768if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {4769const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;4770do {4771XXH3_consumeStripes(state->acc,4772&state->nbStripesSoFar, state->nbStripesPerBlock,4773input, XXH3_INTERNALBUFFER_STRIPES,4774secret, state->secretLimit,4775f_acc512, f_scramble);4776input += XXH3_INTERNALBUFFER_SIZE;4777} while (input<limit);4778/* for last partial stripe */4779memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);4780}4781XXH_ASSERT(input < bEnd);47824783/* Some remaining input (always) : buffer it */4784XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));4785state->bufferedSize = (XXH32_hash_t)(bEnd-input);4786}47874788return XXH_OK;4789}47904791/*! @ingroup xxh3_family */4792XXH_PUBLIC_API XXH_errorcode4793XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)4794{4795return XXH3_update(state, (const xxh_u8*)input, len,4796XXH3_accumulate_512, XXH3_scrambleAcc);4797}479847994800XXH_FORCE_INLINE void4801XXH3_digest_long (XXH64_hash_t* acc,4802const XXH3_state_t* state,4803const unsigned char* secret)4804{4805/*4806* Digest on a local copy. 
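 * (state->acc is copied into the caller-provided acc[], and only that local
 * copy absorbs the buffered tail below.)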
This way, the state remains unaltered, and it can4807* continue ingesting more input afterwards.4808*/4809memcpy(acc, state->acc, sizeof(state->acc));4810if (state->bufferedSize >= XXH_STRIPE_LEN) {4811size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;4812size_t nbStripesSoFar = state->nbStripesSoFar;4813XXH3_consumeStripes(acc,4814&nbStripesSoFar, state->nbStripesPerBlock,4815state->buffer, nbStripes,4816secret, state->secretLimit,4817XXH3_accumulate_512, XXH3_scrambleAcc);4818/* last stripe */4819XXH3_accumulate_512(acc,4820state->buffer + state->bufferedSize - XXH_STRIPE_LEN,4821secret + state->secretLimit - XXH_SECRET_LASTACC_START);4822} else { /* bufferedSize < XXH_STRIPE_LEN */4823xxh_u8 lastStripe[XXH_STRIPE_LEN];4824size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;4825XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */4826memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);4827memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);4828XXH3_accumulate_512(acc,4829lastStripe,4830secret + state->secretLimit - XXH_SECRET_LASTACC_START);4831}4832}48334834/*! @ingroup xxh3_family */4835XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)4836{4837const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;4838if (state->totalLen > XXH3_MIDSIZE_MAX) {4839XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];4840XXH3_digest_long(acc, state, secret);4841return XXH3_mergeAccs(acc,4842secret + XXH_SECRET_MERGEACCS_START,4843(xxh_u64)state->totalLen * XXH_PRIME64_1);4844}4845/* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */4846if (state->seed)4847return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);4848return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),4849secret, state->secretLimit + XXH_STRIPE_LEN);4850}485148524853#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))48544855/*! 
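 * Illustrative usage sketch (`mySeedData`, `mySeedSize`, `data` and `len` are
 * placeholders for the caller's own values; error handling omitted):
 *
 *   unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
 *   XXH3_generateSecret(secret, mySeedData, mySeedSize);
 *   XXH64_hash_t const h = XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 *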

#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))

/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
{
    XXH_ASSERT(secretBuffer != NULL);
    if (customSeedSize == 0) {
        memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
        return;
    }
    XXH_ASSERT(customSeed != NULL);

    {   size_t const segmentSize = sizeof(XXH128_hash_t);
        size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
        XXH128_canonical_t scrambler;
        XXH64_hash_t seeds[12];
        size_t segnb;
        XXH_ASSERT(nbSegments == 12);
        XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE);   /* exact multiple */
        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));

        /*
         * Copy customSeed to seeds[], truncating or repeating as necessary.
         */
        {   size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
            size_t filled = toFill;
            memcpy(seeds, customSeed, toFill);
            while (filled < sizeof(seeds)) {
                toFill = XXH_MIN(filled, sizeof(seeds) - filled);
                memcpy((char*)seeds + filled, seeds, toFill);
                filled += toFill;
        }   }

        /* generate secret */
        memcpy(secretBuffer, &scrambler, sizeof(scrambler));
        for (segnb=1; segnb < nbSegments; segnb++) {
            size_t const segmentStart = segnb * segmentSize;
            XXH128_canonical_t segment;
            XXH128_canonicalFromHash(&segment,
                XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
            memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
    }   }
}
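/*
 * A minimal usage sketch (caller-side code, illustrative only):
 * XXH3_generateSecret() derives a full-size, well-distributed secret from seed
 * material of any length and quality, suitable for the `_withSecret()`
 * variants. `data` and `dataSize` are caller-provided.
 *
 *     unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
 *     const char seedMaterial[] = "application-specific key material";
 *     XXH3_generateSecret(secret, seedMaterial, sizeof(seedMaterial)-1);
 *     {   XXH64_hash_t const h = XXH3_64bits_withSecret(data, dataSize,
 *                                                       secret, sizeof(secret));
 *         (void)h;
 *     }
 *
 * The same secret also works with XXH3_128bits_withSecret() and the streaming
 * `_reset_withSecret()` entry points.
 */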

/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
 * even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (see XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that longer hashes are about as fast as the 64-bit version,
 * since the long-input path is only a slight modification of the 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* A doubled version of 1to3_64b with different constants. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    /*
     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
     */
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
                                | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
        XXH128_hash_t h128;
        h128.low64  = XXH64_avalanche(keyed_lo);
        h128.high64 = XXH64_avalanche(keyed_hi);
        return h128;
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
        xxh_u64 const keyed = input_64 ^ bitflip;

        /* Shift len to the left to ensure it is even; this avoids even multiplies. */
        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));

        m128.high64 += (m128.low64 << 1);
        m128.low64  ^= (m128.high64 >> 3);

        m128.low64   = XXH_xorshift64(m128.low64, 35);
        m128.low64  *= 0x9FB21C651E98DF25ULL;
        m128.low64   = XXH_xorshift64(m128.low64, 28);
        m128.high64  = XXH3_avalanche(m128.high64);
        return m128;
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
        xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
        /*
         * Put len in the middle of m128 to ensure that the length gets mixed to
         * both the low and high bits in the 128x64 multiply below.
         */
        m128.low64 += (xxh_u64)(len - 1) << 54;
        input_hi   ^= bitfliph;
        /*
         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
         * the high 64 bits of m128.
         *
         * The best approach to this operation is different on 32-bit and 64-bit.
         */
        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
            /*
             * 32-bit optimized version, which is more readable.
             *
             * On 32-bit, it removes an ADC and delays a dependency between the two
             * halves of m128.high64, but it generates an extra mask on 64-bit.
             */
            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
        } else {
            /*
             * 64-bit optimized (albeit more confusing) version.
             *
             * Uses some properties of addition and multiplication to remove the mask:
             *
             *   Let:
             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
             *    c = XXH_PRIME32_2
             *
             *    b + (a * c)
             *   Inverse Property: x + y - x == y
             *    b + (a * (1 + c - 1))
             *   Distributive Property: x * (y + z) == (x * y) + (x * z)
             *    b + (a * 1) + (a * (c - 1))
             *   Identity Property: x * 1 == x
             *    b + a + (a * (c - 1))
             *
             *   Substitute a, b, and c:
             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             *
             *   Since input_hi.hi + input_hi.lo == input_hi, we get this:
             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             */
            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
        }
        /* m128 ^= XXH_swap64(m128 >> 64); */
        m128.low64  ^= XXH_swap64(m128.high64);

        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
            h128.high64 += m128.high64 * XXH_PRIME64_2;

            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}
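/*
 * Illustration of the mask-removal identity above, with arbitrary values
 * (c = 7 standing in for XXH_PRIME32_2, input_hi = 0x0000000500000003,
 *  so lo = 3 and hi = 0x0000000500000000):
 *
 *     hi       + (lo * c)       = 0x0000000500000000 + 21 = 0x0000000500000015
 *     input_hi + (lo * (c - 1)) = 0x0000000500000003 + 18 = 0x0000000500000015
 *
 * Both forms agree for any 64-bit input_hi, because input_hi == hi + lo.
 */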

/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t h128;
            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
            h128.low64  = XXH64_avalanche(seed ^ bitflipl);
            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
            return h128;
    }   }
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
              const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_NO_INLINE XXH128_hash_t
XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc,
                                input  + (32 * i),
                                input  + (32 * i) + 16,
                                secret + (32 * i),
                                seed);
        }
        acc.low64 = XXH3_avalanche(acc.low64);
        acc.high64 = XXH3_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc,
                                input + (32 * i),
                                input + (32 * i) + 16,
                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
                                seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc,
                            input + len - 16,
                            input + len - 32,
                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                            0ULL - seed);

        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate_512 f_acc512,
                            XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   XXH128_hash_t h128;
        h128.low64  = XXH3_mergeAccs(acc,
                                     secret + XXH_SECRET_MERGEACCS_START,
                                     (xxh_u64)len * XXH_PRIME64_1);
        h128.high64 = XXH3_mergeAccs(acc,
                                     secret + secretSize
                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                     ~((xxh_u64)len * XXH_PRIME64_2));
        return h128;
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64,
                           const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64,
                              const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                     XXH64_hash_t seed64,
                                     XXH3_f_accumulate_512 f_acc512,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
{
    if (seed64 == 0)
        return XXH3_hashLong_128b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
                                           f_acc512, f_scramble);
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed64);
        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
                                           f_acc512, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void* input, size_t len,
                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void* input, size_t len,
                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     */
    if (len <= 16)
        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hl128(input, len, seed64, secret, secretLen);
}

/* === Public XXH128 API === */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
{
    return XXH3_128bits_internal(input, len, 0,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_default);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    return XXH3_128bits_internal(input, len, 0,
                                 (const xxh_u8*)secret, secretSize,
                                 XXH3_hashLong_128b_withSecret);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_internal(input, len, seed,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_withSeed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(input, len, seed);
}
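/*
 * A minimal usage sketch (caller-side code, illustrative only): the one-shot
 * API above hashes a complete buffer in a single call. `msg` is a placeholder.
 *
 *     const char msg[] = "example payload";
 *     XXH128_hash_t const h = XXH128(msg, sizeof(msg)-1, 0);
 *     // h.low64 and h.high64 together form the 128-bit result
 *
 * With seed == 0, XXH128() / XXH3_128bits_withSeed() produce the same result
 * as the unseeded XXH3_128bits().
 */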

/* === XXH3 128-bit streaming === */

/*
 * All the functions are actually the same as for the 64-bit streaming variant.
 * The only difference is the finalization routine.
 */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH3_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_reset_internal(statePtr, 0, secret, secretSize);
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    return XXH_OK;
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    if (seed==0) return XXH3_128bits_reset(statePtr);
    if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   XXH128_hash_t h128;
            h128.low64  = XXH3_mergeAccs(acc,
                                         secret + XXH_SECRET_MERGEACCS_START,
                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
            h128.high64 = XXH3_mergeAccs(acc,
                                         secret + state->secretLimit + XXH_STRIPE_LEN
                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                   secret, state->secretLimit + XXH_STRIPE_LEN);
}

/* 128-bit utility functions */

#include <string.h>   /* memcmp, memcpy */

/* return : 1 if equal, 0 if different */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2  */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : bets that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}
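/*
 * A minimal usage sketch (caller-side code, illustrative only), assuming an
 * array `hashes` of `count` XXH128_hash_t values produced by the caller:
 *
 *     #include <stdlib.h>   // qsort
 *     qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
 *     // hashes[] is now ordered by (high64, low64); duplicates end up adjacent
 *     // and can be confirmed with XXH128_isEqual() on neighboring elements.
 */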

/*====== Canonical representation ======*/
/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    memcpy(dst, &hash.high64, sizeof(hash.high64));
    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}

/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2                      /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__)   /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__)   /* respect -O0 and -Os */
#  pragma GCC pop_options
#endif

#endif  /* XXH_NO_LONG_LONG */

/*!
 * @}
 */
#endif  /* XXH_IMPLEMENTATION */


#if defined (__cplusplus)
}
#endif