/*1* xxHash - Fast Hash algorithm2* Copyright (c) Meta Platforms, Inc. and affiliates.3*4* You can contact the author at :5* - xxHash homepage: https://cyan4973.github.io/xxHash/6* - xxHash source repository : https://github.com/Cyan4973/xxHash7*8* This source code is licensed under both the BSD-style license (found in the9* LICENSE file in the root directory of this source tree) and the GPLv2 (found10* in the COPYING file in the root directory of this source tree).11* You may select, at your option, one of the above-listed licenses.12*/131415#ifndef XXH_NO_XXH316# define XXH_NO_XXH317#endif1819#ifndef XXH_NAMESPACE20# define XXH_NAMESPACE ZSTD_21#endif2223/*!24* @mainpage xxHash25*26* @file xxhash.h27* xxHash prototypes and implementation28*/29/* TODO: update */30/* Notice extracted from xxHash homepage:3132xxHash is an extremely fast hash algorithm, running at RAM speed limits.33It also successfully passes all tests from the SMHasher suite.3435Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)3637Name Speed Q.Score Author38xxHash 5.4 GB/s 1039CrapWow 3.2 GB/s 2 Andrew40MurmurHash 3a 2.7 GB/s 10 Austin Appleby41SpookyHash 2.0 GB/s 10 Bob Jenkins42SBox 1.4 GB/s 9 Bret Mulvey43Lookup3 1.2 GB/s 9 Bob Jenkins44SuperFastHash 1.2 GB/s 1 Paul Hsieh45CityHash64 1.05 GB/s 10 Pike & Alakuijala46FNV 0.55 GB/s 5 Fowler, Noll, Vo47CRC32 0.43 GB/s 948MD5-32 0.33 GB/s 10 Ronald L. Rivest49SHA1-32 0.28 GB/s 105051Q.Score is a measure of quality of the hash function.52It depends on successfully passing SMHasher test set.5310 is a perfect score.5455Note: SMHasher's CRC32 implementation is not the fastest one.56Other speed-oriented implementations can be faster,57especially in combination with PCLMUL instruction:58https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c34900923404611707355960A 64-bit version, named XXH64, is available since r35.61It offers much better speed, but for 64-bit applications only.62Name Speed on 64 bits Speed on 32 bits63XXH64 13.8 GB/s 1.9 GB/s64XXH32 6.8 GB/s 6.0 GB/s65*/6667#if defined (__cplusplus)68extern "C" {69#endif7071/* ****************************72* INLINE mode73******************************/74/*!75* XXH_INLINE_ALL (and XXH_PRIVATE_API)76* Use these build macros to inline xxhash into the target unit.77* Inlining improves performance on small inputs, especially when the length is78* expressed as a compile-time constant:79*80* https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html81*82* It also keeps xxHash symbols private to the unit, so they are not exported.83*84* Usage:85* #define XXH_INLINE_ALL86* #include "xxhash.h"87*88* Do not compile and link xxhash.o as a separate object, as it is not useful.89*/90#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \91&& !defined(XXH_INLINE_ALL_31684351384)92/* this section should be traversed only once */93# define XXH_INLINE_ALL_3168435138494/* give access to the advanced API, required to compile implementations */95# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */96# define XXH_STATIC_LINKING_ONLY97/* make all functions private */98# undef XXH_PUBLIC_API99# if defined(__GNUC__)100# define XXH_PUBLIC_API static __inline __attribute__((unused))101# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)102# define XXH_PUBLIC_API static inline103# elif defined(_MSC_VER)104# define XXH_PUBLIC_API static __inline105# else106/* note: this version may generate 
warnings for unused static functions */107# define XXH_PUBLIC_API static108# endif109110/*111* This part deals with the special case where a unit wants to inline xxHash,112* but "xxhash.h" has previously been included without XXH_INLINE_ALL,113* such as part of some previously included *.h header file.114* Without further action, the new include would just be ignored,115* and functions would effectively _not_ be inlined (silent failure).116* The following macros solve this situation by prefixing all inlined names,117* avoiding naming collision with previous inclusions.118*/119/* Before that, we unconditionally #undef all symbols,120* in case they were already defined with XXH_NAMESPACE.121* They will then be redefined for XXH_INLINE_ALL122*/123# undef XXH_versionNumber124/* XXH32 */125# undef XXH32126# undef XXH32_createState127# undef XXH32_freeState128# undef XXH32_reset129# undef XXH32_update130# undef XXH32_digest131# undef XXH32_copyState132# undef XXH32_canonicalFromHash133# undef XXH32_hashFromCanonical134/* XXH64 */135# undef XXH64136# undef XXH64_createState137# undef XXH64_freeState138# undef XXH64_reset139# undef XXH64_update140# undef XXH64_digest141# undef XXH64_copyState142# undef XXH64_canonicalFromHash143# undef XXH64_hashFromCanonical144/* XXH3_64bits */145# undef XXH3_64bits146# undef XXH3_64bits_withSecret147# undef XXH3_64bits_withSeed148# undef XXH3_64bits_withSecretandSeed149# undef XXH3_createState150# undef XXH3_freeState151# undef XXH3_copyState152# undef XXH3_64bits_reset153# undef XXH3_64bits_reset_withSeed154# undef XXH3_64bits_reset_withSecret155# undef XXH3_64bits_update156# undef XXH3_64bits_digest157# undef XXH3_generateSecret158/* XXH3_128bits */159# undef XXH128160# undef XXH3_128bits161# undef XXH3_128bits_withSeed162# undef XXH3_128bits_withSecret163# undef XXH3_128bits_reset164# undef XXH3_128bits_reset_withSeed165# undef XXH3_128bits_reset_withSecret166# undef XXH3_128bits_reset_withSecretandSeed167# undef XXH3_128bits_update168# undef XXH3_128bits_digest169# undef XXH128_isEqual170# undef XXH128_cmp171# undef XXH128_canonicalFromHash172# undef XXH128_hashFromCanonical173/* Finally, free the namespace itself */174# undef XXH_NAMESPACE175176/* employ the namespace for XXH_INLINE_ALL */177# define XXH_NAMESPACE XXH_INLINE_178/*179* Some identifiers (enums, type names) are not symbols,180* but they must nonetheless be renamed to avoid redeclaration.181* Alternative solution: do not redeclare them.182* However, this requires some #ifdefs, and has a more dispersed impact.183* Meanwhile, renaming can be achieved in a single place.184*/185# define XXH_IPREF(Id) XXH_NAMESPACE ## Id186# define XXH_OK XXH_IPREF(XXH_OK)187# define XXH_ERROR XXH_IPREF(XXH_ERROR)188# define XXH_errorcode XXH_IPREF(XXH_errorcode)189# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)190# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)191# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)192# define XXH32_state_s XXH_IPREF(XXH32_state_s)193# define XXH32_state_t XXH_IPREF(XXH32_state_t)194# define XXH64_state_s XXH_IPREF(XXH64_state_s)195# define XXH64_state_t XXH_IPREF(XXH64_state_t)196# define XXH3_state_s XXH_IPREF(XXH3_state_s)197# define XXH3_state_t XXH_IPREF(XXH3_state_t)198# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)199/* Ensure the header is parsed again, even if it was previously included */200# undef XXHASH_H_5627135585666179201# undef XXHASH_H_STATIC_13879238742202#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */203204205206/* 
****************************************************************207* Stable API208*****************************************************************/209#ifndef XXHASH_H_5627135585666179210#define XXHASH_H_5627135585666179 1211212213/*!214* @defgroup public Public API215* Contains details on the public xxHash functions.216* @{217*/218/* specific declaration modes for Windows */219#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)220# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))221# ifdef XXH_EXPORT222# define XXH_PUBLIC_API __declspec(dllexport)223# elif XXH_IMPORT224# define XXH_PUBLIC_API __declspec(dllimport)225# endif226# else227# define XXH_PUBLIC_API /* do nothing */228# endif229#endif230231#ifdef XXH_DOXYGEN232/*!233* @brief Emulate a namespace by transparently prefixing all symbols.234*235* If you want to include _and expose_ xxHash functions from within your own236* library, but also want to avoid symbol collisions with other libraries which237* may also include xxHash, you can use XXH_NAMESPACE to automatically prefix238* any public symbol from xxhash library with the value of XXH_NAMESPACE239* (therefore, avoid empty or numeric values).240*241* Note that no change is required within the calling program as long as it242* includes `xxhash.h`: Regular symbol names will be automatically translated243* by this header.244*/245# define XXH_NAMESPACE /* YOUR NAME HERE */246# undef XXH_NAMESPACE247#endif248249#ifdef XXH_NAMESPACE250# define XXH_CAT(A,B) A##B251# define XXH_NAME2(A,B) XXH_CAT(A,B)252# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)253/* XXH32 */254# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)255# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)256# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)257# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)258# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)259# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)260# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)261# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)262# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)263/* XXH64 */264# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)265# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)266# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)267# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)268# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)269# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)270# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)271# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)272# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)273/* XXH3_64bits */274# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)275# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)276# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)277# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)278# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)279# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)280# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)281# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)282# define 
XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)283# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)284# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)285# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)286# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)287# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)288# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)289/* XXH3_128bits */290# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)291# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)292# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)293# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)294# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)295# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)296# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)297# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)298# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)299# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)300# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)301# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)302# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)303# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)304# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)305#endif306307308/* *************************************309* Version310***************************************/311#define XXH_VERSION_MAJOR 0312#define XXH_VERSION_MINOR 8313#define XXH_VERSION_RELEASE 1314#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)315316/*!317* @brief Obtains the xxHash version.318*319* This is mostly useful when xxHash is compiled as a shared library,320* since the returned value comes from the library, as opposed to header file.321*322* @return `XXH_VERSION_NUMBER` of the invoked library.323*/324XXH_PUBLIC_API unsigned XXH_versionNumber (void);325326327/* ****************************328* Common basic types329******************************/330#include <stddef.h> /* size_t */331typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;332333334/*-**********************************************************************335* 32-bit hash336************************************************************************/337#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */338/*!339* @brief An unsigned 32-bit integer.340*341* Not necessarily defined to `uint32_t` but functionally equivalent.342*/343typedef uint32_t XXH32_hash_t;344345#elif !defined (__VMS) \346&& (defined (__cplusplus) \347|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )348# include <stdint.h>349typedef uint32_t XXH32_hash_t;350351#else352# include <limits.h>353# if UINT_MAX == 0xFFFFFFFFUL354typedef unsigned int XXH32_hash_t;355# else356# if ULONG_MAX == 0xFFFFFFFFUL357typedef unsigned long XXH32_hash_t;358# else359# error "unsupported platform: need a 32-bit type"360# endif361# endif362#endif363364/*!365* @}366*367* @defgroup 
xxh32_family XXH32 family368* @ingroup public369* Contains functions used in the classic 32-bit xxHash algorithm.370*371* @note372* XXH32 is useful for older platforms, with no or poor 64-bit performance.373* Note that @ref xxh3_family provides competitive speed374* for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.375*376* @see @ref xxh64_family, @ref xxh3_family : Other xxHash families377* @see @ref xxh32_impl for implementation details378* @{379*/380381/*!382* @brief Calculates the 32-bit hash of @p input using xxHash32.383*384* Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s385*386* @param input The block of data to be hashed, at least @p length bytes in size.387* @param length The length of @p input, in bytes.388* @param seed The 32-bit seed to alter the hash's output predictably.389*390* @pre391* The memory between @p input and @p input + @p length must be valid,392* readable, contiguous memory. However, if @p length is `0`, @p input may be393* `NULL`. In C++, this also must be *TriviallyCopyable*.394*395* @return The calculated 32-bit hash value.396*397* @see398* XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():399* Direct equivalents for the other variants of xxHash.400* @see401* XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.402*/403XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);404405/*!406* Streaming functions generate the xxHash value from an incremental input.407* This method is slower than single-call functions, due to state management.408* For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.409*410* An XXH state must first be allocated using `XXH*_createState()`.411*412* Start a new hash by initializing the state with a seed using `XXH*_reset()`.413*414* Then, feed the hash state by calling `XXH*_update()` as many times as necessary.415*416* The function returns an error code, with 0 meaning OK, and any other value417* meaning there is an error.418*419* Finally, a hash value can be produced anytime, by using `XXH*_digest()`.420* This function returns the nn-bits hash as an int or long long.421*422* It's still possible to continue inserting input into the hash state after a423* digest, and generate new hash values later on by invoking `XXH*_digest()`.424*425* When done, release the state using `XXH*_freeState()`.426*427* Example code for incrementally hashing a file:428* @code{.c}429* #include <stdio.h>430* #include <xxhash.h>431* #define BUFFER_SIZE 256432*433* // Note: XXH64 and XXH3 use the same interface.434* XXH32_hash_t435* hashFile(FILE* stream)436* {437* XXH32_state_t* state;438* unsigned char buf[BUFFER_SIZE];439* size_t amt;440* XXH32_hash_t hash;441*442* state = XXH32_createState(); // Create a state443* assert(state != NULL); // Error check here444* XXH32_reset(state, 0xbaad5eed); // Reset state with our seed445* while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {446* XXH32_update(state, buf, amt); // Hash the file in chunks447* }448* hash = XXH32_digest(state); // Finalize the hash449* XXH32_freeState(state); // Clean up450* return hash;451* }452* @endcode453*/454455/*!456* @typedef struct XXH32_state_s XXH32_state_t457* @brief The opaque state struct for the XXH32 streaming API.458*459* @see XXH32_state_s for details.460*/461typedef struct XXH32_state_s XXH32_state_t;462463/*!464* @brief Allocates an @ref XXH32_state_t.465*466* Must be freed with XXH32_freeState().467* @return An allocated 
XXH32_state_t on success, `NULL` on failure.468*/469XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);470/*!471* @brief Frees an @ref XXH32_state_t.472*473* Must be allocated with XXH32_createState().474* @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().475* @return XXH_OK.476*/477XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);478/*!479* @brief Copies one @ref XXH32_state_t to another.480*481* @param dst_state The state to copy to.482* @param src_state The state to copy from.483* @pre484* @p dst_state and @p src_state must not be `NULL` and must not overlap.485*/486XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);487488/*!489* @brief Resets an @ref XXH32_state_t to begin a new hash.490*491* This function resets and seeds a state. Call it before @ref XXH32_update().492*493* @param statePtr The state struct to reset.494* @param seed The 32-bit seed to alter the hash result predictably.495*496* @pre497* @p statePtr must not be `NULL`.498*499* @return @ref XXH_OK on success, @ref XXH_ERROR on failure.500*/501XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);502503/*!504* @brief Consumes a block of @p input to an @ref XXH32_state_t.505*506* Call this to incrementally consume blocks of data.507*508* @param statePtr The state struct to update.509* @param input The block of data to be hashed, at least @p length bytes in size.510* @param length The length of @p input, in bytes.511*512* @pre513* @p statePtr must not be `NULL`.514* @pre515* The memory between @p input and @p input + @p length must be valid,516* readable, contiguous memory. However, if @p length is `0`, @p input may be517* `NULL`. In C++, this also must be *TriviallyCopyable*.518*519* @return @ref XXH_OK on success, @ref XXH_ERROR on failure.520*/521XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);522523/*!524* @brief Returns the calculated hash value from an @ref XXH32_state_t.525*526* @note527* Calling XXH32_digest() will not affect @p statePtr, so you can update,528* digest, and update again.529*530* @param statePtr The state struct to calculate the hash from.531*532* @pre533* @p statePtr must not be `NULL`.534*535* @return The calculated xxHash32 value from that state.536*/537XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);538539/******* Canonical representation *******/540541/*542* The default return values from XXH functions are unsigned 32 and 64 bit543* integers.544* This the simplest and fastest format for further post-processing.545*546* However, this leaves open the question of what is the order on the byte level,547* since little and big endian conventions will store the same number differently.548*549* The canonical representation settles this issue by mandating big-endian550* convention, the same convention as human-readable numbers (large digits first).551*552* When writing hash values to storage, sending them over a network, or printing553* them, it's highly recommended to use the canonical representation to ensure554* portability across a wider range of systems, present and future.555*556* The following functions allow transformation of hash values to and from557* canonical format.558*/559560/*!561* @brief Canonical (big endian) representation of @ref XXH32_hash_t.562*/563typedef struct {564unsigned char digest[4]; /*!< Hash bytes, big endian */565} XXH32_canonical_t;566567/*!568* @brief 
Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.569*570* @param dst The @ref XXH32_canonical_t pointer to be stored to.571* @param hash The @ref XXH32_hash_t to be converted.572*573* @pre574* @p dst must not be `NULL`.575*/576XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);577578/*!579* @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.580*581* @param src The @ref XXH32_canonical_t to convert.582*583* @pre584* @p src must not be `NULL`.585*586* @return The converted hash.587*/588XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);589590591#ifdef __has_attribute592# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)593#else594# define XXH_HAS_ATTRIBUTE(x) 0595#endif596597/* C-language Attributes are added in C23. */598#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)599# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)600#else601# define XXH_HAS_C_ATTRIBUTE(x) 0602#endif603604#if defined(__cplusplus) && defined(__has_cpp_attribute)605# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)606#else607# define XXH_HAS_CPP_ATTRIBUTE(x) 0608#endif609610/*611Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute612introduced in CPP17 and C23.613CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough614C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough615*/616#if XXH_HAS_C_ATTRIBUTE(x)617# define XXH_FALLTHROUGH [[fallthrough]]618#elif XXH_HAS_CPP_ATTRIBUTE(x)619# define XXH_FALLTHROUGH [[fallthrough]]620#elif XXH_HAS_ATTRIBUTE(__fallthrough__)621# define XXH_FALLTHROUGH __attribute__ ((fallthrough))622#else623# define XXH_FALLTHROUGH624#endif625626/*!627* @}628* @ingroup public629* @{630*/631632#ifndef XXH_NO_LONG_LONG633/*-**********************************************************************634* 64-bit hash635************************************************************************/636#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */637/*!638* @brief An unsigned 64-bit integer.639*640* Not necessarily defined to `uint64_t` but functionally equivalent.641*/642typedef uint64_t XXH64_hash_t;643#elif !defined (__VMS) \644&& (defined (__cplusplus) \645|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )646# include <stdint.h>647typedef uint64_t XXH64_hash_t;648#else649# include <limits.h>650# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL651/* LP64 ABI says uint64_t is unsigned long */652typedef unsigned long XXH64_hash_t;653# else654/* the following type must have a width of 64-bit */655typedef unsigned long long XXH64_hash_t;656# endif657#endif658659/*!660* @}661*662* @defgroup xxh64_family XXH64 family663* @ingroup public664* @{665* Contains functions used in the classic 64-bit xxHash algorithm.666*667* @note668* XXH3 provides competitive speed for both 32-bit and 64-bit systems,669* and offers true 64/128 bit hash results.670* It provides better speed for systems with vector processing capabilities.671*/672673674/*!675* @brief Calculates the 64-bit hash of @p input using xxHash64.676*677* This function usually runs faster on 64-bit systems, but slower on 32-bit678* systems (see benchmark).679*680* @param input The block of data to be hashed, at least @p length bytes in size.681* @param length The length of @p input, in bytes.682* @param seed The 64-bit seed to alter the hash's output predictably.683*684* @pre685* The memory between @p 
input and @p input + @p length must be valid,686* readable, contiguous memory. However, if @p length is `0`, @p input may be687* `NULL`. In C++, this also must be *TriviallyCopyable*.688*689* @return The calculated 64-bit hash.690*691* @see692* XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():693* Direct equivalents for the other variants of xxHash.694* @see695* XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.696*/697XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);698699/******* Streaming *******/700/*!701* @brief The opaque state struct for the XXH64 streaming API.702*703* @see XXH64_state_s for details.704*/705typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */706XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);707XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);708XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);709710XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);711XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);712XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);713714/******* Canonical representation *******/715typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;716XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);717XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);718719#ifndef XXH_NO_XXH3720/*!721* @}722* ************************************************************************723* @defgroup xxh3_family XXH3 family724* @ingroup public725* @{726*727* XXH3 is a more recent hash algorithm featuring:728* - Improved speed for both small and large inputs729* - True 64-bit and 128-bit outputs730* - SIMD acceleration731* - Improved 32-bit viability732*733* Speed analysis methodology is explained here:734*735* https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html736*737* Compared to XXH64, expect XXH3 to run approximately738* ~2x faster on large inputs and >3x faster on small ones,739* exact differences vary depending on platform.740*741* XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,742* but does not require it.743* Any 32-bit and 64-bit targets that can run XXH32 smoothly744* can run XXH3 at competitive speeds, even without vector support.745* Further details are explained in the implementation.746*747* Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,748* ZVector and scalar targets. 
This can be controlled via the XXH_VECTOR macro.
 *
 * XXH3 implementation is portable:
 * it has a generic C90 formulation that can be compiled on any platform,
 * and all implementations generate exactly the same hash value on all platforms.
 * Starting from v0.8.0, it's also labelled "stable", meaning that
 * any future version will also generate the same hash value.
 *
 * XXH3 offers 2 variants, _64bits and _128bits.
 *
 * When only 64 bits are needed, prefer invoking the _64bits variant, as it
 * reduces the amount of mixing, resulting in faster speed on small inputs.
 * It's also generally simpler to manipulate a scalar return type than a struct.
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 */

/*-**********************************************************************
*  XXH3 64-bit variant
************************************************************************/

/* XXH3_64bits():
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);

/*
 * XXH3_64bits_withSeed():
 * This variant generates a custom secret on the fly
 * based on default secret altered using the `seed` value.
 * While this operation is decently fast, note that it's not completely free.
 * Note: seed==0 produces the same results as XXH3_64bits().
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);

/*!
 * The bare minimum size for a custom secret.
 *
 * @see
 *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
 *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
 */
#define XXH3_SECRET_SIZE_MIN 136

/*
 * XXH3_64bits_withSecret():
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
 * However, the quality of the secret impacts the dispersion of the hash algorithm.
 * Therefore, the secret _must_ look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
 * Whenever in doubt about the "randomness" of the blob of bytes,
 * consider employing "XXH3_generateSecret()" instead (see below).
 * It will generate a proper high entropy secret derived from the blob of bytes.
 * Another advantage of using XXH3_generateSecret() is that
 * it guarantees that all bits within the initial blob of bytes
 * will impact every bit of the output.
 * This is not necessarily the case when using the blob of bytes directly
 * because, when hashing _small_ inputs, only a portion of the secret is employed.
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
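/*
 * A minimal usage sketch of the one-shot 64-bit variants declared above,
 * following the convention of the streaming example earlier in this header.
 * The function name, message and seed value below are illustrative only.
 * @code{.c}
 *    #include <string.h>
 *    #include <xxhash.h>
 *
 *    static XXH64_hash_t hash_message(const char* msg)
 *    {
 *        size_t const len = strlen(msg);
 *        // Default variant: no seed, fastest path.
 *        XXH64_hash_t const h1 = XXH3_64bits(msg, len);
 *        // Seeded variant: a different seed produces a different hash stream.
 *        XXH64_hash_t const h2 = XXH3_64bits_withSeed(msg, len, 2654435761U);
 *        (void)h2;
 *        return h1;
 *    }
 * @endcode
 */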
/******* Streaming *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 */

/*!
 * @brief The state struct for the XXH3 streaming API.
 *
 * @see XXH3_state_s for details.
 */
typedef struct XXH3_state_s XXH3_state_t;
XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);

/*
 * XXH3_64bits_reset():
 * Initialize with default parameters.
 * digest will be equivalent to `XXH3_64bits()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
/*
 * XXH3_64bits_reset_withSeed():
 * Generate a custom secret from `seed`, and store it into `statePtr`.
 * digest will be equivalent to `XXH3_64bits_withSeed()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
/*
 * XXH3_64bits_reset_withSecret():
 * `secret` is referenced, and it _must outlive_ the hash streaming session.
 * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
 * and the quality of produced hash values depends on secret's entropy
 * (secret's content should look like a bunch of random bytes).
 * When in doubt about the randomness of a candidate `secret`,
 * consider employing `XXH3_generateSecret()` instead (see below).
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);

/* note : canonical representation of XXH3 is the same as XXH64
 * since they both produce XXH64_hash_t values */


/*-**********************************************************************
*  XXH3 128-bit variant
************************************************************************/

/*!
 * @brief The return value from 128-bit hashes.
 *
 * Stored in little endian order, although the fields themselves are in native
 * endianness.
 */
typedef struct {
    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
    XXH64_hash_t high64;  /*!< `value >> 64` */
} XXH128_hash_t;

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

/******* Streaming *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 *
 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
 * Use already declared XXH3_createState() and XXH3_freeState().
 *
 * All reset and streaming functions have the same meaning as their 64-bit counterparts.
 */

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);

/* The following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */

/*!
 * XXH128_isEqual():
 * Return: 1 if `h1` 
and `h2` are equal, 0 if they are not.906*/907XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);908909/*!910* XXH128_cmp():911*912* This comparator is compatible with stdlib's `qsort()`/`bsearch()`.913*914* return: >0 if *h128_1 > *h128_2915* =0 if *h128_1 == *h128_2916* <0 if *h128_1 < *h128_2917*/918XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);919920921/******* Canonical representation *******/922typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;923XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);924XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);925926927#endif /* !XXH_NO_XXH3 */928#endif /* XXH_NO_LONG_LONG */929930/*!931* @}932*/933#endif /* XXHASH_H_5627135585666179 */934935936937#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)938#define XXHASH_H_STATIC_13879238742939/* ****************************************************************************940* This section contains declarations which are not guaranteed to remain stable.941* They may change in future versions, becoming incompatible with a different942* version of the library.943* These declarations should only be used with static linking.944* Never use them in association with dynamic linking!945***************************************************************************** */946947/*948* These definitions are only present to allow static allocation949* of XXH states, on stack or in a struct, for example.950* Never **ever** access their members directly.951*/952953/*!954* @internal955* @brief Structure for XXH32 streaming API.956*957* @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,958* @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is959* an opaque type. This allows fields to safely be changed.960*961* Typedef'd to @ref XXH32_state_t.962* Do not access the members of this struct directly.963* @see XXH64_state_s, XXH3_state_s964*/965struct XXH32_state_s {966XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */967XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */968XXH32_hash_t v[4]; /*!< Accumulator lanes */969XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */970XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */971XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */972}; /* typedef'd to XXH32_state_t */973974975#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */976977/*!978* @internal979* @brief Structure for XXH64 streaming API.980*981* @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,982* @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is983* an opaque type. This allows fields to safely be changed.984*985* Typedef'd to @ref XXH64_state_t.986* Do not access the members of this struct directly.987* @see XXH32_state_s, XXH3_state_s988*/989struct XXH64_state_s {990XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */991XXH64_hash_t v[4]; /*!< Accumulator lanes */992XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */993XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */994XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/995XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. 
*/
};   /* typedef'd to XXH64_state_t */


#ifndef XXH_NO_XXH3

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
#  include <stdalign.h>
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
/* In C++ alignas() is a keyword */
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__GNUC__)
#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#  define XXH_ALIGN(n)      __declspec(align(n))
#else
#  define XXH_ALIGN(n)   /* disabled */
#endif

/* Old GCC versions only accept the attribute after the type in structures. */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
    && defined(__GNUC__)
#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif

/*!
 * @brief The size of the internal XXH3 buffer.
 *
 * This is the optimal update size for incremental hashing.
 *
 * @see XXH3_64b_update(), XXH3_128b_update().
 */
#define XXH3_INTERNALBUFFER_SIZE 256

/*!
 * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
 *
 * This is the size used in @ref XXH3_kSecret and the seeded functions.
 *
 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
 */
#define XXH3_SECRET_DEFAULT_SIZE 192

/*!
 * @internal
 * @brief Structure for XXH3 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
 * Otherwise it is an opaque type.
 * Never use this definition in combination with a dynamic library.
 * This allows fields to safely be changed in the future.
 *
 * @note ** This structure has a strict alignment requirement of 64 bytes!! **
 * Do not allocate this with `malloc()` or `new`,
 * it will not be sufficiently aligned.
 * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
 *
 * Typedef'd to @ref XXH3_state_t.
 * Never access the members of this struct directly.
 *
 * @see XXH3_INITSTATE() for stack initialization.
 * @see XXH3_createState(), XXH3_freeState().
 * @see XXH32_state_s, XXH64_state_s
 */
struct XXH3_state_s {
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
       /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
       /*!< The internal buffer. @see XXH32_state_s::mem32 */
   XXH32_hash_t bufferedSize;
       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
   XXH32_hash_t useSeed;
       /*!< Reserved field. Needed for padding on 64-bit. */
   size_t nbStripesSoFar;
       /*!< Number of stripes processed. */
   XXH64_hash_t totalLen;
       /*!< Total length hashed. 64-bit even on 32-bit targets. */
   size_t nbStripesPerBlock;
       /*!< Number of stripes per block. */
   size_t secretLimit;
       /*!< Size of @ref customSecret or @ref extSecret */
   XXH64_hash_t seed;
       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
   XXH64_hash_t reserved64;
       /*!< Reserved field. */
   const unsigned char* extSecret;
       /*!< Reference to an external secret for the _withSecret variants, NULL
        *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
};   /* typedef'd to XXH3_state_t */

#undef XXH_ALIGN_MEMBER
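/*
 * Why this definition is exposed at all: with static linking, a caller can
 * embed the state directly in its own storage instead of calling
 * XXH3_createState(). A minimal sketch, assuming static linking and a
 * hypothetical caller-side wrapper type; note the struct is 64-byte aligned,
 * so it must not come from a plain malloc().
 * @code{.c}
 *    #define XXH_STATIC_LINKING_ONLY   // expose the XXH3_state_s definition
 *    #include <xxhash.h>
 *
 *    typedef struct {
 *        XXH3_state_t xxhState;           // embedded state, no heap allocation
 *        unsigned long long nbBytesHashed;
 *    } HashJob;                           // hypothetical caller-side type
 * @endcode
 */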
/*!
 * @brief Initializes a stack-allocated `XXH3_state_s`.
 *
 * When the @ref XXH3_state_t structure is merely emplaced on stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
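/*
 * A minimal sketch of the stack-allocation path described above: the state
 * lives on the stack, is initialized with XXH3_INITSTATE(), then reset with a
 * seed before streaming. The function name and single-update flow are
 * illustrative only.
 * @code{.c}
 *    #include <xxhash.h>   // with XXH_STATIC_LINKING_ONLY defined beforehand
 *
 *    static XXH64_hash_t hashWithStackState(const void* data, size_t size, XXH64_hash_t seed)
 *    {
 *        XXH3_state_t state;                      // stack allocation: no malloc, no free
 *        XXH3_INITSTATE(&state);                  // required before a _withSeed reset
 *        XXH3_64bits_reset_withSeed(&state, seed);
 *        XXH3_64bits_update(&state, data, size);
 *        return XXH3_64bits_digest(&state);
 *    }
 * @endcode
 */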
/* XXH128() :
 * simple alias to pre-selected XXH3_128bits variant
 */
XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);


/* ===  Experimental API  === */
/* Symbols defined below must be considered tied to a specific library version. */

/*
 * XXH3_generateSecret():
 *
 * Derive a high-entropy secret from any user-defined content, named customSeed.
 * The generated secret can be used in combination with `*_withSecret()` functions.
 * The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
 * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
 *
 * The function accepts as input a custom seed of any length and any content,
 * and derives from it a high-entropy secret of length @secretSize
 * into an already allocated buffer @secretBuffer.
 * @secretSize must be >= XXH3_SECRET_SIZE_MIN
 *
 * The generated secret can then be used with any `*_withSecret()` variant.
 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
 * are part of this list. They all accept a `secret` parameter
 * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
 * _and_ feature very high entropy (consist of random-looking bytes).
 * These conditions can be a high bar to meet, so
 * XXH3_generateSecret() can be employed to ensure proper quality.
 *
 * customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
 * The resulting `secret` will nonetheless provide all required qualities.
 *
 * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);


/*
 * XXH3_generateSecret_fromSeed():
 *
 * Generate the same secret as the _withSeed() variants.
 *
 * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
 * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
 *
 * The generated secret can be used in combination with
 * `*_withSecret()` and `_withSecretandSeed()` variants.
 * This generator is notably useful in combination with `_withSecretandSeed()`,
 * as a way to emulate a faster `_withSeed()` variant.
 */
XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);

/*
 * *_withSecretandSeed() :
 * These variants generate hash values using either
 * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
 * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
 *
 * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
 * `_withSeed()` has to generate the secret on the fly for "large" keys.
 * It's fast, but the cost can be perceptible for "not so large" keys (< 1 KB).
 * `_withSecret()` has to generate the masks on the fly for "small" keys,
 * which requires more instructions than _withSeed() variants.
 * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
 *
 * When @secret has been generated by XXH3_generateSecret_fromSeed(),
 * this variant produces *exactly* the same results as the `_withSeed()` variant,
 * hence offering only a pure speed benefit on "large" input,
 * by skipping the need to regenerate the secret for every large input.
 *
 * Another usage scenario is to hash the secret to a 64-bit hash value,
 * for example with XXH3_64bits(), which then becomes the seed,
 * and then employ both the seed and the secret in _withSecretandSeed().
 * On top of speed, an added benefit is that each bit in the secret
 * has a 50% chance to swap each bit in the output,
 * via its impact on the seed.
 * This is not guaranteed when using the secret directly in "small data" scenarios,
 * because only portions of the secret are employed for small data.
 */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecretandSeed(const void* data, size_t len,
                              const void* secret, size_t secretSize,
                              XXH64_hash_t seed);

XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(const void* data, size_t len,
                               const void* secret, size_t secretSize,
                               XXH64_hash_t seed64);

XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
                                    const void* secret, size_t secretSize,
                                    XXH64_hash_t seed64);

XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
                                     const void* secret, size_t secretSize,
                                     XXH64_hash_t seed64);
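/*
 * A small sketch of the pattern described above: derive the secret once from
 * a seed, then reuse it for many inputs via _withSecretandSeed(). The function
 * name, buffer names and xor accumulation are illustrative only.
 * @code{.c}
 *    #include <xxhash.h>
 *
 *    static XXH64_hash_t hashMany(const void* const* blobs, const size_t* sizes,
 *                                 size_t count, XXH64_hash_t seed)
 *    {
 *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *        XXH64_hash_t acc = 0;
 *        size_t i;
 *        XXH3_generateSecret_fromSeed(secret, seed);   // pay secret generation once
 *        for (i = 0; i < count; i++) {
 *            // same result as XXH3_64bits_withSeed(), minus per-call secret generation
 *            acc ^= XXH3_64bits_withSecretandSeed(blobs[i], sizes[i],
 *                                                 secret, sizeof(secret), seed);
 *        }
 *        return acc;
 *    }
 * @endcode
 */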

#endif  /* XXH_NO_XXH3 */
#endif  /* XXH_NO_LONG_LONG */
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#  define XXH_IMPLEMENTATION
#endif

#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */


/* ======================================================================== */
/* ======================================================================== */
/* ======================================================================== */


/*-**********************************************************************
*  xxHash implementation
*-**********************************************************************
* xxHash's implementation used to be hosted inside xxhash.c.
*
* However, inlining requires the implementation to be visible to the compiler,
* hence it must be included alongside the header.
* Previously, the implementation was hosted inside xxhash.c,
* which was then #included when inlining was activated.
* This construction created issues with a few build and install systems,
* as it required xxhash.c to be stored in the /include directory.
*
* xxHash implementation is now directly integrated within xxhash.h.
* As a consequence, xxhash.c is no longer needed in /include.
*
* xxhash.c is still available and is still useful.
* In a "normal" setup, when xxhash is not inlined,
* xxhash.h only exposes the prototypes and public symbols,
* while xxhash.c can be built into an object file xxhash.o
* which can then be linked into the final binary.
************************************************************************/

#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
#  define XXH_IMPLEM_13a8737387

/* *************************************
*  Tuning parameters
***************************************/

/*!
 * @defgroup tuning Tuning parameters
 * @{
 *
 * Various macros to control xxHash's behavior.
 */
#ifdef XXH_DOXYGEN
/*!
 * @brief Define this to disable 64-bit code.
 *
 * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
 */
#  define XXH_NO_LONG_LONG
#  undef XXH_NO_LONG_LONG /* don't actually */
/*!
 * @brief Controls how unaligned memory is accessed.
 *
 * By default, access to unaligned memory is controlled by `memcpy()`, which is
 * safe and portable.
 *
 * Unfortunately, on some target/compiler combinations, the generated assembly
 * is sub-optimal.
 *
 * The switch below allows selection of a different access method
 * in the search for improved performance.
 *
 * @par Possible options:
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
 *   @par
 *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
 *     eliminate the function call and treat it as an unaligned access.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
 *   @par
 *     Depends on compiler extensions and is therefore not portable.
 *     This method is safe _if_ your compiler supports it,
 *     and *generally* as fast or faster than `memcpy`.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
 *   @par
 *     Casts directly and dereferences. This method doesn't depend on the
 *     compiler, but it violates the C standard as it directly dereferences an
 *     unaligned pointer. It can generate buggy code on targets which do not
 *     support unaligned memory accesses, but in some circumstances, it's the
 *     only known way to get the most performance.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
 *   @par
 *     Also portable. This can generate the best code on old compilers which don't
 *     inline small `memcpy()` calls, and it might also be faster on big-endian
 *     systems which lack a native byteswap instruction. However, some compilers
 *     will emit literal byteshifts even if the target supports unaligned access.
 *  .
 *
 * @warning
 *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
 *   care, as what works on one compiler/platform/optimization level may cause
 *   another to read garbage data or even crash.
 *
 * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
 *
 * Prefer these methods in priority order (0 > 3 > 1 > 2)
 */
#  define XXH_FORCE_MEMORY_ACCESS 0
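/*
 * Tuning macros such as the one above are meant to be set before this header
 * is pulled in, or on the compiler command line (e.g. -DXXH_FORCE_MEMORY_ACCESS=3).
 * A minimal sketch, assuming a translation unit that uses the inline mode
 * documented at the top of this header; the value 3 (byteshift) is only an example:
 * @code{.c}
 *    #define XXH_FORCE_MEMORY_ACCESS 3   // pick the byteshift access method
 *    #define XXH_INLINE_ALL              // inline the implementation into this unit
 *    #include "xxhash.h"
 * @endcode
 */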
/*!
 * @def XXH_FORCE_ALIGN_CHECK
 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
 * and XXH64() only).
 *
 * This is an important performance trick for architectures without decent
 * unaligned memory access performance.
 *
 * It checks for input alignment, and when conditions are met, uses a "fast
 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
 * faster_ read speed.
 *
 * The check costs one initial branch per hash, which is generally negligible,
 * but not zero.
 *
 * Moreover, it's not useful to generate an additional code path if memory
 * access uses the same instruction for both aligned and unaligned
 * addresses (e.g. x86 and aarch64).
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * The alignment check is automatically disabled on x86, x64 & arm64,
 * which are platforms known to offer good unaligned memory access performance.
 *
 * This option does not affect XXH3 (only XXH32 and XXH64).
 */
#  define XXH_FORCE_ALIGN_CHECK 0

/*!
 * @def XXH_NO_INLINE_HINTS
 * @brief When non-zero, sets all functions to `static`.
 *
 * By default, xxHash tries to force the compiler to inline almost all internal
 * functions.
 *
 * This can usually improve performance due to reduced jumping and improved
 * constant folding, but significantly increases the size of the binary, which
 * might not be favorable.
 *
 * Additionally, sometimes the forced inlining can be detrimental to performance,
 * depending on the architecture.
 *
 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
 * compiler full control on whether to inline or not.
 *
 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
 * -fno-inline with GCC or Clang, this will automatically be defined.
 */
#  define XXH_NO_INLINE_HINTS 0

/*!
 * @def XXH32_ENDJMP
 * @brief Whether to use a jump for `XXH32_finalize`.
 *
 * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
 * This is generally preferable for performance,
 * but depending on the exact architecture, a jmp may work better.
 *
 * This setting is only likely to make a difference for very small inputs.
 */
#  define XXH32_ENDJMP 0

/*!
 * @internal
 * @brief Redefines old internal names.
 *
 * For compatibility with code that uses xxHash's internals before the names
 * were changed to improve namespacing. There is no other reason to use this.
 */
#  define XXH_OLD_NAMES
#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. 
*/1393#endif /* XXH_DOXYGEN */1394/*!1395* @}1396*/13971398#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */1399/* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */1400# if !defined(__clang__) && \1401( \1402(defined(__INTEL_COMPILER) && !defined(_WIN32)) || \1403( \1404defined(__GNUC__) && ( \1405(defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \1406( \1407defined(__mips__) && \1408(__mips <= 5 || __mips_isa_rev < 6) && \1409(!defined(__mips16) || defined(__mips_mips16e2)) \1410) \1411) \1412) \1413)1414# define XXH_FORCE_MEMORY_ACCESS 11415# endif1416#endif14171418#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */1419# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \1420|| defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */1421# define XXH_FORCE_ALIGN_CHECK 01422# else1423# define XXH_FORCE_ALIGN_CHECK 11424# endif1425#endif14261427#ifndef XXH_NO_INLINE_HINTS1428# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \1429|| defined(__NO_INLINE__) /* -O0, -fno-inline */1430# define XXH_NO_INLINE_HINTS 11431# else1432# define XXH_NO_INLINE_HINTS 01433# endif1434#endif14351436#ifndef XXH32_ENDJMP1437/* generally preferable for performance */1438# define XXH32_ENDJMP 01439#endif14401441/*!1442* @defgroup impl Implementation1443* @{1444*/144514461447/* *************************************1448* Includes & Memory related functions1449***************************************/1450/* Modify the local functions below should you wish to use some other memory routines */1451/* for ZSTD_malloc(), ZSTD_free() */1452#define ZSTD_DEPS_NEED_MALLOC1453#include "zstd_deps.h" /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */1454static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); }1455static void XXH_free (void* p) { ZSTD_free(p); }1456static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest,src,size); }145714581459/* *************************************1460* Compiler Specific Options1461***************************************/1462#ifdef _MSC_VER /* Visual Studio warning fix */1463# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */1464#endif14651466#if XXH_NO_INLINE_HINTS /* disable inlining hints */1467# if defined(__GNUC__) || defined(__clang__)1468# define XXH_FORCE_INLINE static __attribute__((unused))1469# else1470# define XXH_FORCE_INLINE static1471# endif1472# define XXH_NO_INLINE static1473/* enable inlining hints */1474#elif defined(__GNUC__) || defined(__clang__)1475# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))1476# define XXH_NO_INLINE static __attribute__((noinline))1477#elif defined(_MSC_VER) /* Visual Studio */1478# define XXH_FORCE_INLINE static __forceinline1479# define XXH_NO_INLINE static __declspec(noinline)1480#elif defined (__cplusplus) \1481|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */1482# define XXH_FORCE_INLINE static inline1483# define XXH_NO_INLINE static1484#else1485# define XXH_FORCE_INLINE static1486# define XXH_NO_INLINE static1487#endif1488148914901491/* *************************************1492* Debug1493***************************************/1494/*!1495* @ingroup tuning1496* @def XXH_DEBUGLEVEL1497* @brief Sets the debugging level.1498*1499* XXH_DEBUGLEVEL is expected to be defined externally, typically via the1500* compiler's command line options. 
The value must be a number.1501*/1502#ifndef XXH_DEBUGLEVEL1503# ifdef DEBUGLEVEL /* backwards compat */1504# define XXH_DEBUGLEVEL DEBUGLEVEL1505# else1506# define XXH_DEBUGLEVEL 01507# endif1508#endif15091510#if (XXH_DEBUGLEVEL>=1)1511# include <assert.h> /* note: can still be disabled with NDEBUG */1512# define XXH_ASSERT(c) assert(c)1513#else1514# define XXH_ASSERT(c) ((void)0)1515#endif15161517/* note: use after variable declarations */1518#ifndef XXH_STATIC_ASSERT1519# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */1520# include <assert.h>1521# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)1522# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */1523# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)1524# else1525# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)1526# endif1527# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)1528#endif15291530/*!1531* @internal1532* @def XXH_COMPILER_GUARD(var)1533* @brief Used to prevent unwanted optimizations for @p var.1534*1535* It uses an empty GCC inline assembly statement with a register constraint1536* which forces @p var into a general purpose register (e.g. eax, ebx, ecx1537* on x86) and marks it as modified.1538*1539* This is used in a few places to avoid unwanted autovectorization (e.g.1540* XXH32_round()). All vectorization we want is explicit via intrinsics,1541* and _usually_ isn't wanted elsewhere.1542*1543* We also use it to prevent unwanted constant folding for AArch64 in1544* XXH3_initCustomSecret_scalar().1545*/1546#if defined(__GNUC__) || defined(__clang__)1547# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))1548#else1549# define XXH_COMPILER_GUARD(var) ((void)0)1550#endif15511552/* *************************************1553* Basic Types1554***************************************/1555#if !defined (__VMS) \1556&& (defined (__cplusplus) \1557|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )1558# include <stdint.h>1559typedef uint8_t xxh_u8;1560#else1561typedef unsigned char xxh_u8;1562#endif1563typedef XXH32_hash_t xxh_u32;15641565#ifdef XXH_OLD_NAMES1566# define BYTE xxh_u81567# define U8 xxh_u81568# define U32 xxh_u321569#endif15701571/* *** Memory access *** */15721573/*!1574* @internal1575* @fn xxh_u32 XXH_read32(const void* ptr)1576* @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.1577*1578* Affected by @ref XXH_FORCE_MEMORY_ACCESS.1579*1580* @param ptr The pointer to read from.1581* @return The 32-bit native endian integer from the bytes at @p ptr.1582*/15831584/*!1585* @internal1586* @fn xxh_u32 XXH_readLE32(const void* ptr)1587* @brief Reads an unaligned 32-bit little endian integer from @p ptr.1588*1589* Affected by @ref XXH_FORCE_MEMORY_ACCESS.1590*1591* @param ptr The pointer to read from.1592* @return The 32-bit little endian integer from the bytes at @p ptr.1593*/15941595/*!1596* @internal1597* @fn xxh_u32 XXH_readBE32(const void* ptr)1598* @brief Reads an unaligned 32-bit big endian integer from @p ptr.1599*1600* Affected by @ref XXH_FORCE_MEMORY_ACCESS.1601*1602* @param ptr The pointer to read from.1603* @return The 32-bit big endian integer from the bytes at @p ptr.1604*/16051606/*!1607* @internal1608* @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)1609* @brief Like @ref XXH_readLE32(), but has an option for aligned reads.1610*1611* Affected by @ref 
XXH_FORCE_MEMORY_ACCESS.1612* Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is1613* always @ref XXH_alignment::XXH_unaligned.1614*1615* @param ptr The pointer to read from.1616* @param align Whether @p ptr is aligned.1617* @pre1618* If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte1619* aligned.1620* @return The 32-bit little endian integer from the bytes at @p ptr.1621*/16221623#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))1624/*1625* Manual byteshift. Best for old compilers which don't inline memcpy.1626* We actually directly use XXH_readLE32 and XXH_readBE32.1627*/1628#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))16291630/*1631* Force direct memory access. Only works on CPU which support unaligned memory1632* access in hardware.1633*/1634static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }16351636#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))16371638/*1639* __pack instructions are safer but compiler specific, hence potentially1640* problematic for some compilers.1641*1642* Currently only defined for GCC and ICC.1643*/1644#ifdef XXH_OLD_NAMES1645typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;1646#endif1647static xxh_u32 XXH_read32(const void* ptr)1648{1649typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;1650return ((const xxh_unalign*)ptr)->u32;1651}16521653#else16541655/*1656* Portable and safe solution. Generally efficient.1657* see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html1658*/1659static xxh_u32 XXH_read32(const void* memPtr)1660{1661xxh_u32 val;1662XXH_memcpy(&val, memPtr, sizeof(val));1663return val;1664}16651666#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */166716681669/* *** Endianness *** */16701671/*!1672* @ingroup tuning1673* @def XXH_CPU_LITTLE_ENDIAN1674* @brief Whether the target is little endian.1675*1676* Defined to 1 if the target is little endian, or 0 if it is big endian.1677* It can be defined externally, for example on the compiler command line.1678*1679* If it is not defined,1680* a runtime check (which is usually constant folded) is used instead.1681*1682* @note1683* This is not necessarily defined to an integer constant.1684*1685* @see XXH_isLittleEndian() for the runtime check.1686*/1687#ifndef XXH_CPU_LITTLE_ENDIAN1688/*1689* Try to detect endianness automatically, to avoid the nonstandard behavior1690* in `XXH_isLittleEndian()`1691*/1692# if defined(_WIN32) /* Windows is always little endian */ \1693|| defined(__LITTLE_ENDIAN__) \1694|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)1695# define XXH_CPU_LITTLE_ENDIAN 11696# elif defined(__BIG_ENDIAN__) \1697|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)1698# define XXH_CPU_LITTLE_ENDIAN 01699# else1700/*!1701* @internal1702* @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.1703*1704* Most compilers will constant fold this.1705*/1706static int XXH_isLittleEndian(void)1707{1708/*1709* Portable and well-defined behavior.1710* Don't use static: it is detrimental to performance.1711*/1712const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };1713return one.c[0];1714}1715# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()1716# endif1717#endif17181719172017211722/* ****************************************1723* Compiler-specific Functions and Macros1724******************************************/1725#define XXH_GCC_VERSION (__GNUC__ * 100 + 
__GNUC_MINOR__)17261727#ifdef __has_builtin1728# define XXH_HAS_BUILTIN(x) __has_builtin(x)1729#else1730# define XXH_HAS_BUILTIN(x) 01731#endif17321733/*!1734* @internal1735* @def XXH_rotl32(x,r)1736* @brief 32-bit rotate left.1737*1738* @param x The 32-bit integer to be rotated.1739* @param r The number of bits to rotate.1740* @pre1741* @p r > 0 && @p r < 321742* @note1743* @p x and @p r may be evaluated multiple times.1744* @return The rotated result.1745*/1746#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \1747&& XXH_HAS_BUILTIN(__builtin_rotateleft64)1748# define XXH_rotl32 __builtin_rotateleft321749# define XXH_rotl64 __builtin_rotateleft641750/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */1751#elif defined(_MSC_VER)1752# define XXH_rotl32(x,r) _rotl(x,r)1753# define XXH_rotl64(x,r) _rotl64(x,r)1754#else1755# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))1756# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))1757#endif17581759/*!1760* @internal1761* @fn xxh_u32 XXH_swap32(xxh_u32 x)1762* @brief A 32-bit byteswap.1763*1764* @param x The 32-bit integer to byteswap.1765* @return @p x, byteswapped.1766*/1767#if defined(_MSC_VER) /* Visual Studio */1768# define XXH_swap32 _byteswap_ulong1769#elif XXH_GCC_VERSION >= 4031770# define XXH_swap32 __builtin_bswap321771#else1772static xxh_u32 XXH_swap32 (xxh_u32 x)1773{1774return ((x << 24) & 0xff000000 ) |1775((x << 8) & 0x00ff0000 ) |1776((x >> 8) & 0x0000ff00 ) |1777((x >> 24) & 0x000000ff );1778}1779#endif178017811782/* ***************************1783* Memory reads1784*****************************/17851786/*!1787* @internal1788* @brief Enum to indicate whether a pointer is aligned.1789*/1790typedef enum {1791XXH_aligned, /*!< Aligned */1792XXH_unaligned /*!< Possibly unaligned */1793} XXH_alignment;17941795/*1796* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.1797*1798* This is ideal for older compilers which don't inline memcpy.1799*/1800#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))18011802XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)1803{1804const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;1805return bytePtr[0]1806| ((xxh_u32)bytePtr[1] << 8)1807| ((xxh_u32)bytePtr[2] << 16)1808| ((xxh_u32)bytePtr[3] << 24);1809}18101811XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)1812{1813const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;1814return bytePtr[3]1815| ((xxh_u32)bytePtr[2] << 8)1816| ((xxh_u32)bytePtr[1] << 16)1817| ((xxh_u32)bytePtr[0] << 24);1818}18191820#else1821XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)1822{1823return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));1824}18251826static xxh_u32 XXH_readBE32(const void* ptr)1827{1828return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);1829}1830#endif18311832XXH_FORCE_INLINE xxh_u321833XXH_readLE32_align(const void* ptr, XXH_alignment align)1834{1835if (align==XXH_unaligned) {1836return XXH_readLE32(ptr);1837} else {1838return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);1839}1840}184118421843/* *************************************1844* Misc1845***************************************/1846/*! 
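 * Illustrative note (an assumption, based on how XXH_VERSION_NUMBER is defined
 * earlier in this header): the returned value encodes
 * MAJOR*100*100 + MINOR*100 + RELEASE, so version 0.8.1 reports 801, and a
 * caller could gate on a minimum version like this:
 *
 *     if (XXH_versionNumber() >= 801) { use_newer_api(); }   // use_newer_api() is a hypothetical helper
 *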
@ingroup public */1847XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }184818491850/* *******************************************************************1851* 32-bit hash functions1852*********************************************************************/1853/*!1854* @}1855* @defgroup xxh32_impl XXH32 implementation1856* @ingroup impl1857* @{1858*/1859/* #define instead of static const, to be used as initializers */1860#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */1861#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */1862#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */1863#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */1864#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */18651866#ifdef XXH_OLD_NAMES1867# define PRIME32_1 XXH_PRIME32_11868# define PRIME32_2 XXH_PRIME32_21869# define PRIME32_3 XXH_PRIME32_31870# define PRIME32_4 XXH_PRIME32_41871# define PRIME32_5 XXH_PRIME32_51872#endif18731874/*!1875* @internal1876* @brief Normal stripe processing routine.1877*1878* This shuffles the bits so that any bit from @p input impacts several bits in1879* @p acc.1880*1881* @param acc The accumulator lane.1882* @param input The stripe of input to mix.1883* @return The mixed accumulator lane.1884*/1885static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)1886{1887acc += input * XXH_PRIME32_2;1888acc = XXH_rotl32(acc, 13);1889acc *= XXH_PRIME32_1;1890#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)1891/*1892* UGLY HACK:1893* A compiler fence is the only thing that prevents GCC and Clang from1894* autovectorizing the XXH32 loop (pragmas and attributes don't work for some1895* reason) without globally disabling SSE4.1.1896*1897* The reason we want to avoid vectorization is because despite working on1898* 4 integers at a time, there are multiple factors slowing XXH32 down on1899* SSE4:1900* - There's a ridiculous amount of lag from pmulld (10 cycles of latency on1901* newer chips!) making it slightly slower to multiply four integers at1902* once compared to four integers independently. Even when pmulld was1903* fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE1904* just to multiply unless doing a long operation.1905*1906* - Four instructions are required to rotate,1907* movqda tmp, v // not required with VEX encoding1908* pslld tmp, 13 // tmp <<= 131909* psrld v, 19 // x >>= 191910* por v, tmp // x |= tmp1911* compared to one for scalar:1912* roll v, 13 // reliably fast across the board1913* shldl v, v, 13 // Sandy Bridge and later prefer this for some reason1914*1915* - Instruction level parallelism is actually more beneficial here because1916* the SIMD actually serializes this operation: While v1 is rotating, v21917* can load data, while v3 can multiply. 
SSE forces them to operate1918* together.1919*1920* This is also enabled on AArch64, as Clang autovectorizes it incorrectly1921* and it is pointless writing a NEON implementation that is basically the1922* same speed as scalar for XXH32.1923*/1924XXH_COMPILER_GUARD(acc);1925#endif1926return acc;1927}19281929/*!1930* @internal1931* @brief Mixes all bits to finalize the hash.1932*1933* The final mix ensures that all input bits have a chance to impact any bit in1934* the output digest, resulting in an unbiased distribution.1935*1936* @param h32 The hash to avalanche.1937* @return The avalanched hash.1938*/1939static xxh_u32 XXH32_avalanche(xxh_u32 h32)1940{1941h32 ^= h32 >> 15;1942h32 *= XXH_PRIME32_2;1943h32 ^= h32 >> 13;1944h32 *= XXH_PRIME32_3;1945h32 ^= h32 >> 16;1946return(h32);1947}19481949#define XXH_get32bits(p) XXH_readLE32_align(p, align)19501951/*!1952* @internal1953* @brief Processes the last 0-15 bytes of @p ptr.1954*1955* There may be up to 15 bytes remaining to consume from the input.1956* This final stage will digest them to ensure that all input bytes are present1957* in the final mix.1958*1959* @param h32 The hash to finalize.1960* @param ptr The pointer to the remaining input.1961* @param len The remaining length, modulo 16.1962* @param align Whether @p ptr is aligned.1963* @return The finalized hash.1964*/1965static xxh_u321966XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)1967{1968#define XXH_PROCESS1 do { \1969h32 += (*ptr++) * XXH_PRIME32_5; \1970h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \1971} while (0)19721973#define XXH_PROCESS4 do { \1974h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \1975ptr += 4; \1976h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \1977} while (0)19781979if (ptr==NULL) XXH_ASSERT(len == 0);19801981/* Compact rerolled version; generally faster */1982if (!XXH32_ENDJMP) {1983len &= 15;1984while (len >= 4) {1985XXH_PROCESS4;1986len -= 4;1987}1988while (len > 0) {1989XXH_PROCESS1;1990--len;1991}1992return XXH32_avalanche(h32);1993} else {1994switch(len&15) /* or switch(bEnd - p) */ {1995case 12: XXH_PROCESS4;1996XXH_FALLTHROUGH;1997case 8: XXH_PROCESS4;1998XXH_FALLTHROUGH;1999case 4: XXH_PROCESS4;2000return XXH32_avalanche(h32);20012002case 13: XXH_PROCESS4;2003XXH_FALLTHROUGH;2004case 9: XXH_PROCESS4;2005XXH_FALLTHROUGH;2006case 5: XXH_PROCESS4;2007XXH_PROCESS1;2008return XXH32_avalanche(h32);20092010case 14: XXH_PROCESS4;2011XXH_FALLTHROUGH;2012case 10: XXH_PROCESS4;2013XXH_FALLTHROUGH;2014case 6: XXH_PROCESS4;2015XXH_PROCESS1;2016XXH_PROCESS1;2017return XXH32_avalanche(h32);20182019case 15: XXH_PROCESS4;2020XXH_FALLTHROUGH;2021case 11: XXH_PROCESS4;2022XXH_FALLTHROUGH;2023case 7: XXH_PROCESS4;2024XXH_FALLTHROUGH;2025case 3: XXH_PROCESS1;2026XXH_FALLTHROUGH;2027case 2: XXH_PROCESS1;2028XXH_FALLTHROUGH;2029case 1: XXH_PROCESS1;2030XXH_FALLTHROUGH;2031case 0: return XXH32_avalanche(h32);2032}2033XXH_ASSERT(0);2034return h32; /* reaching this point is deemed impossible */2035}2036}20372038#ifdef XXH_OLD_NAMES2039# define PROCESS1 XXH_PROCESS12040# define PROCESS4 XXH_PROCESS42041#else2042# undef XXH_PROCESS12043# undef XXH_PROCESS42044#endif20452046/*!2047* @internal2048* @brief The implementation for @ref XXH32().2049*2050* @param input , len , seed Directly passed from @ref XXH32().2051* @param align Whether @p input is aligned.2052* @return The calculated hash.2053*/2054XXH_FORCE_INLINE xxh_u322055XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)2056{2057xxh_u32 h32;20582059if 
(input==NULL) XXH_ASSERT(len == 0);20602061if (len>=16) {2062const xxh_u8* const bEnd = input + len;2063const xxh_u8* const limit = bEnd - 15;2064xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;2065xxh_u32 v2 = seed + XXH_PRIME32_2;2066xxh_u32 v3 = seed + 0;2067xxh_u32 v4 = seed - XXH_PRIME32_1;20682069do {2070v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;2071v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;2072v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;2073v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;2074} while (input < limit);20752076h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)2077+ XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);2078} else {2079h32 = seed + XXH_PRIME32_5;2080}20812082h32 += (xxh_u32)len;20832084return XXH32_finalize(h32, input, len&15, align);2085}20862087/*! @ingroup xxh32_family */2088XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)2089{2090#if 02091/* Simple version, good for code maintenance, but unfortunately slow for small inputs */2092XXH32_state_t state;2093XXH32_reset(&state, seed);2094XXH32_update(&state, (const xxh_u8*)input, len);2095return XXH32_digest(&state);2096#else2097if (XXH_FORCE_ALIGN_CHECK) {2098if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */2099return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);2100} }21012102return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);2103#endif2104}2105210621072108/******* Hash streaming *******/2109/*!2110* @ingroup xxh32_family2111*/2112XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)2113{2114return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));2115}2116/*! @ingroup xxh32_family */2117XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)2118{2119XXH_free(statePtr);2120return XXH_OK;2121}21222123/*! @ingroup xxh32_family */2124XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)2125{2126XXH_memcpy(dstState, srcState, sizeof(*dstState));2127}21282129/*! @ingroup xxh32_family */2130XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)2131{2132XXH_ASSERT(statePtr != NULL);2133memset(statePtr, 0, sizeof(*statePtr));2134statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;2135statePtr->v[1] = seed + XXH_PRIME32_2;2136statePtr->v[2] = seed + 0;2137statePtr->v[3] = seed - XXH_PRIME32_1;2138return XXH_OK;2139}214021412142/*! 
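 * Illustrative streaming usage of the XXH32 functions defined in this file
 * (a sketch only; readMore() is a hypothetical data source, and error
 * handling is omitted):
 *
 *     XXH32_state_t* const state = XXH32_createState();
 *     XXH32_reset(state, seed);
 *     while (readMore(&buffer, &length))
 *         XXH32_update(state, buffer, length);
 *     hash = XXH32_digest(state);
 *     XXH32_freeState(state);
 *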
@ingroup xxh32_family */2143XXH_PUBLIC_API XXH_errorcode2144XXH32_update(XXH32_state_t* state, const void* input, size_t len)2145{2146if (input==NULL) {2147XXH_ASSERT(len == 0);2148return XXH_OK;2149}21502151{ const xxh_u8* p = (const xxh_u8*)input;2152const xxh_u8* const bEnd = p + len;21532154state->total_len_32 += (XXH32_hash_t)len;2155state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));21562157if (state->memsize + len < 16) { /* fill in tmp buffer */2158XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);2159state->memsize += (XXH32_hash_t)len;2160return XXH_OK;2161}21622163if (state->memsize) { /* some data left from previous update */2164XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);2165{ const xxh_u32* p32 = state->mem32;2166state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;2167state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;2168state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;2169state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));2170}2171p += 16-state->memsize;2172state->memsize = 0;2173}21742175if (p <= bEnd-16) {2176const xxh_u8* const limit = bEnd - 16;21772178do {2179state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;2180state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;2181state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;2182state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;2183} while (p<=limit);21842185}21862187if (p < bEnd) {2188XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));2189state->memsize = (unsigned)(bEnd-p);2190}2191}21922193return XXH_OK;2194}219521962197/*! @ingroup xxh32_family */2198XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)2199{2200xxh_u32 h32;22012202if (state->large_len) {2203h32 = XXH_rotl32(state->v[0], 1)2204+ XXH_rotl32(state->v[1], 7)2205+ XXH_rotl32(state->v[2], 12)2206+ XXH_rotl32(state->v[3], 18);2207} else {2208h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;2209}22102211h32 += state->total_len_32;22122213return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);2214}221522162217/******* Canonical representation *******/22182219/*!2220* @ingroup xxh32_family2221* The default return values from XXH functions are unsigned 32 and 64 bit2222* integers.2223*2224* The canonical representation uses big endian convention, the same convention2225* as human-readable numbers (large digits first).2226*2227* This way, hash values can be written into a file or buffer, remaining2228* comparable across different systems.2229*2230* The following functions allow transformation of hash values to and from their2231* canonical format.2232*/2233XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)2234{2235/* XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); */2236if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);2237XXH_memcpy(dst, &hash, sizeof(*dst));2238}2239/*! 
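 * Illustrative round-trip through the canonical form (a sketch; the file I/O
 * is a hypothetical use case):
 *
 *     XXH32_canonical_t canon;
 *     XXH32_canonicalFromHash(&canon, hash);   // stored big endian on every platform
 *     fwrite(&canon, sizeof(canon), 1, file);
 *     ...                                      // later, possibly on another machine
 *     XXH32_hash_t const restored = XXH32_hashFromCanonical(&canon);
 *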
@ingroup xxh32_family */2240XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)2241{2242return XXH_readBE32(src);2243}224422452246#ifndef XXH_NO_LONG_LONG22472248/* *******************************************************************2249* 64-bit hash functions2250*********************************************************************/2251/*!2252* @}2253* @ingroup impl2254* @{2255*/2256/******* Memory access *******/22572258typedef XXH64_hash_t xxh_u64;22592260#ifdef XXH_OLD_NAMES2261# define U64 xxh_u642262#endif22632264#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))2265/*2266* Manual byteshift. Best for old compilers which don't inline memcpy.2267* We actually directly use XXH_readLE64 and XXH_readBE64.2268*/2269#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))22702271/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */2272static xxh_u64 XXH_read64(const void* memPtr)2273{2274return *(const xxh_u64*) memPtr;2275}22762277#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))22782279/*2280* __pack instructions are safer, but compiler specific, hence potentially2281* problematic for some compilers.2282*2283* Currently only defined for GCC and ICC.2284*/2285#ifdef XXH_OLD_NAMES2286typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;2287#endif2288static xxh_u64 XXH_read64(const void* ptr)2289{2290typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;2291return ((const xxh_unalign64*)ptr)->u64;2292}22932294#else22952296/*2297* Portable and safe solution. Generally efficient.2298* see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html2299*/2300static xxh_u64 XXH_read64(const void* memPtr)2301{2302xxh_u64 val;2303XXH_memcpy(&val, memPtr, sizeof(val));2304return val;2305}23062307#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */23082309#if defined(_MSC_VER) /* Visual Studio */2310# define XXH_swap64 _byteswap_uint642311#elif XXH_GCC_VERSION >= 4032312# define XXH_swap64 __builtin_bswap642313#else2314static xxh_u64 XXH_swap64(xxh_u64 x)2315{2316return ((x << 56) & 0xff00000000000000ULL) |2317((x << 40) & 0x00ff000000000000ULL) |2318((x << 24) & 0x0000ff0000000000ULL) |2319((x << 8) & 0x000000ff00000000ULL) |2320((x >> 8) & 0x00000000ff000000ULL) |2321((x >> 24) & 0x0000000000ff0000ULL) |2322((x >> 40) & 0x000000000000ff00ULL) |2323((x >> 56) & 0x00000000000000ffULL);2324}2325#endif232623272328/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */2329#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))23302331XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)2332{2333const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2334return bytePtr[0]2335| ((xxh_u64)bytePtr[1] << 8)2336| ((xxh_u64)bytePtr[2] << 16)2337| ((xxh_u64)bytePtr[3] << 24)2338| ((xxh_u64)bytePtr[4] << 32)2339| ((xxh_u64)bytePtr[5] << 40)2340| ((xxh_u64)bytePtr[6] << 48)2341| ((xxh_u64)bytePtr[7] << 56);2342}23432344XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)2345{2346const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2347return bytePtr[7]2348| ((xxh_u64)bytePtr[6] << 8)2349| ((xxh_u64)bytePtr[5] << 16)2350| ((xxh_u64)bytePtr[4] << 24)2351| ((xxh_u64)bytePtr[3] << 32)2352| ((xxh_u64)bytePtr[2] << 40)2353| ((xxh_u64)bytePtr[1] << 48)2354| ((xxh_u64)bytePtr[0] << 56);2355}23562357#else2358XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)2359{2360return XXH_CPU_LITTLE_ENDIAN ? 
XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));2361}23622363static xxh_u64 XXH_readBE64(const void* ptr)2364{2365return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);2366}2367#endif23682369XXH_FORCE_INLINE xxh_u642370XXH_readLE64_align(const void* ptr, XXH_alignment align)2371{2372if (align==XXH_unaligned)2373return XXH_readLE64(ptr);2374else2375return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);2376}237723782379/******* xxh64 *******/2380/*!2381* @}2382* @defgroup xxh64_impl XXH64 implementation2383* @ingroup impl2384* @{2385*/2386/* #define rather that static const, to be used as initializers */2387#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */2388#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */2389#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */2390#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */2391#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */23922393#ifdef XXH_OLD_NAMES2394# define PRIME64_1 XXH_PRIME64_12395# define PRIME64_2 XXH_PRIME64_22396# define PRIME64_3 XXH_PRIME64_32397# define PRIME64_4 XXH_PRIME64_42398# define PRIME64_5 XXH_PRIME64_52399#endif24002401static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)2402{2403acc += input * XXH_PRIME64_2;2404acc = XXH_rotl64(acc, 31);2405acc *= XXH_PRIME64_1;2406return acc;2407}24082409static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)2410{2411val = XXH64_round(0, val);2412acc ^= val;2413acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;2414return acc;2415}24162417static xxh_u64 XXH64_avalanche(xxh_u64 h64)2418{2419h64 ^= h64 >> 33;2420h64 *= XXH_PRIME64_2;2421h64 ^= h64 >> 29;2422h64 *= XXH_PRIME64_3;2423h64 ^= h64 >> 32;2424return h64;2425}242624272428#define XXH_get64bits(p) XXH_readLE64_align(p, align)24292430static xxh_u642431XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)2432{2433if (ptr==NULL) XXH_ASSERT(len == 0);2434len &= 31;2435while (len >= 8) {2436xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));2437ptr += 8;2438h64 ^= k1;2439h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;2440len -= 8;2441}2442if (len >= 4) {2443h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;2444ptr += 4;2445h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;2446len -= 4;2447}2448while (len > 0) {2449h64 ^= (*ptr++) * XXH_PRIME64_5;2450h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;2451--len;2452}2453return XXH64_avalanche(h64);2454}24552456#ifdef XXH_OLD_NAMES2457# define PROCESS1_64 XXH_PROCESS1_642458# define PROCESS4_64 XXH_PROCESS4_642459# define PROCESS8_64 XXH_PROCESS8_642460#else2461# undef XXH_PROCESS1_642462# undef XXH_PROCESS4_642463# undef XXH_PROCESS8_642464#endif24652466XXH_FORCE_INLINE xxh_u642467XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)2468{2469xxh_u64 h64;2470if (input==NULL) XXH_ASSERT(len == 0);24712472if (len>=32) {2473const xxh_u8* const bEnd = input + len;2474const xxh_u8* const limit = bEnd - 31;2475xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;2476xxh_u64 v2 = seed + XXH_PRIME64_2;2477xxh_u64 v3 = seed + 0;2478xxh_u64 v4 = seed - XXH_PRIME64_1;24792480do {2481v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;2482v2 = 
XXH64_round(v2, XXH_get64bits(input)); input+=8;2483v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;2484v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;2485} while (input<limit);24862487h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);2488h64 = XXH64_mergeRound(h64, v1);2489h64 = XXH64_mergeRound(h64, v2);2490h64 = XXH64_mergeRound(h64, v3);2491h64 = XXH64_mergeRound(h64, v4);24922493} else {2494h64 = seed + XXH_PRIME64_5;2495}24962497h64 += (xxh_u64) len;24982499return XXH64_finalize(h64, input, len, align);2500}250125022503/*! @ingroup xxh64_family */2504XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)2505{2506#if 02507/* Simple version, good for code maintenance, but unfortunately slow for small inputs */2508XXH64_state_t state;2509XXH64_reset(&state, seed);2510XXH64_update(&state, (const xxh_u8*)input, len);2511return XXH64_digest(&state);2512#else2513if (XXH_FORCE_ALIGN_CHECK) {2514if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */2515return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);2516} }25172518return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);25192520#endif2521}25222523/******* Hash Streaming *******/25242525/*! @ingroup xxh64_family*/2526XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)2527{2528return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));2529}2530/*! @ingroup xxh64_family */2531XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)2532{2533XXH_free(statePtr);2534return XXH_OK;2535}25362537/*! @ingroup xxh64_family */2538XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)2539{2540XXH_memcpy(dstState, srcState, sizeof(*dstState));2541}25422543/*! @ingroup xxh64_family */2544XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)2545{2546XXH_ASSERT(statePtr != NULL);2547memset(statePtr, 0, sizeof(*statePtr));2548statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;2549statePtr->v[1] = seed + XXH_PRIME64_2;2550statePtr->v[2] = seed + 0;2551statePtr->v[3] = seed - XXH_PRIME64_1;2552return XXH_OK;2553}25542555/*! 
@ingroup xxh64_family */2556XXH_PUBLIC_API XXH_errorcode2557XXH64_update (XXH64_state_t* state, const void* input, size_t len)2558{2559if (input==NULL) {2560XXH_ASSERT(len == 0);2561return XXH_OK;2562}25632564{ const xxh_u8* p = (const xxh_u8*)input;2565const xxh_u8* const bEnd = p + len;25662567state->total_len += len;25682569if (state->memsize + len < 32) { /* fill in tmp buffer */2570XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);2571state->memsize += (xxh_u32)len;2572return XXH_OK;2573}25742575if (state->memsize) { /* tmp buffer is full */2576XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);2577state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));2578state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));2579state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));2580state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));2581p += 32 - state->memsize;2582state->memsize = 0;2583}25842585if (p+32 <= bEnd) {2586const xxh_u8* const limit = bEnd - 32;25872588do {2589state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;2590state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;2591state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;2592state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;2593} while (p<=limit);25942595}25962597if (p < bEnd) {2598XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));2599state->memsize = (unsigned)(bEnd-p);2600}2601}26022603return XXH_OK;2604}260526062607/*! @ingroup xxh64_family */2608XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)2609{2610xxh_u64 h64;26112612if (state->total_len >= 32) {2613h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);2614h64 = XXH64_mergeRound(h64, state->v[0]);2615h64 = XXH64_mergeRound(h64, state->v[1]);2616h64 = XXH64_mergeRound(h64, state->v[2]);2617h64 = XXH64_mergeRound(h64, state->v[3]);2618} else {2619h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;2620}26212622h64 += (xxh_u64) state->total_len;26232624return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);2625}262626272628/******* Canonical representation *******/26292630/*! @ingroup xxh64_family */2631XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)2632{2633/* XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); */2634if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);2635XXH_memcpy(dst, &hash, sizeof(*dst));2636}26372638/*! @ingroup xxh64_family */2639XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)2640{2641return XXH_readBE64(src);2642}26432644#ifndef XXH_NO_XXH326452646/* *********************************************************************2647* XXH32648* New generation hash designed for speed on small keys and vectorization2649************************************************************************ */2650/*!2651* @}2652* @defgroup xxh3_impl XXH3 implementation2653* @ingroup impl2654* @{2655*/26562657/* === Compiler specifics === */26582659#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. 
Tested with GCC 5.5 */2660# define XXH_RESTRICT /* disable */2661#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */2662# define XXH_RESTRICT restrict2663#else2664/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */2665# define XXH_RESTRICT /* disable */2666#endif26672668#if (defined(__GNUC__) && (__GNUC__ >= 3)) \2669|| (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \2670|| defined(__clang__)2671# define XXH_likely(x) __builtin_expect(x, 1)2672# define XXH_unlikely(x) __builtin_expect(x, 0)2673#else2674# define XXH_likely(x) (x)2675# define XXH_unlikely(x) (x)2676#endif26772678#if defined(__GNUC__) || defined(__clang__)2679# if defined(__ARM_NEON__) || defined(__ARM_NEON) \2680|| defined(__aarch64__) || defined(_M_ARM) \2681|| defined(_M_ARM64) || defined(_M_ARM64EC)2682# define inline __inline__ /* circumvent a clang bug */2683# include <arm_neon.h>2684# undef inline2685# elif defined(__AVX2__)2686# include <immintrin.h>2687# elif defined(__SSE2__)2688# include <emmintrin.h>2689# endif2690#endif26912692#if defined(_MSC_VER)2693# include <intrin.h>2694#endif26952696/*2697* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while2698* remaining a true 64-bit/128-bit hash function.2699*2700* This is done by prioritizing a subset of 64-bit operations that can be2701* emulated without too many steps on the average 32-bit machine.2702*2703* For example, these two lines seem similar, and run equally fast on 64-bit:2704*2705* xxh_u64 x;2706* x ^= (x >> 47); // good2707* x ^= (x >> 13); // bad2708*2709* However, to a 32-bit machine, there is a major difference.2710*2711* x ^= (x >> 47) looks like this:2712*2713* x.lo ^= (x.hi >> (47 - 32));2714*2715* while x ^= (x >> 13) looks like this:2716*2717* // note: funnel shifts are not usually cheap.2718* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));2719* x.hi ^= (x.hi >> 13);2720*2721* The first one is significantly faster than the second, simply because the2722* shift is larger than 32. This means:2723* - All the bits we need are in the upper 32 bits, so we can ignore the lower2724* 32 bits in the shift.2725* - The shift result will always fit in the lower 32 bits, and therefore,2726* we can ignore the upper 32 bits in the xor.2727*2728* Thanks to this optimization, XXH3 only requires these features to be efficient:2729*2730* - Usable unaligned access2731* - A 32-bit or 64-bit ALU2732* - If 32-bit, a decent ADC instruction2733* - A 32 or 64-bit multiply with a 64-bit result2734* - For the 128-bit variant, a decent byteswap helps short inputs.2735*2736* The first two are already required by XXH32, and almost all 32-bit and 64-bit2737* platforms which can run XXH32 can run XXH3 efficiently.2738*2739* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one2740* notable exception.2741*2742* First of all, Thumb-1 lacks support for the UMULL instruction which2743* performs the important long multiply. This means numerous __aeabi_lmul2744* calls.2745*2746* Second of all, the 8 functional registers are just not enough.2747* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need2748* Lo registers, and this shuffling results in thousands more MOVs than A32.2749*2750* A32 and T32 don't have this limitation. 
They can access all 14 registers,2751* do a 32->64 multiply with UMULL, and the flexible operand allowing free2752* shifts is helpful, too.2753*2754* Therefore, we do a quick sanity check.2755*2756* If compiling Thumb-1 for a target which supports ARM instructions, we will2757* emit a warning, as it is not a "sane" platform to compile for.2758*2759* Usually, if this happens, it is because of an accident and you probably need2760* to specify -march, as you likely meant to compile for a newer architecture.2761*2762* Credit: large sections of the vectorial and asm source code paths2763* have been contributed by @easyaspi3142764*/2765#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)2766# warning "XXH3 is highly inefficient without ARM or Thumb-2."2767#endif27682769/* ==========================================2770* Vectorization detection2771* ========================================== */27722773#ifdef XXH_DOXYGEN2774/*!2775* @ingroup tuning2776* @brief Overrides the vectorization implementation chosen for XXH3.2777*2778* Can be defined to 0 to disable SIMD or any of the values mentioned in2779* @ref XXH_VECTOR_TYPE.2780*2781* If this is not defined, it uses predefined macros to determine the best2782* implementation.2783*/2784# define XXH_VECTOR XXH_SCALAR2785/*!2786* @ingroup tuning2787* @brief Possible values for @ref XXH_VECTOR.2788*2789* Note that these are actually implemented as macros.2790*2791* If this is not defined, it is detected automatically.2792* @ref XXH_X86DISPATCH overrides this.2793*/2794enum XXH_VECTOR_TYPE /* fake enum */ {2795XXH_SCALAR = 0, /*!< Portable scalar version */2796XXH_SSE2 = 1, /*!<2797* SSE2 for Pentium 4, Opteron, all x86_64.2798*2799* @note SSE2 is also guaranteed on Windows 10, macOS, and2800* Android x86.2801*/2802XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */2803XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */2804XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */2805XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */2806};2807/*!2808* @ingroup tuning2809* @brief Selects the minimum alignment for XXH3's accumulators.2810*2811* When using SIMD, this should match the alignment required for said vector2812* type, so, for example, 32 for AVX2.2813*2814* Default: Auto detected.2815*/2816# define XXH_ACC_ALIGN 82817#endif28182819/* Actual definition */2820#ifndef XXH_DOXYGEN2821# define XXH_SCALAR 02822# define XXH_SSE2 12823# define XXH_AVX2 22824# define XXH_AVX512 32825# define XXH_NEON 42826# define XXH_VSX 52827#endif28282829#ifndef XXH_VECTOR /* can be defined on command line */2830# if ( \2831defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \2832|| defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \2833) && ( \2834defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \2835|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \2836)2837# define XXH_VECTOR XXH_NEON2838# elif defined(__AVX512F__)2839# define XXH_VECTOR XXH_AVX5122840# elif defined(__AVX2__)2841# define XXH_VECTOR XXH_AVX22842# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))2843# define XXH_VECTOR XXH_SSE22844# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \2845|| (defined(__s390x__) && defined(__VEC__)) \2846&& defined(__GNUC__) /* TODO: IBM XL */2847# define XXH_VECTOR XXH_VSX2848# else2849# define XXH_VECTOR XXH_SCALAR2850# endif2851#endif28522853/*2854* Controls the alignment of the 
accumulator,2855* for compatibility with aligned vector loads, which are usually faster.2856*/2857#ifndef XXH_ACC_ALIGN2858# if defined(XXH_X86DISPATCH)2859# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */2860# elif XXH_VECTOR == XXH_SCALAR /* scalar */2861# define XXH_ACC_ALIGN 82862# elif XXH_VECTOR == XXH_SSE2 /* sse2 */2863# define XXH_ACC_ALIGN 162864# elif XXH_VECTOR == XXH_AVX2 /* avx2 */2865# define XXH_ACC_ALIGN 322866# elif XXH_VECTOR == XXH_NEON /* neon */2867# define XXH_ACC_ALIGN 162868# elif XXH_VECTOR == XXH_VSX /* vsx */2869# define XXH_ACC_ALIGN 162870# elif XXH_VECTOR == XXH_AVX512 /* avx512 */2871# define XXH_ACC_ALIGN 642872# endif2873#endif28742875#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \2876|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX5122877# define XXH_SEC_ALIGN XXH_ACC_ALIGN2878#else2879# define XXH_SEC_ALIGN 82880#endif28812882/*2883* UGLY HACK:2884* GCC usually generates the best code with -O3 for xxHash.2885*2886* However, when targeting AVX2, it is overzealous in its unrolling resulting2887* in code roughly 3/4 the speed of Clang.2888*2889* There are other issues, such as GCC splitting _mm256_loadu_si256 into2890* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which2891* only applies to Sandy and Ivy Bridge... which don't even support AVX2.2892*2893* That is why when compiling the AVX2 version, it is recommended to use either2894* -O2 -mavx2 -march=haswell2895* or2896* -O2 -mavx2 -mno-avx256-split-unaligned-load2897* for decent performance, or to use Clang instead.2898*2899* Fortunately, we can control the first one with a pragma that forces GCC into2900* -O2, but the other one we can't control without "failed to inline always2901* inline function due to target mismatch" warnings.2902*/2903#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \2904&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \2905&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */2906# pragma GCC push_options2907# pragma GCC optimize("-O2")2908#endif290929102911#if XXH_VECTOR == XXH_NEON2912/*2913* NEON's setup for vmlal_u32 is a little more complicated than it is on2914* SSE2, AVX2, and VSX.2915*2916* While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.2917*2918* To do the same operation, the 128-bit 'Q' register needs to be split into2919* two 64-bit 'D' registers, performing this operation::2920*2921* [ a | b ]2922* | '---------. .--------' |2923* | x |2924* | .---------' '--------. |2925* [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]2926*2927* Due to significant changes in aarch64, the fastest method for aarch64 is2928* completely different than the fastest method for ARMv7-A.2929*2930* ARMv7-A treats D registers as unions overlaying Q registers, so modifying2931* D11 will modify the high half of Q5. 
This is similar to how modifying AH2932* will only affect bits 8-15 of AX on x86.2933*2934* VZIP takes two registers, and puts even lanes in one register and odd lanes2935* in the other.2936*2937* On ARMv7-A, this strangely modifies both parameters in place instead of2938* taking the usual 3-operand form.2939*2940* Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the2941* lower and upper halves of the Q register to end up with the high and low2942* halves where we want - all in one instruction.2943*2944* vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }2945*2946* Unfortunately we need inline assembly for this: Instructions modifying two2947* registers at once is not possible in GCC or Clang's IR, and they have to2948* create a copy.2949*2950* aarch64 requires a different approach.2951*2952* In order to make it easier to write a decent compiler for aarch64, many2953* quirks were removed, such as conditional execution.2954*2955* NEON was also affected by this.2956*2957* aarch64 cannot access the high bits of a Q-form register, and writes to a2958* D-form register zero the high bits, similar to how writes to W-form scalar2959* registers (or DWORD registers on x86_64) work.2960*2961* The formerly free vget_high intrinsics now require a vext (with a few2962* exceptions)2963*2964* Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent2965* of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one2966* operand.2967*2968* The equivalent of the VZIP.32 on the lower and upper halves would be this2969* mess:2970*2971* ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }2972* zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }2973* zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }2974*2975* Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):2976*2977* shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);2978* xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);2979*2980* This is available on ARMv7-A, but is less efficient than a single VZIP.32.2981*/29822983/*!2984* Function-like macro:2985* void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)2986* {2987* outLo = (uint32x2_t)(in & 0xFFFFFFFF);2988* outHi = (uint32x2_t)(in >> 32);2989* in = UNDEFINED;2990* }2991*/2992# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \2993&& (defined(__GNUC__) || defined(__clang__)) \2994&& (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))2995# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \2996do { \2997/* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \2998/* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \2999/* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \3000__asm__("vzip.32 %e0, %f0" : "+w" (in)); \3001(outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \3002(outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \3003} while (0)3004# else3005# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \3006do { \3007(outLo) = vmovn_u64 (in); \3008(outHi) = vshrn_n_u64 ((in), 32); \3009} while (0)3010# endif30113012/*!3013* @ingroup tuning3014* @brief Controls the NEON to scalar ratio for XXH33015*3016* On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and3017* 2 lanes on scalar by default.3018*3019* This can be set to 2, 4, 6, or 8. 
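 * For example, a hypothetical build could pass -DXXH3_NEON_LANES=4 on the
 * compiler command line to split the 8 accumulator lanes evenly between the
 * NEON and scalar pipelines.
 *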
ARMv7 will default to all 8 NEON lanes, as the3020* emulated 64-bit arithmetic is too slow.3021*3022* Modern ARM CPUs are _very_ sensitive to how their pipelines are used.3023*3024* For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't3025* have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,3026* you are only using 2/3 of the CPU bandwidth.3027*3028* This is even more noticeable on the more advanced cores like the A76 which3029* can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.3030*3031* Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the3032* remaining lanes will use scalar instructions. This improves the bandwidth3033* and also gives the integer pipelines something to do besides twiddling loop3034* counters and pointers.3035*3036* This change benefits CPUs with large micro-op buffers without negatively affecting3037* other CPUs:3038*3039* | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |3040* |:----------------------|:--------------------|----------:|-----------:|------:|3041* | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |3042* | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |3043* | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |3044*3045* It also seems to fix some bad codegen on GCC, making it almost as fast as clang.3046*3047* @see XXH3_accumulate_512_neon()3048*/3049# ifndef XXH3_NEON_LANES3050# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \3051&& !defined(__OPTIMIZE_SIZE__)3052# define XXH3_NEON_LANES 63053# else3054# define XXH3_NEON_LANES XXH_ACC_NB3055# endif3056# endif3057#endif /* XXH_VECTOR == XXH_NEON */30583059/*3060* VSX and Z Vector helpers.3061*3062* This is very messy, and any pull requests to clean this up are welcome.3063*3064* There are a lot of problems with supporting VSX and s390x, due to3065* inconsistent intrinsics, spotty coverage, and multiple endiannesses.3066*/3067#if XXH_VECTOR == XXH_VSX3068# if defined(__s390x__)3069# include <s390intrin.h>3070# else3071/* gcc's altivec.h can have the unwanted consequence to unconditionally3072* #define bool, vector, and pixel keywords,3073* with bad consequences for programs already using these keywords for other purposes.3074* The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.3075* __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,3076* but it seems that, in some cases, it isn't.3077* Force the build macro to be defined, so that keywords are not altered.3078*/3079# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)3080# define __APPLE_ALTIVEC__3081# endif3082# include <altivec.h>3083# endif30843085typedef __vector unsigned long long xxh_u64x2;3086typedef __vector unsigned char xxh_u8x16;3087typedef __vector unsigned xxh_u32x4;30883089# ifndef XXH_VSX_BE3090# if defined(__BIG_ENDIAN__) \3091|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)3092# define XXH_VSX_BE 13093# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__3094# warning "-maltivec=be is not recommended. 
Please use native endianness."3095# define XXH_VSX_BE 13096# else3097# define XXH_VSX_BE 03098# endif3099# endif /* !defined(XXH_VSX_BE) */31003101# if XXH_VSX_BE3102# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))3103# define XXH_vec_revb vec_revb3104# else3105/*!3106* A polyfill for POWER9's vec_revb().3107*/3108XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)3109{3110xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,31110x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };3112return vec_perm(val, val, vByteSwap);3113}3114# endif3115# endif /* XXH_VSX_BE */31163117/*!3118* Performs an unaligned vector load and byte swaps it on big endian.3119*/3120XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)3121{3122xxh_u64x2 ret;3123XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));3124# if XXH_VSX_BE3125ret = XXH_vec_revb(ret);3126# endif3127return ret;3128}31293130/*3131* vec_mulo and vec_mule are very problematic intrinsics on PowerPC3132*3133* These intrinsics weren't added until GCC 8, despite existing for a while,3134* and they are endian dependent. Also, their meaning swap depending on version.3135* */3136# if defined(__s390x__)3137/* s390x is always big endian, no issue on this platform */3138# define XXH_vec_mulo vec_mulo3139# define XXH_vec_mule vec_mule3140# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)3141/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */3142# define XXH_vec_mulo __builtin_altivec_vmulouw3143# define XXH_vec_mule __builtin_altivec_vmuleuw3144# else3145/* gcc needs inline assembly */3146/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */3147XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)3148{3149xxh_u64x2 result;3150__asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));3151return result;3152}3153XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)3154{3155xxh_u64x2 result;3156__asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));3157return result;3158}3159# endif /* XXH_vec_mulo, XXH_vec_mule */3160#endif /* XXH_VECTOR == XXH_VSX */316131623163/* prefetch3164* can be disabled, by declaring XXH_NO_PREFETCH build macro */3165#if defined(XXH_NO_PREFETCH)3166# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */3167#else3168# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */3169# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */3170# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)3171# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )3172# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)3173# else3174# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */3175# endif3176#endif /* XXH_NO_PREFETCH */317731783179/* ==========================================3180* XXH3 default settings3181* ========================================== */31823183#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */31843185#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)3186# error "default keyset is not large enough"3187#endif31883189/*! Pseudorandom secret taken directly from FARSH. 
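 * It is XXH_SECRET_DEFAULT_SIZE (192) bytes long and 64-byte aligned,
 * presumably so that it satisfies the strictest XXH_SEC_ALIGN value selected
 * above.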
*/3190XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {31910xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,31920xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,31930xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,31940xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,31950x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,31960x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,31970xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,31980x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,31990xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,32000x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,32010x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,32020x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,3203};320432053206#ifdef XXH_OLD_NAMES3207# define kSecret XXH3_kSecret3208#endif32093210#ifdef XXH_DOXYGEN3211/*!3212* @brief Calculates a 32-bit to 64-bit long multiply.3213*3214* Implemented as a macro.3215*3216* Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't3217* need to (but it shouldn't need to anyways, it is about 7 instructions to do3218* a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we3219* use that instead of the normal method.3220*3221* If you are compiling for platforms like Thumb-1 and don't have a better option,3222* you may also want to write your own long multiply routine here.3223*3224* @param x, y Numbers to be multiplied3225* @return 64-bit product of the low 32 bits of @p x and @p y.3226*/3227XXH_FORCE_INLINE xxh_u643228XXH_mult32to64(xxh_u64 x, xxh_u64 y)3229{3230return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);3231}3232#elif defined(_MSC_VER) && defined(_M_IX86)3233# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))3234#else3235/*3236* Downcast + upcast is usually better than masking on older compilers like3237* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.3238*3239* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands3240* and perform a full 64x64 multiply -- entirely redundant on 32-bit.3241*/3242# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))3243#endif32443245/*!3246* @brief Calculates a 64->128-bit long multiply.3247*3248* Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar3249* version.3250*3251* @param lhs , rhs The 64-bit integers to be multiplied3252* @return The 128-bit result represented in an @ref XXH128_hash_t.3253*/3254static XXH128_hash_t3255XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)3256{3257/*3258* GCC/Clang __uint128_t method.3259*3260* On most 64-bit targets, GCC and Clang define a __uint128_t type.3261* This is usually the best way as it usually uses a native long 64-bit3262* multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.3263*3264* Usually.3265*3266* Despite being a 32-bit platform, Clang (and emscripten) define this type3267* despite not having the arithmetic for it. 
This results in a laggy3268* compiler builtin call which calculates a full 128-bit multiply.3269* In that case it is best to use the portable one.3270* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-5155756773271*/3272#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \3273&& defined(__SIZEOF_INT128__) \3274|| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)32753276__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;3277XXH128_hash_t r128;3278r128.low64 = (xxh_u64)(product);3279r128.high64 = (xxh_u64)(product >> 64);3280return r128;32813282/*3283* MSVC for x64's _umul128 method.3284*3285* xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);3286*3287* This compiles to single operand MUL on x64.3288*/3289#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)32903291#ifndef _MSC_VER3292# pragma intrinsic(_umul128)3293#endif3294xxh_u64 product_high;3295xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);3296XXH128_hash_t r128;3297r128.low64 = product_low;3298r128.high64 = product_high;3299return r128;33003301/*3302* MSVC for ARM64's __umulh method.3303*3304* This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.3305*/3306#elif defined(_M_ARM64) || defined(_M_ARM64EC)33073308#ifndef _MSC_VER3309# pragma intrinsic(__umulh)3310#endif3311XXH128_hash_t r128;3312r128.low64 = lhs * rhs;3313r128.high64 = __umulh(lhs, rhs);3314return r128;33153316#else3317/*3318* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.3319*3320* This is a fast and simple grade school multiply, which is shown below3321* with base 10 arithmetic instead of base 0x100000000.3322*3323* 9 3 // D2 lhs = 933324* x 7 5 // D2 rhs = 753325* ----------3326* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 153327* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 453328* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 213329* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 633330* ---------3331* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 273332* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 673333* ---------3334* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 69753335*3336* The reasons for adding the products like this are:3337* 1. It avoids manual carry tracking. Just like how3338* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.3339* This avoids a lot of complexity.3340*3341* 2. It hints for, and on Clang, compiles to, the powerful UMAAL3342* instruction available in ARM's Digital Signal Processing extension3343* in 32-bit ARMv6 and later, which is shown below:3344*3345* void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)3346* {3347* xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;3348* *RdLo = (xxh_u32)(product & 0xFFFFFFFF);3349* *RdHi = (xxh_u32)(product >> 32);3350* }3351*3352* This instruction was designed for efficient long multiplication, and3353* allows this to be calculated in only 4 instructions at speeds3354* comparable to some 64-bit ALUs.3355*3356* 3. It isn't terrible on other platforms. Usually this will be a couple3357* of 32-bit ADD/ADCs.3358*/33593360/* First calculate all of the cross products. */3361xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);3362xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);3363xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);3364xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);33653366/* Now add the products together. These will never overflow. 
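*
* An informal bound, to justify this: each 32-bit half is at most 0xFFFFFFFF, so
*   lo_lo >> 32         <= 0xFFFFFFFE
*   hi_lo & 0xFFFFFFFF  <= 0xFFFFFFFF
*   lo_hi               <= 0xFFFFFFFE00000001  (0xFFFFFFFF x 0xFFFFFFFF)
* which sums to at most 0xFFFFFFFFFFFFFFFE for cross, still within 64 bits;
* the same style of bound applies to upper.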
*/3367xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;3368xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;3369xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);33703371XXH128_hash_t r128;3372r128.low64 = lower;3373r128.high64 = upper;3374return r128;3375#endif3376}33773378/*!3379* @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.3380*3381* The reason for the separate function is to prevent passing too many structs3382* around by value. This will hopefully inline the multiply, but we don't force it.3383*3384* @param lhs , rhs The 64-bit integers to multiply3385* @return The low 64 bits of the product XOR'd by the high 64 bits.3386* @see XXH_mult64to128()3387*/3388static xxh_u643389XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)3390{3391XXH128_hash_t product = XXH_mult64to128(lhs, rhs);3392return product.low64 ^ product.high64;3393}33943395/*! Seems to produce slightly better code on GCC for some reason. */3396XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)3397{3398XXH_ASSERT(0 <= shift && shift < 64);3399return v64 ^ (v64 >> shift);3400}34013402/*3403* This is a fast avalanche stage,3404* suitable when input bits are already partially mixed3405*/3406static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)3407{3408h64 = XXH_xorshift64(h64, 37);3409h64 *= 0x165667919E3779F9ULL;3410h64 = XXH_xorshift64(h64, 32);3411return h64;3412}34133414/*3415* This is a stronger avalanche,3416* inspired by Pelle Evensen's rrmxmx3417* preferable when input has not been previously mixed3418*/3419static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)3420{3421/* this mix is inspired by Pelle Evensen's rrmxmx */3422h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);3423h64 *= 0x9FB21C651E98DF25ULL;3424h64 ^= (h64 >> 35) + len ;3425h64 *= 0x9FB21C651E98DF25ULL;3426return XXH_xorshift64(h64, 28);3427}342834293430/* ==========================================3431* Short keys3432* ==========================================3433* One of the shortcomings of XXH32 and XXH64 was that their performance was3434* sub-optimal on short lengths. It used an iterative algorithm which strongly3435* favored lengths that were a multiple of 4 or 8.3436*3437* Instead of iterating over individual inputs, we use a set of single shot3438* functions which piece together a range of lengths and operate in constant time.3439*3440* Additionally, the number of multiplies has been significantly reduced. This3441* reduces latency, especially when emulating 64-bit multiplies on 32-bit.3442*3443* Depending on the platform, this may or may not be faster than XXH32, but it3444* is almost guaranteed to be faster than XXH64.3445*/34463447/*3448* At very short lengths, there isn't enough input to fully hide secrets, or use3449* the entire secret.3450*3451* There is also only a limited amount of mixing we can do before significantly3452* impacting performance.3453*3454* Therefore, we use different sections of the secret and always mix two secret3455* samples with an XOR. 
This should have no effect on performance on the3456* seedless or withSeed variants because everything _should_ be constant folded3457* by modern compilers.3458*3459* The XOR mixing hides individual parts of the secret and increases entropy.3460*3461* This adds an extra layer of strength for custom secrets.3462*/3463XXH_FORCE_INLINE XXH64_hash_t3464XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3465{3466XXH_ASSERT(input != NULL);3467XXH_ASSERT(1 <= len && len <= 3);3468XXH_ASSERT(secret != NULL);3469/*3470* len = 1: combined = { input[0], 0x01, input[0], input[0] }3471* len = 2: combined = { input[1], 0x02, input[0], input[1] }3472* len = 3: combined = { input[2], 0x03, input[0], input[1] }3473*/3474{ xxh_u8 const c1 = input[0];3475xxh_u8 const c2 = input[len >> 1];3476xxh_u8 const c3 = input[len - 1];3477xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)3478| ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);3479xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;3480xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;3481return XXH64_avalanche(keyed);3482}3483}34843485XXH_FORCE_INLINE XXH64_hash_t3486XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3487{3488XXH_ASSERT(input != NULL);3489XXH_ASSERT(secret != NULL);3490XXH_ASSERT(4 <= len && len <= 8);3491seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;3492{ xxh_u32 const input1 = XXH_readLE32(input);3493xxh_u32 const input2 = XXH_readLE32(input + len - 4);3494xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;3495xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);3496xxh_u64 const keyed = input64 ^ bitflip;3497return XXH3_rrmxmx(keyed, len);3498}3499}35003501XXH_FORCE_INLINE XXH64_hash_t3502XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3503{3504XXH_ASSERT(input != NULL);3505XXH_ASSERT(secret != NULL);3506XXH_ASSERT(9 <= len && len <= 16);3507{ xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;3508xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;3509xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;3510xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;3511xxh_u64 const acc = len3512+ XXH_swap64(input_lo) + input_hi3513+ XXH3_mul128_fold64(input_lo, input_hi);3514return XXH3_avalanche(acc);3515}3516}35173518XXH_FORCE_INLINE XXH64_hash_t3519XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3520{3521XXH_ASSERT(len <= 16);3522{ if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);3523if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);3524if (len) return XXH3_len_1to3_64b(input, len, secret, seed);3525return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));3526}3527}35283529/*3530* DISCLAIMER: There are known *seed-dependent* multicollisions here due to3531* multiplication by zero, affecting hashes of lengths 17 to 240.3532*3533* However, they are very unlikely.3534*3535* Keep this in mind when using the unseeded XXH3_64bits() variant: As with all3536* unseeded non-cryptographic hashes, it does not attempt to defend itself3537* against specially crafted inputs, only random inputs.3538*3539* Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes3540* cancelling out the secret is taken an arbitrary number of times 
 (addressed
 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
 * and/or proper seeding:
 *
 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
 * function that is only called up to 16 times per hash with up to 240 bytes of
 * input.
 *
 * This is not too bad for a non-cryptographic hash function, especially with
 * only 64 bit outputs.
 *
 * The 128-bit variant (which trades some speed for strength) is NOT affected
 * by this, although it is always a good idea to use a proper seed if you care
 * about strength.
 */
XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
    /*
     * UGLY HACK:
     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
     * slower code.
     *
     * By forcing seed64 into a register, we disrupt the cost model and
     * cause it to scalarize. See `XXH32_round()`
     *
     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
     * GCC 9.2, despite both emitting scalar code.
     *
     * GCC generates much better scalar code than Clang for the rest of XXH3,
     * which is why finding a more optimal codepath is of interest.
     */
    XXH_COMPILER_GUARD(seed64);
#endif
    {   xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64 const input_hi = XXH_readLE64(input+8);
        return XXH3_mul128_fold64(
            input_lo ^ (XXH_readLE64(secret)   + seed64),
            input_hi ^ (XXH_readLE64(secret+8) - seed64)
        );
    }
}

/* For mid range keys, XXH3 uses a Mum-hash variant.
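*
* Each 16-byte chunk is combined with a 16-byte slice of the secret through
* XXH3_mix16B() above: the two 64-bit halves are keyed, multiplied into a
* 128-bit product, then folded back to 64 bits (low ^ high).
*
* Chunks are taken symmetrically from the front and the back of the input,
* against consecutive secret offsets. As an illustration derived from the
* branches below, a 40-byte input performs four such mixes:
*
*     mix16B(input+16,     secret+32)
*     mix16B(input+len-32, secret+48)      i.e. input+8
*     mix16B(input+0,      secret+0)
*     mix16B(input+len-16, secret+16)      i.e. input+24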
*/3588XXH_FORCE_INLINE XXH64_hash_t3589XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,3590const xxh_u8* XXH_RESTRICT secret, size_t secretSize,3591XXH64_hash_t seed)3592{3593XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;3594XXH_ASSERT(16 < len && len <= 128);35953596{ xxh_u64 acc = len * XXH_PRIME64_1;3597if (len > 32) {3598if (len > 64) {3599if (len > 96) {3600acc += XXH3_mix16B(input+48, secret+96, seed);3601acc += XXH3_mix16B(input+len-64, secret+112, seed);3602}3603acc += XXH3_mix16B(input+32, secret+64, seed);3604acc += XXH3_mix16B(input+len-48, secret+80, seed);3605}3606acc += XXH3_mix16B(input+16, secret+32, seed);3607acc += XXH3_mix16B(input+len-32, secret+48, seed);3608}3609acc += XXH3_mix16B(input+0, secret+0, seed);3610acc += XXH3_mix16B(input+len-16, secret+16, seed);36113612return XXH3_avalanche(acc);3613}3614}36153616#define XXH3_MIDSIZE_MAX 24036173618XXH_NO_INLINE XXH64_hash_t3619XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,3620const xxh_u8* XXH_RESTRICT secret, size_t secretSize,3621XXH64_hash_t seed)3622{3623XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;3624XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);36253626#define XXH3_MIDSIZE_STARTOFFSET 33627#define XXH3_MIDSIZE_LASTOFFSET 1736283629{ xxh_u64 acc = len * XXH_PRIME64_1;3630int const nbRounds = (int)len / 16;3631int i;3632for (i=0; i<8; i++) {3633acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);3634}3635acc = XXH3_avalanche(acc);3636XXH_ASSERT(nbRounds >= 8);3637#if defined(__clang__) /* Clang */ \3638&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \3639&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */3640/*3641* UGLY HACK:3642* Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.3643* In everywhere else, it uses scalar code.3644*3645* For 64->128-bit multiplies, even if the NEON was 100% optimal, it3646* would still be slower than UMAAL (see XXH_mult64to128).3647*3648* Unfortunately, Clang doesn't handle the long multiplies properly and3649* converts them to the nonexistent "vmulq_u64" intrinsic, which is then3650* scalarized into an ugly mess of VMOV.32 instructions.3651*3652* This mess is difficult to avoid without turning autovectorization3653* off completely, but they are usually relatively minor and/or not3654* worth it to fix.3655*3656* This loop is the easiest to fix, as unlike XXH32, this pragma3657* _actually works_ because it is a loop vectorization instead of an3658* SLP vectorization.3659*/3660#pragma clang loop vectorize(disable)3661#endif3662for (i=8 ; i < nbRounds; i++) {3663acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);3664}3665/* last bytes */3666acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);3667return XXH3_avalanche(acc);3668}3669}367036713672/* ======= Long Keys ======= */36733674#define XXH_STRIPE_LEN 643675#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */3676#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))36773678#ifdef XXH_OLD_NAMES3679# define STRIPE_LEN XXH_STRIPE_LEN3680# define ACC_NB XXH_ACC_NB3681#endif36823683XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)3684{3685if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);3686XXH_memcpy(dst, &v64, sizeof(v64));3687}36883689/* Several intrinsic functions below are supposed to accept __int64 as argument,3690* as documented in 
https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .3691* However, several environments do not define __int64 type,3692* requiring a workaround.3693*/3694#if !defined (__VMS) \3695&& (defined (__cplusplus) \3696|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )3697typedef int64_t xxh_i64;3698#else3699/* the following type must have a width of 64-bit */3700typedef long long xxh_i64;3701#endif370237033704/*3705* XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.3706*3707* It is a hardened version of UMAC, based off of FARSH's implementation.3708*3709* This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD3710* implementations, and it is ridiculously fast.3711*3712* We harden it by mixing the original input to the accumulators as well as the product.3713*3714* This means that in the (relatively likely) case of a multiply by zero, the3715* original input is preserved.3716*3717* On 128-bit inputs, we swap 64-bit pairs when we add the input to improve3718* cross-pollination, as otherwise the upper and lower halves would be3719* essentially independent.3720*3721* This doesn't matter on 64-bit hashes since they all get merged together in3722* the end, so we skip the extra step.3723*3724* Both XXH3_64bits and XXH3_128bits use this subroutine.3725*/37263727#if (XXH_VECTOR == XXH_AVX512) \3728|| (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)37293730#ifndef XXH_TARGET_AVX5123731# define XXH_TARGET_AVX512 /* disable attribute target */3732#endif37333734XXH_FORCE_INLINE XXH_TARGET_AVX512 void3735XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,3736const void* XXH_RESTRICT input,3737const void* XXH_RESTRICT secret)3738{3739__m512i* const xacc = (__m512i *) acc;3740XXH_ASSERT((((size_t)acc) & 63) == 0);3741XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));37423743{3744/* data_vec = input[0]; */3745__m512i const data_vec = _mm512_loadu_si512 (input);3746/* key_vec = secret[0]; */3747__m512i const key_vec = _mm512_loadu_si512 (secret);3748/* data_key = data_vec ^ key_vec; */3749__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);3750/* data_key_lo = data_key >> 32; */3751__m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));3752/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3753__m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);3754/* xacc[0] += swap(data_vec); */3755__m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));3756__m512i const sum = _mm512_add_epi64(*xacc, data_swap);3757/* xacc[0] += product; */3758*xacc = _mm512_add_epi64(product, sum);3759}3760}37613762/*3763* XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.3764*3765* Multiplication isn't perfect, as explained by Google in HighwayHash:3766*3767* // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to3768* // varying degrees. 
In descending order of goodness, bytes3769* // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.3770* // As expected, the upper and lower bytes are much worse.3771*3772* Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L2913773*3774* Since our algorithm uses a pseudorandom secret to add some variance into the3775* mix, we don't need to (or want to) mix as often or as much as HighwayHash does.3776*3777* This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid3778* extraction.3779*3780* Both XXH3_64bits and XXH3_128bits use this subroutine.3781*/37823783XXH_FORCE_INLINE XXH_TARGET_AVX512 void3784XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3785{3786XXH_ASSERT((((size_t)acc) & 63) == 0);3787XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));3788{ __m512i* const xacc = (__m512i*) acc;3789const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);37903791/* xacc[0] ^= (xacc[0] >> 47) */3792__m512i const acc_vec = *xacc;3793__m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);3794__m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);3795/* xacc[0] ^= secret; */3796__m512i const key_vec = _mm512_loadu_si512 (secret);3797__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);37983799/* xacc[0] *= XXH_PRIME32_1; */3800__m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));3801__m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);3802__m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);3803*xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));3804}3805}38063807XXH_FORCE_INLINE XXH_TARGET_AVX512 void3808XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3809{3810XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);3811XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);3812XXH_ASSERT(((size_t)customSecret & 63) == 0);3813(void)(&XXH_writeLE64);3814{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);3815__m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));38163817const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);3818__m512i* const dest = ( __m512i*) customSecret;3819int i;3820XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */3821XXH_ASSERT(((size_t)dest & 63) == 0);3822for (i=0; i < nbRounds; ++i) {3823/* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',3824* this will warn "discards 'const' qualifier". */3825union {3826const __m512i* cp;3827void* p;3828} remote_const_void;3829remote_const_void.cp = src + i;3830dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);3831} }3832}38333834#endif38353836#if (XXH_VECTOR == XXH_AVX2) \3837|| (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)38383839#ifndef XXH_TARGET_AVX23840# define XXH_TARGET_AVX2 /* disable attribute target */3841#endif38423843XXH_FORCE_INLINE XXH_TARGET_AVX2 void3844XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,3845const void* XXH_RESTRICT input,3846const void* XXH_RESTRICT secret)3847{3848XXH_ASSERT((((size_t)acc) & 31) == 0);3849{ __m256i* const xacc = (__m256i *) acc;3850/* Unaligned. This is mainly for pointer arithmetic, and because3851* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3852const __m256i* const xinput = (const __m256i *) input;3853/* Unaligned. 
This is mainly for pointer arithmetic, and because3854* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3855const __m256i* const xsecret = (const __m256i *) secret;38563857size_t i;3858for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {3859/* data_vec = xinput[i]; */3860__m256i const data_vec = _mm256_loadu_si256 (xinput+i);3861/* key_vec = xsecret[i]; */3862__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);3863/* data_key = data_vec ^ key_vec; */3864__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);3865/* data_key_lo = data_key >> 32; */3866__m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3867/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3868__m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);3869/* xacc[i] += swap(data_vec); */3870__m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));3871__m256i const sum = _mm256_add_epi64(xacc[i], data_swap);3872/* xacc[i] += product; */3873xacc[i] = _mm256_add_epi64(product, sum);3874} }3875}38763877XXH_FORCE_INLINE XXH_TARGET_AVX2 void3878XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3879{3880XXH_ASSERT((((size_t)acc) & 31) == 0);3881{ __m256i* const xacc = (__m256i*) acc;3882/* Unaligned. This is mainly for pointer arithmetic, and because3883* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3884const __m256i* const xsecret = (const __m256i *) secret;3885const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);38863887size_t i;3888for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {3889/* xacc[i] ^= (xacc[i] >> 47) */3890__m256i const acc_vec = xacc[i];3891__m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);3892__m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);3893/* xacc[i] ^= xsecret; */3894__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);3895__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);38963897/* xacc[i] *= XXH_PRIME32_1; */3898__m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3899__m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);3900__m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);3901xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));3902}3903}3904}39053906XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3907{3908XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);3909XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);3910XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);3911(void)(&XXH_writeLE64);3912XXH_PREFETCH(customSecret);3913{ __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);39143915const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret);3916__m256i* dest = ( __m256i*) customSecret;39173918# if defined(__GNUC__) || defined(__clang__)3919/*3920* On GCC & Clang, marking 'dest' as modified will cause the compiler:3921* - do not extract the secret from sse registers in the internal loop3922* - use less common registers, and avoid pushing these reg into stack3923*/3924XXH_COMPILER_GUARD(dest);3925# endif3926XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */3927XXH_ASSERT(((size_t)dest & 31) == 0);39283929/* GCC -O2 need unroll loop manually */3930dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);3931dest[1] = 
_mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);3932dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);3933dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);3934dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);3935dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);3936}3937}39383939#endif39403941/* x86dispatch always generates SSE2 */3942#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)39433944#ifndef XXH_TARGET_SSE23945# define XXH_TARGET_SSE2 /* disable attribute target */3946#endif39473948XXH_FORCE_INLINE XXH_TARGET_SSE2 void3949XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,3950const void* XXH_RESTRICT input,3951const void* XXH_RESTRICT secret)3952{3953/* SSE2 is just a half-scale version of the AVX2 version. */3954XXH_ASSERT((((size_t)acc) & 15) == 0);3955{ __m128i* const xacc = (__m128i *) acc;3956/* Unaligned. This is mainly for pointer arithmetic, and because3957* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */3958const __m128i* const xinput = (const __m128i *) input;3959/* Unaligned. This is mainly for pointer arithmetic, and because3960* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */3961const __m128i* const xsecret = (const __m128i *) secret;39623963size_t i;3964for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {3965/* data_vec = xinput[i]; */3966__m128i const data_vec = _mm_loadu_si128 (xinput+i);3967/* key_vec = xsecret[i]; */3968__m128i const key_vec = _mm_loadu_si128 (xsecret+i);3969/* data_key = data_vec ^ key_vec; */3970__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);3971/* data_key_lo = data_key >> 32; */3972__m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3973/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3974__m128i const product = _mm_mul_epu32 (data_key, data_key_lo);3975/* xacc[i] += swap(data_vec); */3976__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));3977__m128i const sum = _mm_add_epi64(xacc[i], data_swap);3978/* xacc[i] += product; */3979xacc[i] = _mm_add_epi64(product, sum);3980} }3981}39823983XXH_FORCE_INLINE XXH_TARGET_SSE2 void3984XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3985{3986XXH_ASSERT((((size_t)acc) & 15) == 0);3987{ __m128i* const xacc = (__m128i*) acc;3988/* Unaligned. This is mainly for pointer arithmetic, and because3989* _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/3990const __m128i* const xsecret = (const __m128i *) secret;3991const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);39923993size_t i;3994for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {3995/* xacc[i] ^= (xacc[i] >> 47) */3996__m128i const acc_vec = xacc[i];3997__m128i const shifted = _mm_srli_epi64 (acc_vec, 47);3998__m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);3999/* xacc[i] ^= xsecret[i]; */4000__m128i const key_vec = _mm_loadu_si128 (xsecret+i);4001__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);40024003/* xacc[i] *= XXH_PRIME32_1; */4004__m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));4005__m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);4006__m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);4007xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));4008}4009}4010}40114012XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)4013{4014XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);4015(void)(&XXH_writeLE64);4016{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);40174018# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 19004019/* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */4020XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };4021__m128i const seed = _mm_load_si128((__m128i const*)seed64x2);4022# else4023__m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);4024# endif4025int i;40264027const void* const src16 = XXH3_kSecret;4028__m128i* dst16 = (__m128i*) customSecret;4029# if defined(__GNUC__) || defined(__clang__)4030/*4031* On GCC & Clang, marking 'dest' as modified will cause the compiler:4032* - do not extract the secret from sse registers in the internal loop4033* - use less common registers, and avoid pushing these reg into stack4034*/4035XXH_COMPILER_GUARD(dst16);4036# endif4037XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */4038XXH_ASSERT(((size_t)dst16 & 15) == 0);40394040for (i=0; i < nbRounds; ++i) {4041dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);4042} }4043}40444045#endif40464047#if (XXH_VECTOR == XXH_NEON)40484049/* forward declarations for the scalar routines */4050XXH_FORCE_INLINE void4051XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,4052void const* XXH_RESTRICT secret, size_t lane);40534054XXH_FORCE_INLINE void4055XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,4056void const* XXH_RESTRICT secret, size_t lane);40574058/*!4059* @internal4060* @brief The bulk processing loop for NEON.4061*4062* The NEON code path is actually partially scalar when running on AArch64. This4063* is to optimize the pipelining and can have up to 15% speedup depending on the4064* CPU, and it also mitigates some GCC codegen issues.4065*4066* @see XXH3_NEON_LANES for configuring this and details about this optimization.4067*/4068XXH_FORCE_INLINE void4069XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,4070const void* XXH_RESTRICT input,4071const void* XXH_RESTRICT secret)4072{4073XXH_ASSERT((((size_t)acc) & 15) == 0);4074XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);4075{4076uint64x2_t* const xacc = (uint64x2_t *) acc;4077/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
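* (Most likely because dereferencing through a uint32x4_t pointer lets the
* compiler assume 16-byte alignment, which the input does not guarantee;
* byte pointers plus vld1q_u8() keep these loads unaligned-safe.)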
*/4078uint8_t const* const xinput = (const uint8_t *) input;4079uint8_t const* const xsecret = (const uint8_t *) secret;40804081size_t i;4082/* NEON for the first few lanes (these loops are normally interleaved) */4083for (i=0; i < XXH3_NEON_LANES / 2; i++) {4084/* data_vec = xinput[i]; */4085uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));4086/* key_vec = xsecret[i]; */4087uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));4088uint64x2_t data_key;4089uint32x2_t data_key_lo, data_key_hi;4090/* xacc[i] += swap(data_vec); */4091uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);4092uint64x2_t const swapped = vextq_u64(data64, data64, 1);4093xacc[i] = vaddq_u64 (xacc[i], swapped);4094/* data_key = data_vec ^ key_vec; */4095data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));4096/* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);4097* data_key_hi = (uint32x2_t) (data_key >> 32);4098* data_key = UNDEFINED; */4099XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);4100/* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */4101xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);41024103}4104/* Scalar for the remainder. This may be a zero iteration loop. */4105for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {4106XXH3_scalarRound(acc, input, secret, i);4107}4108}4109}41104111XXH_FORCE_INLINE void4112XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4113{4114XXH_ASSERT((((size_t)acc) & 15) == 0);41154116{ uint64x2_t* xacc = (uint64x2_t*) acc;4117uint8_t const* xsecret = (uint8_t const*) secret;4118uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);41194120size_t i;4121/* NEON for the first few lanes (these loops are normally interleaved) */4122for (i=0; i < XXH3_NEON_LANES / 2; i++) {4123/* xacc[i] ^= (xacc[i] >> 47); */4124uint64x2_t acc_vec = xacc[i];4125uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);4126uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);41274128/* xacc[i] ^= xsecret[i]; */4129uint8x16_t key_vec = vld1q_u8 (xsecret + (i * 16));4130uint64x2_t data_key = veorq_u64 (data_vec, vreinterpretq_u64_u8(key_vec));41314132/* xacc[i] *= XXH_PRIME32_1 */4133uint32x2_t data_key_lo, data_key_hi;4134/* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);4135* data_key_hi = (uint32x2_t) (xacc[i] >> 32);4136* xacc[i] = UNDEFINED; */4137XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);4138{ /*4139* prod_hi = (data_key >> 32) * XXH_PRIME32_1;4140*4141* Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will4142* incorrectly "optimize" this:4143* tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));4144* shifted = vshll_n_u32(tmp, 32);4145* to this:4146* tmp = "vmulq_u64"(a, b); // no such thing!4147* shifted = vshlq_n_u64(tmp, 32);4148*4149* However, unlike SSE, Clang lacks a 64-bit multiply routine4150* for NEON, and it scalarizes two 64-bit multiplies instead.4151*4152* vmull_u32 has the same timing as vmul_u32, and it avoids4153* this bug completely.4154* See https://bugs.llvm.org/show_bug.cgi?id=399674155*/4156uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);4157/* xacc[i] = prod_hi << 32; */4158xacc[i] = vshlq_n_u64(prod_hi, 32);4159/* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */4160xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);4161}4162}4163/* Scalar for the remainder. This may be a zero iteration loop. 
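* For instance, with XXH3_NEON_LANES set to 6, the NEON loop above covers
* lanes 0..5 (two 64-bit lanes per iteration) and this loop handles lanes 6
* and 7; with XXH3_NEON_LANES equal to XXH_ACC_NB (8), it performs zero
* iterations.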
*/4164for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {4165XXH3_scalarScrambleRound(acc, secret, i);4166}4167}4168}41694170#endif41714172#if (XXH_VECTOR == XXH_VSX)41734174XXH_FORCE_INLINE void4175XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,4176const void* XXH_RESTRICT input,4177const void* XXH_RESTRICT secret)4178{4179/* presumed aligned */4180unsigned int* const xacc = (unsigned int*) acc;4181xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */4182xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */4183xxh_u64x2 const v32 = { 32, 32 };4184size_t i;4185for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {4186/* data_vec = xinput[i]; */4187xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);4188/* key_vec = xsecret[i]; */4189xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);4190xxh_u64x2 const data_key = data_vec ^ key_vec;4191/* shuffled = (data_key << 32) | (data_key >> 32); */4192xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);4193/* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */4194xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);4195/* acc_vec = xacc[i]; */4196xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);4197acc_vec += product;41984199/* swap high and low halves */4200#ifdef __s390x__4201acc_vec += vec_permi(data_vec, data_vec, 2);4202#else4203acc_vec += vec_xxpermdi(data_vec, data_vec, 2);4204#endif4205/* xacc[i] = acc_vec; */4206vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);4207}4208}42094210XXH_FORCE_INLINE void4211XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4212{4213XXH_ASSERT((((size_t)acc) & 15) == 0);42144215{ xxh_u64x2* const xacc = (xxh_u64x2*) acc;4216const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;4217/* constants */4218xxh_u64x2 const v32 = { 32, 32 };4219xxh_u64x2 const v47 = { 47, 47 };4220xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };4221size_t i;4222for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {4223/* xacc[i] ^= (xacc[i] >> 47); */4224xxh_u64x2 const acc_vec = xacc[i];4225xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);42264227/* xacc[i] ^= xsecret[i]; */4228xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);4229xxh_u64x2 const data_key = data_vec ^ key_vec;42304231/* xacc[i] *= XXH_PRIME32_1 */4232/* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */4233xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);4234/* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */4235xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);4236xacc[i] = prod_odd + (prod_even << v32);4237} }4238}42394240#endif42414242/* scalar variants - universal */42434244/*!4245* @internal4246* @brief Scalar round for @ref XXH3_accumulate_512_scalar().4247*4248* This is extracted to its own function because the NEON path uses a combination4249* of NEON and scalar.4250*/4251XXH_FORCE_INLINE void4252XXH3_scalarRound(void* XXH_RESTRICT acc,4253void const* XXH_RESTRICT input,4254void const* XXH_RESTRICT secret,4255size_t lane)4256{4257xxh_u64* xacc = (xxh_u64*) acc;4258xxh_u8 const* xinput = (xxh_u8 const*) input;4259xxh_u8 const* xsecret = (xxh_u8 const*) secret;4260XXH_ASSERT(lane < XXH_ACC_NB);4261XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);4262{4263xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);4264xxh_u64 const data_key = data_val ^ 
XXH_readLE64(xsecret + lane * 8);4265xacc[lane ^ 1] += data_val; /* swap adjacent lanes */4266xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);4267}4268}42694270/*!4271* @internal4272* @brief Processes a 64 byte block of data using the scalar path.4273*/4274XXH_FORCE_INLINE void4275XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,4276const void* XXH_RESTRICT input,4277const void* XXH_RESTRICT secret)4278{4279size_t i;4280for (i=0; i < XXH_ACC_NB; i++) {4281XXH3_scalarRound(acc, input, secret, i);4282}4283}42844285/*!4286* @internal4287* @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().4288*4289* This is extracted to its own function because the NEON path uses a combination4290* of NEON and scalar.4291*/4292XXH_FORCE_INLINE void4293XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,4294void const* XXH_RESTRICT secret,4295size_t lane)4296{4297xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */4298const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */4299XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);4300XXH_ASSERT(lane < XXH_ACC_NB);4301{4302xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);4303xxh_u64 acc64 = xacc[lane];4304acc64 = XXH_xorshift64(acc64, 47);4305acc64 ^= key64;4306acc64 *= XXH_PRIME32_1;4307xacc[lane] = acc64;4308}4309}43104311/*!4312* @internal4313* @brief Scrambles the accumulators after a large chunk has been read4314*/4315XXH_FORCE_INLINE void4316XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4317{4318size_t i;4319for (i=0; i < XXH_ACC_NB; i++) {4320XXH3_scalarScrambleRound(acc, secret, i);4321}4322}43234324XXH_FORCE_INLINE void4325XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)4326{4327/*4328* We need a separate pointer for the hack below,4329* which requires a non-const pointer.4330* Any decent compiler will optimize this out otherwise.4331*/4332const xxh_u8* kSecretPtr = XXH3_kSecret;4333XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);43344335#if defined(__clang__) && defined(__aarch64__)4336/*4337* UGLY HACK:4338* Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are4339* placed sequentially, in order, at the top of the unrolled loop.4340*4341* While MOVK is great for generating constants (2 cycles for a 64-bit4342* constant compared to 4 cycles for LDR), it fights for bandwidth with4343* the arithmetic instructions.4344*4345* I L S4346* MOVK4347* MOVK4348* MOVK4349* MOVK4350* ADD4351* SUB STR4352* STR4353* By forcing loads from memory (as the asm line causes Clang to assume4354* that XXH3_kSecretPtr has been changed), the pipelines are used more4355* efficiently:4356* I L S4357* LDR4358* ADD LDR4359* SUB STR4360* STR4361*4362* See XXH3_NEON_LANES for details on the pipsline.4363*4364* XXH3_64bits_withSeed, len == 256, Snapdragon 8354365* without hack: 2654.4 MB/s4366* with hack: 3202.9 MB/s4367*/4368XXH_COMPILER_GUARD(kSecretPtr);4369#endif4370/*4371* Note: in debug mode, this overrides the asm optimization4372* and Clang will emit MOVK chains again.4373*/4374XXH_ASSERT(kSecretPtr == XXH3_kSecret);43754376{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;4377int i;4378for (i=0; i < nbRounds; i++) {4379/*4380* The asm hack causes Clang to assume that kSecretPtr aliases with4381* customSecret, and on aarch64, this prevented LDP from merging two4382* loads together for free. 
Putting the loads together before the stores4383* properly generates LDP.4384*/4385xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;4386xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;4387XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo);4388XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);4389} }4390}439143924393typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);4394typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);4395typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);439643974398#if (XXH_VECTOR == XXH_AVX512)43994400#define XXH3_accumulate_512 XXH3_accumulate_512_avx5124401#define XXH3_scrambleAcc XXH3_scrambleAcc_avx5124402#define XXH3_initCustomSecret XXH3_initCustomSecret_avx51244034404#elif (XXH_VECTOR == XXH_AVX2)44054406#define XXH3_accumulate_512 XXH3_accumulate_512_avx24407#define XXH3_scrambleAcc XXH3_scrambleAcc_avx24408#define XXH3_initCustomSecret XXH3_initCustomSecret_avx244094410#elif (XXH_VECTOR == XXH_SSE2)44114412#define XXH3_accumulate_512 XXH3_accumulate_512_sse24413#define XXH3_scrambleAcc XXH3_scrambleAcc_sse24414#define XXH3_initCustomSecret XXH3_initCustomSecret_sse244154416#elif (XXH_VECTOR == XXH_NEON)44174418#define XXH3_accumulate_512 XXH3_accumulate_512_neon4419#define XXH3_scrambleAcc XXH3_scrambleAcc_neon4420#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar44214422#elif (XXH_VECTOR == XXH_VSX)44234424#define XXH3_accumulate_512 XXH3_accumulate_512_vsx4425#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx4426#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar44274428#else /* scalar */44294430#define XXH3_accumulate_512 XXH3_accumulate_512_scalar4431#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar4432#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar44334434#endif4435443644374438#ifndef XXH_PREFETCH_DIST4439# ifdef __clang__4440# define XXH_PREFETCH_DIST 3204441# else4442# if (XXH_VECTOR == XXH_AVX512)4443# define XXH_PREFETCH_DIST 5124444# else4445# define XXH_PREFETCH_DIST 3844446# endif4447# endif /* __clang__ */4448#endif /* XXH_PREFETCH_DIST */44494450/*4451* XXH3_accumulate()4452* Loops over XXH3_accumulate_512().4453* Assumption: nbStripes will not overflow the secret size4454*/4455XXH_FORCE_INLINE void4456XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,4457const xxh_u8* XXH_RESTRICT input,4458const xxh_u8* XXH_RESTRICT secret,4459size_t nbStripes,4460XXH3_f_accumulate_512 f_acc512)4461{4462size_t n;4463for (n = 0; n < nbStripes; n++ ) {4464const xxh_u8* const in = input + n*XXH_STRIPE_LEN;4465XXH_PREFETCH(in + XXH_PREFETCH_DIST);4466f_acc512(acc,4467in,4468secret + n*XXH_SECRET_CONSUME_RATE);4469}4470}44714472XXH_FORCE_INLINE void4473XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,4474const xxh_u8* XXH_RESTRICT input, size_t len,4475const xxh_u8* XXH_RESTRICT secret, size_t secretSize,4476XXH3_f_accumulate_512 f_acc512,4477XXH3_f_scrambleAcc f_scramble)4478{4479size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;4480size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;4481size_t const nb_blocks = (len - 1) / block_len;44824483size_t n;44844485XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);44864487for (n = 0; n < nb_blocks; n++) {4488XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);4489f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);4490}44914492/* last partial block */4493XXH_ASSERT(len > XXH_STRIPE_LEN);4494{ size_t const nbStripes = ((len - 1) - (block_len * 
nb_blocks)) / XXH_STRIPE_LEN;4495XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));4496XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);44974498/* last stripe */4499{ const xxh_u8* const p = input + len - XXH_STRIPE_LEN;4500#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */4501f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);4502} }4503}45044505XXH_FORCE_INLINE xxh_u644506XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)4507{4508return XXH3_mul128_fold64(4509acc[0] ^ XXH_readLE64(secret),4510acc[1] ^ XXH_readLE64(secret+8) );4511}45124513static XXH64_hash_t4514XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)4515{4516xxh_u64 result64 = start;4517size_t i = 0;45184519for (i = 0; i < 4; i++) {4520result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);4521#if defined(__clang__) /* Clang */ \4522&& (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \4523&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \4524&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */4525/*4526* UGLY HACK:4527* Prevent autovectorization on Clang ARMv7-a. Exact same problem as4528* the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.4529* XXH3_64bits, len == 256, Snapdragon 835:4530* without hack: 2063.7 MB/s4531* with hack: 2560.7 MB/s4532*/4533XXH_COMPILER_GUARD(result64);4534#endif4535}45364537return XXH3_avalanche(result64);4538}45394540#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \4541XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }45424543XXH_FORCE_INLINE XXH64_hash_t4544XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,4545const void* XXH_RESTRICT secret, size_t secretSize,4546XXH3_f_accumulate_512 f_acc512,4547XXH3_f_scrambleAcc f_scramble)4548{4549XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;45504551XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);45524553/* converge into final hash */4554XXH_STATIC_ASSERT(sizeof(acc) == 64);4555/* do not align on 8, so that the secret is different from the accumulator */4556#define XXH_SECRET_MERGEACCS_START 114557XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);4558return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);4559}45604561/*4562* It's important for performance to transmit secret's size (when it's static)4563* so that the compiler can properly optimize the vectorized loop.4564* This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.4565*/4566XXH_FORCE_INLINE XXH64_hash_t4567XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,4568XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)4569{4570(void)seed64;4571return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);4572}45734574/*4575* It's preferable for performance that XXH3_hashLong is not inlined,4576* as it results in a smaller function for small data, easier to the instruction cache.4577* Note that inside this no_inline function, we do inline the internal loop,4578* and provide a statically defined secret size to allow optimization of vector loop.4579*/4580XXH_NO_INLINE XXH64_hash_t4581XXH3_hashLong_64b_default(const void* 
XXH_RESTRICT input, size_t len,4582XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)4583{4584(void)seed64; (void)secret; (void)secretLen;4585return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);4586}45874588/*4589* XXH3_hashLong_64b_withSeed():4590* Generate a custom key based on alteration of default XXH3_kSecret with the seed,4591* and then use this key for long mode hashing.4592*4593* This operation is decently fast but nonetheless costs a little bit of time.4594* Try to avoid it whenever possible (typically when seed==0).4595*4596* It's important for performance that XXH3_hashLong is not inlined. Not sure4597* why (uop cache maybe?), but the difference is large and easily measurable.4598*/4599XXH_FORCE_INLINE XXH64_hash_t4600XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,4601XXH64_hash_t seed,4602XXH3_f_accumulate_512 f_acc512,4603XXH3_f_scrambleAcc f_scramble,4604XXH3_f_initCustomSecret f_initSec)4605{4606if (seed == 0)4607return XXH3_hashLong_64b_internal(input, len,4608XXH3_kSecret, sizeof(XXH3_kSecret),4609f_acc512, f_scramble);4610{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];4611f_initSec(secret, seed);4612return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),4613f_acc512, f_scramble);4614}4615}46164617/*4618* It's important for performance that XXH3_hashLong is not inlined.4619*/4620XXH_NO_INLINE XXH64_hash_t4621XXH3_hashLong_64b_withSeed(const void* input, size_t len,4622XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)4623{4624(void)secret; (void)secretLen;4625return XXH3_hashLong_64b_withSeed_internal(input, len, seed,4626XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);4627}462846294630typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,4631XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);46324633XXH_FORCE_INLINE XXH64_hash_t4634XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,4635XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,4636XXH3_hashLong64_f f_hashLong)4637{4638XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);4639/*4640* If an action is to be taken if `secretLen` condition is not respected,4641* it should be done here.4642* For now, it's a contract pre-condition.4643* Adding a check and a branch here would cost performance at every hash.4644* Also, note that function signature doesn't offer room to return an error.4645*/4646if (len <= 16)4647return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);4648if (len <= 128)4649return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);4650if (len <= XXH3_MIDSIZE_MAX)4651return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);4652return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);4653}465446554656/* === Public entry point === */46574658/*! @ingroup xxh3_family */4659XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)4660{4661return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);4662}46634664/*! @ingroup xxh3_family */4665XXH_PUBLIC_API XXH64_hash_t4666XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)4667{4668return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);4669}46704671/*! 
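* Minimal usage sketch (illustrative only; `buf` and `bufSize` are
* placeholder names, and the seed value is arbitrary):
*
*     XXH64_hash_t const h = XXH3_64bits_withSeed(buf, bufSize, 0x9E3779B1U);
*
* A seed of 0 produces the same result as the unseeded XXH3_64bits().
*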
@ingroup xxh3_family */4672XXH_PUBLIC_API XXH64_hash_t4673XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)4674{4675return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);4676}46774678XXH_PUBLIC_API XXH64_hash_t4679XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)4680{4681if (len <= XXH3_MIDSIZE_MAX)4682return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);4683return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);4684}468546864687/* === XXH3 streaming === */46884689/*4690* Malloc's a pointer that is always aligned to align.4691*4692* This must be freed with `XXH_alignedFree()`.4693*4694* malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte4695* alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX24696* or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.4697*4698* This underalignment previously caused a rather obvious crash which went4699* completely unnoticed due to XXH3_createState() not actually being tested.4700* Credit to RedSpah for noticing this bug.4701*4702* The alignment is done manually: Functions like posix_memalign or _mm_malloc4703* are avoided: To maintain portability, we would have to write a fallback4704* like this anyways, and besides, testing for the existence of library4705* functions without relying on external build tools is impossible.4706*4707* The method is simple: Overallocate, manually align, and store the offset4708* to the original behind the returned pointer.4709*4710* Align must be a power of 2 and 8 <= align <= 128.4711*/4712static void* XXH_alignedMalloc(size_t s, size_t align)4713{4714XXH_ASSERT(align <= 128 && align >= 8); /* range check */4715XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */4716XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */4717{ /* Overallocate to make room for manual realignment and an offset byte */4718xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);4719if (base != NULL) {4720/*4721* Get the offset needed to align this pointer.4722*4723* Even if the returned pointer is aligned, there will always be4724* at least one byte to store the offset to the original pointer.4725*/4726size_t offset = align - ((size_t)base & (align - 1)); /* base % align */4727/* Add the offset for the now-aligned pointer */4728xxh_u8* ptr = base + offset;47294730XXH_ASSERT((size_t)ptr % align == 0);47314732/* Store the offset immediately before the returned pointer. */4733ptr[-1] = (xxh_u8)offset;4734return ptr;4735}4736return NULL;4737}4738}4739/*4740* Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass4741* normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.4742*/4743static void XXH_alignedFree(void* p)4744{4745if (p != NULL) {4746xxh_u8* ptr = (xxh_u8*)p;4747/* Get the offset byte we added in XXH_malloc. */4748xxh_u8 offset = ptr[-1];4749/* Free the original malloc'd pointer */4750xxh_u8* base = ptr - offset;4751XXH_free(base);4752}4753}4754/*! @ingroup xxh3_family */4755XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)4756{4757XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);4758if (state==NULL) return NULL;4759XXH3_INITSTATE(state);4760return state;4761}47624763/*! 
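* Typical streaming lifecycle, sketched with placeholder data (`chunk`,
* `chunkSize`); every state obtained from XXH3_createState() should
* eventually be released with XXH3_freeState():
*
*     XXH3_state_t* const state = XXH3_createState();
*     if (state != NULL) {
*         XXH3_64bits_reset(state);
*         XXH3_64bits_update(state, chunk, chunkSize);  (repeated per chunk)
*         {   XXH64_hash_t const h = XXH3_64bits_digest(state);
*             (void)h;
*         }
*         XXH3_freeState(state);
*     }
*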
@ingroup xxh3_family */4764XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)4765{4766XXH_alignedFree(statePtr);4767return XXH_OK;4768}47694770/*! @ingroup xxh3_family */4771XXH_PUBLIC_API void4772XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)4773{4774XXH_memcpy(dst_state, src_state, sizeof(*dst_state));4775}47764777static void4778XXH3_reset_internal(XXH3_state_t* statePtr,4779XXH64_hash_t seed,4780const void* secret, size_t secretSize)4781{4782size_t const initStart = offsetof(XXH3_state_t, bufferedSize);4783size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;4784XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);4785XXH_ASSERT(statePtr != NULL);4786/* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */4787memset((char*)statePtr + initStart, 0, initLength);4788statePtr->acc[0] = XXH_PRIME32_3;4789statePtr->acc[1] = XXH_PRIME64_1;4790statePtr->acc[2] = XXH_PRIME64_2;4791statePtr->acc[3] = XXH_PRIME64_3;4792statePtr->acc[4] = XXH_PRIME64_4;4793statePtr->acc[5] = XXH_PRIME32_2;4794statePtr->acc[6] = XXH_PRIME64_5;4795statePtr->acc[7] = XXH_PRIME32_1;4796statePtr->seed = seed;4797statePtr->useSeed = (seed != 0);4798statePtr->extSecret = (const unsigned char*)secret;4799XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);4800statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;4801statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;4802}48034804/*! @ingroup xxh3_family */4805XXH_PUBLIC_API XXH_errorcode4806XXH3_64bits_reset(XXH3_state_t* statePtr)4807{4808if (statePtr == NULL) return XXH_ERROR;4809XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);4810return XXH_OK;4811}48124813/*! @ingroup xxh3_family */4814XXH_PUBLIC_API XXH_errorcode4815XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)4816{4817if (statePtr == NULL) return XXH_ERROR;4818XXH3_reset_internal(statePtr, 0, secret, secretSize);4819if (secret == NULL) return XXH_ERROR;4820if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;4821return XXH_OK;4822}48234824/*! @ingroup xxh3_family */4825XXH_PUBLIC_API XXH_errorcode4826XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)4827{4828if (statePtr == NULL) return XXH_ERROR;4829if (seed==0) return XXH3_64bits_reset(statePtr);4830if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))4831XXH3_initCustomSecret(statePtr->customSecret, seed);4832XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);4833return XXH_OK;4834}48354836/*! 
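* As with the single-shot XXH3_64bits_withSecretandSeed() above, the intent
* is that inputs of at most XXH3_MIDSIZE_MAX (240) bytes are hashed with the
* seed applied to the default secret, while longer inputs use the supplied
* custom secret; `secret` and `secretSize` are nonetheless validated here in
* all cases.
*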
/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
{
    if (statePtr == NULL) return XXH_ERROR;
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
    statePtr->useSeed = 1; /* always, even if seed64==0 */
    return XXH_OK;
}

/* Note : when XXH3_consumeStripes() is invoked,
 * the caller must guarantee that at least one more byte will be consumed from input afterwards,
 * so that the function can blindly consume all stripes using the "normal" secret segment */
XXH_FORCE_INLINE void
XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
                    XXH3_f_accumulate_512 f_acc512,
                    XXH3_f_scrambleAcc f_scramble)
{
    XXH_ASSERT(nbStripes <= nbStripesPerBlock);   /* can handle max 1 scramble per invocation */
    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
        /* need a scrambling operation */
        size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
        size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
        f_scramble(acc, secret + secretLimit);
        XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
        *nbStripesSoFarPtr = nbStripesAfterBlock;
    } else {
        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
        *nbStripesSoFarPtr += nbStripes;
    }
}
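/*
 * Worked example of the stripe/block accounting above, assuming the default
 * parameters used elsewhere in this file (XXH_STRIPE_LEN = 64,
 * XXH_SECRET_CONSUME_RATE = 8, XXH_SECRET_DEFAULT_SIZE = 192):
 *   secretLimit       = 192 - 64 = 128
 *   nbStripesPerBlock = 128 / 8  = 16
 * so one "block" covers 16 * 64 = 1024 bytes of input between two scrambling
 * operations, and stripe number n within a block reads its 64 bytes of key
 * material starting at secret + n * 8.
 */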
#ifndef XXH3_STREAM_USE_STACK
# ifndef __clang__ /* clang doesn't need additional stack space */
#   define XXH3_STREAM_USE_STACK 1
# endif
#endif
/*
 * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
 */
XXH_FORCE_INLINE XXH_errorcode
XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
            const xxh_u8* XXH_RESTRICT input, size_t len,
            XXH3_f_accumulate_512 f_acc512,
            XXH3_f_scrambleAcc f_scramble)
{
    if (input==NULL) {
        XXH_ASSERT(len == 0);
        return XXH_OK;
    }

    XXH_ASSERT(state != NULL);
    {   const xxh_u8* const bEnd = input + len;
        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
        /* For some reason, gcc and MSVC seem to suffer greatly
         * when operating on the accumulators directly in the state.
         * Working in stack space seems to enable proper optimization.
         * clang, on the other hand, doesn't seem to need this trick */
        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
#else
        xxh_u64* XXH_RESTRICT const acc = state->acc;
#endif
        state->totalLen += len;
        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);

        /* small input : just fill in tmp buffer */
        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
            state->bufferedSize += (XXH32_hash_t)len;
            return XXH_OK;
        }

        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */

        /*
         * Internal buffer is partially filled (always, except at beginning)
         * Complete it, then consume it.
         */
        if (state->bufferedSize) {
            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
            input += loadSize;
            XXH3_consumeStripes(acc,
                               &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                                secret, state->secretLimit,
                                f_acc512, f_scramble);
            state->bufferedSize = 0;
        }
        XXH_ASSERT(input < bEnd);

        /* large input to consume : ingest per full block */
        if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
            XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
            /* join to current block's end */
            {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
                XXH_ASSERT(nbStripesToEnd <= nbStripes);
                XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
                f_scramble(acc, secret + state->secretLimit);
                state->nbStripesSoFar = 0;
                input += nbStripesToEnd * XXH_STRIPE_LEN;
                nbStripes -= nbStripesToEnd;
            }
            /* consume per entire blocks */
            while(nbStripes >= state->nbStripesPerBlock) {
                XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
                f_scramble(acc, secret + state->secretLimit);
                input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
                nbStripes -= state->nbStripesPerBlock;
            }
            /* consume last partial block */
            XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
            input += nbStripes * XXH_STRIPE_LEN;
            XXH_ASSERT(input < bEnd);   /* at least some bytes left */
            state->nbStripesSoFar = nbStripes;
            /* buffer predecessor of last partial stripe */
            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
            XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
        } else {
            /* content to consume <= block size */
            /* Consume input by a multiple of internal buffer size */
            if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
                const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
                do {
                    XXH3_consumeStripes(acc,
                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
                                        input, XXH3_INTERNALBUFFER_STRIPES,
                                        secret, state->secretLimit,
                                        f_acc512, f_scramble);
                    input += XXH3_INTERNALBUFFER_SIZE;
                } while (input<limit);
                /* buffer predecessor of last partial stripe */
                XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
            }
        }

        /* Some remaining input (always) : buffer it */
        XXH_ASSERT(input < bEnd);
        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
        XXH_ASSERT(state->bufferedSize == 0);
        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
        /* save stack accumulators into state */
        memcpy(state->acc, acc, sizeof(acc));
#endif
    }

    return XXH_OK;
}
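/*
 * Worked example of the buffering policy above, assuming the usual
 * XXH3_INTERNALBUFFER_SIZE of 256 bytes: three consecutive 100-byte updates
 * on a freshly reset state behave as follows.
 *   update #1 : 0 + 100 <= 256, the bytes are only buffered (bufferedSize = 100)
 *   update #2 : 100 + 100 <= 256, still only buffered (bufferedSize = 200)
 *   update #3 : 200 + 100 > 256, the first 56 bytes complete the internal
 *               buffer, which is consumed as 4 stripes of 64 bytes, and the
 *               remaining 44 bytes are buffered (bufferedSize = 44)
 */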
/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate_512, XXH3_scrambleAcc);
}
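/*
 * Illustrative sketch (not part of the library): streaming produces the same
 * result as the one-shot API, regardless of how the input is split across
 * updates. `data` and `size` stand for caller-provided input, and `state`
 * for a state freshly obtained from XXH3_createState().
 *
 *     XXH64_hash_t const ref = XXH3_64bits(data, size);
 *     XXH3_64bits_reset(state);
 *     XXH3_64bits_update(state, data, size / 2);
 *     XXH3_64bits_update(state, (const char*)data + size / 2, size - size / 2);
 *     XXH_ASSERT(XXH3_64bits_digest(state) == ref);
 */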
XXH_FORCE_INLINE void
XXH3_digest_long (XXH64_hash_t* acc,
                  const XXH3_state_t* state,
                  const unsigned char* secret)
{
    /*
     * Digest on a local copy. This way, the state remains unaltered, and it can
     * continue ingesting more input afterwards.
     */
    XXH_memcpy(acc, state->acc, sizeof(state->acc));
    if (state->bufferedSize >= XXH_STRIPE_LEN) {
        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
        size_t nbStripesSoFar = state->nbStripesSoFar;
        XXH3_consumeStripes(acc,
                           &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, nbStripes,
                            secret, state->secretLimit,
                            XXH3_accumulate_512, XXH3_scrambleAcc);
        /* last stripe */
        XXH3_accumulate_512(acc,
                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
    } else {  /* bufferedSize < XXH_STRIPE_LEN */
        xxh_u8 lastStripe[XXH_STRIPE_LEN];
        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
        XXH_ASSERT(state->bufferedSize > 0);   /* there is always some input buffered */
        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
        XXH3_accumulate_512(acc,
                            lastStripe,
                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
    }
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        return XXH3_mergeAccs(acc,
                              secret + XXH_SECRET_MERGEACCS_START,
                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
    }
    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
    if (state->useSeed)
        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                  secret, state->secretLimit + XXH_STRIPE_LEN);
}



/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
 * even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that hashing long inputs is about as fast as with the 64-bit version,
 * since the long-input path is only a slight modification of the 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* A doubled version of 1to3_64b with different constants. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    /*
     * combinedl, with bytes listed in little-endian memory order:
     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
     */
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
        XXH128_hash_t h128;
        h128.low64  = XXH64_avalanche(keyed_lo);
        h128.high64 = XXH64_avalanche(keyed_hi);
        return h128;
    }
}
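/*
 * Worked example of the packing above, for len = 1 and input[0] = 0x61:
 * c1 = c2 = c3 = 0x61, so
 *   combinedl = (0x61 << 16) | (0x61 << 24) | 0x61 | (1 << 8) = 0x61610161
 * which, viewed in little-endian memory order, is { 0x61, 0x01, 0x61, 0x61 },
 * matching the table in the comment above.
 */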
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
        xxh_u64 const keyed = input_64 ^ bitflip;

        /* Shift len to the left so the added term stays even: this keeps the
         * multiplier (XXH_PRIME64_1 + (len << 2)) odd, avoiding a multiply by an even constant. */
        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));

        m128.high64 += (m128.low64 << 1);
        m128.low64  ^= (m128.high64 >> 3);

        m128.low64   = XXH_xorshift64(m128.low64, 35);
        m128.low64  *= 0x9FB21C651E98DF25ULL;
        m128.low64   = XXH_xorshift64(m128.low64, 28);
        m128.high64  = XXH3_avalanche(m128.high64);
        return m128;
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
        xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
        /*
         * Put len in the middle of m128 to ensure that the length gets mixed to
         * both the low and high bits in the 128x64 multiply below.
         */
        m128.low64 += (xxh_u64)(len - 1) << 54;
        input_hi   ^= bitfliph;
        /*
         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
         * the high 64 bits of m128.
         *
         * The best approach to this operation is different on 32-bit and 64-bit.
         */
        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
            /*
             * 32-bit optimized version, which is more readable.
             *
             * On 32-bit, it removes an ADC and delays a dependency between the two
             * halves of m128.high64, but it generates an extra mask on 64-bit.
             */
            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
        } else {
            /*
             * 64-bit optimized (albeit more confusing) version.
             *
             * Uses some properties of addition and multiplication to remove the mask:
             *
             * Let:
             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
             *    c = XXH_PRIME32_2
             *
             *    b + (a * c)
             * Inverse Property: x + y - x == y
             *    b + (a * (1 + c - 1))
             * Distributive Property: x * (y + z) == (x * y) + (x * z)
             *    b + (a * 1) + (a * (c - 1))
             * Identity Property: x * 1 == x
             *    b + a + (a * (c - 1))
             *
             * Substitute a, b, and c:
             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             *
             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             */
            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
        }
        /* m128 ^= XXH_swap64(m128 >> 64); */
        m128.low64  ^= XXH_swap64(m128.high64);

        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
            h128.high64 += m128.high64 * XXH_PRIME64_2;

            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}
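/*
 * Quick numeric check of the identity used above, taking
 * input_hi = 0x0000000100000002 (hi = 1, lo = 2) and c = XXH_PRIME32_2:
 *   masked form : (input_hi & 0xFFFFFFFF00000000) + lo * c = 2^32 + 2*c
 *   mask-free   : input_hi + lo * (c - 1) = 2^32 + 2 + 2*c - 2 = 2^32 + 2*c
 * Both forms agree, as the derivation in the comment predicts.
 */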
/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t h128;
            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
            return h128;
    }   }
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
              const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_NO_INLINE XXH128_hash_t
XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc,
                                input  + (32 * i),
                                input  + (32 * i) + 16,
                                secret + (32 * i),
                                seed);
        }
        acc.low64 = XXH3_avalanche(acc.low64);
        acc.high64 = XXH3_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc,
                                input + (32 * i),
                                input + (32 * i) + 16,
                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
                                seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc,
                            input + len - 16,
                            input + len - 32,
                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                            0ULL - seed);

        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}
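/*
 * Worked example of the round accounting above, for len = 200:
 *   nbRounds = 200 / 32 = 6
 *   rounds 0..3 mix input[0..127]   with secret offsets 0, 32, 64, 96
 *   rounds 4..5 mix input[128..191] with secret + XXH3_MIDSIZE_STARTOFFSET + 0 and + 32
 *   the final XXH128_mix32B covers input[168..199] (the last 32 bytes), using
 *   secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16 and a negated seed.
 */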
XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate_512 f_acc512,
                            XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   XXH128_hash_t h128;
        h128.low64  = XXH3_mergeAccs(acc,
                                     secret + XXH_SECRET_MERGEACCS_START,
                                     (xxh_u64)len * XXH_PRIME64_1);
        h128.high64 = XXH3_mergeAccs(acc,
                                     secret + secretSize
                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                     ~((xxh_u64)len * XXH_PRIME64_2));
        return h128;
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64,
                           const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*
 * It's important for performance to pass @secretLen (when it's static)
 * to the compiler, so that it can properly optimize the vectorized loop.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64,
                              const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                     XXH64_hash_t seed64,
                                     XXH3_f_accumulate_512 f_acc512,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
{
    if (seed64 == 0)
        return XXH3_hashLong_128b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
                                           f_acc512, f_scramble);
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed64);
        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
                                           f_acc512, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void* input, size_t len,
                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void* input, size_t len,
                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     */
    if (len <= 16)
        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hl128(input, len, seed64, secret, secretLen);
}


/* === Public XXH128 API === */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
{
    return XXH3_128bits_internal(input, len, 0,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_default);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    return XXH3_128bits_internal(input, len, 0,
                                 (const xxh_u8*)secret, secretSize,
                                 XXH3_hashLong_128b_withSecret);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_internal(input, len, seed,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_withSeed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(input, len, seed);
}
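/*
 * Illustrative sketch (not part of the library): one-shot 128-bit hashing
 * combined with the comparison helpers defined further below. `bufA`, `sizeA`,
 * `bufB`, `sizeB`, `hashArray` and `nbHashes` stand for caller-provided data.
 *
 *     XXH128_hash_t const hA = XXH128(bufA, sizeA, 0);
 *     XXH128_hash_t const hB = XXH128(bufB, sizeB, 0);
 *     if (XXH128_isEqual(hA, hB)) { ... }
 *
 *     qsort(hashArray, nbHashes, sizeof(XXH128_hash_t), XXH128_cmp);
 */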

/* === XXH3 128-bit streaming === */

/*
 * All initialization and update functions are identical to the 64-bit streaming variant.
 * The only difference is the finalization routine.
 */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH3_state_t* statePtr)
{
    return XXH3_64bits_reset(statePtr);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
{
    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSeed(statePtr, seed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   XXH128_hash_t h128;
            h128.low64  = XXH3_mergeAccs(acc,
                                         secret + XXH_SECRET_MERGEACCS_START,
                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
            h128.high64 = XXH3_mergeAccs(acc,
                                         secret + state->secretLimit + XXH_STRIPE_LEN
                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                   secret, state->secretLimit + XXH_STRIPE_LEN);
}

/* 128-bit utility functions */

#include <string.h>   /* memcmp, memcpy */

/* return : 1 if equal, 0 if different */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2  */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : this expects that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}


/*====== Canonical representation ======*/
/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
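/*
 * Illustrative sketch (not part of the library): the canonical form is a
 * fixed big-endian byte sequence, suitable for storage or transmission, and
 * it round-trips exactly. `h128` stands for any previously computed
 * XXH128_hash_t value.
 *
 *     XXH128_canonical_t canonical;
 *     XXH128_canonicalFromHash(&canonical, h128);
 *     {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canonical);
 *         XXH_ASSERT(back.low64 == h128.low64 && back.high64 == h128.high64);
 *     }
 */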


/* ==========================================
 * Secret generators
 * ==========================================
 */
#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))

XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
{
    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
{
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(secretBuffer != NULL);
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
#else
    /* production mode, assertions are disabled */
    if (secretBuffer == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
#endif

    if (customSeedSize == 0) {
        customSeed = XXH3_kSecret;
        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
    }
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(customSeed != NULL);
#else
    if (customSeed == NULL) return XXH_ERROR;
#endif

    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
    {   size_t pos = 0;
        while (pos < secretSize) {
            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
            pos += toCopy;
    }   }

    {   size_t const nbSeg16 = secretSize / 16;
        size_t n;
        XXH128_canonical_t scrambler;
        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
        for (n=0; n<nbSeg16; n++) {
            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
            XXH3_combine16((char*)secretBuffer + n*16, h128);
        }
        /* last segment */
        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
    }
    return XXH_OK;
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
{
    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    XXH3_initCustomSecret(secret, seed);
    XXH_ASSERT(secretBuffer != NULL);
    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
}



/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2                      /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__)   /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__)   /* respect -O0 and -Os */
#  pragma GCC pop_options
#endif

#endif  /* XXH_NO_LONG_LONG */

#endif  /* XXH_NO_XXH3 */

/*!
 * @}
 */
#endif  /* XXH_IMPLEMENTATION */


#if defined (__cplusplus)
}
#endif