Path: sys/contrib/zstd/lib/common/xxhash.h
/*
 * xxHash - Fast Hash algorithm
 * Copyright (c) Yann Collet, Facebook, Inc.
 *
 * You can contact the author at :
 * - xxHash homepage: http://www.xxhash.com
 * - xxHash source repository : https://github.com/Cyan4973/xxHash
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */


#ifndef XXH_NO_XXH3
# define XXH_NO_XXH3
#endif

#ifndef XXH_NAMESPACE
# define XXH_NAMESPACE ZSTD_
#endif

/*!
 * @mainpage xxHash
 *
 * @file xxhash.h
 * xxHash prototypes and implementation
 */
/* TODO: update */
/* Notice extracted from xxHash homepage:

xxHash is an extremely fast hash algorithm, running at RAM speed limits.
It also successfully passes all tests from the SMHasher suite.

Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)

Name            Speed       Q.Score   Author
xxHash          5.4 GB/s     10
CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
SpookyHash      2.0 GB/s     10       Bob Jenkins
SBox            1.4 GB/s      9       Bret Mulvey
Lookup3         1.2 GB/s      9       Bob Jenkins
SuperFastHash   1.2 GB/s      1       Paul Hsieh
CityHash64      1.05 GB/s    10       Pike & Alakuijala
FNV             0.55 GB/s     5       Fowler, Noll, Vo
CRC32           0.43 GB/s     9
MD5-32          0.33 GB/s    10       Ronald L. Rivest
SHA1-32         0.28 GB/s    10

Q.Score is a measure of quality of the hash function.
It depends on successfully passing SMHasher test set.
10 is a perfect score.

Note: SMHasher's CRC32 implementation is not the fastest one.
Other speed-oriented implementations can be faster,
especially in combination with PCLMUL instruction:
https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735

A 64-bit version, named XXH64, is available since r35.
It offers much better speed, but for 64-bit applications only.
Name     Speed on 64 bits    Speed on 32 bits
XXH64       13.8 GB/s            1.9 GB/s
XXH32        6.8 GB/s            6.0 GB/s
*/

#if defined (__cplusplus)
extern "C" {
#endif

/* ****************************
 *  INLINE mode
 ******************************/
/*!
 * XXH_INLINE_ALL (and XXH_PRIVATE_API)
 * Use these build macros to inline xxhash into the target unit.
 * Inlining improves performance on small inputs, especially when the length is
 * expressed as a compile-time constant:
 *
 *    https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
 *
 * It also keeps xxHash symbols private to the unit, so they are not exported.
 *
 * Usage:
 *     #define XXH_INLINE_ALL
 *     #include "xxhash.h"
 *
 * Do not compile and link xxhash.o as a separate object, as it is not useful.
 */
#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
    && !defined(XXH_INLINE_ALL_31684351384)
   /* this section should be traversed only once */
#  define XXH_INLINE_ALL_31684351384
   /* give access to the advanced API, required to compile implementations */
#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
#  define XXH_STATIC_LINKING_ONLY
   /* make all functions private */
#  undef XXH_PUBLIC_API
#  if defined(__GNUC__)
#    define XXH_PUBLIC_API static __inline __attribute__((unused))
#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#    define XXH_PUBLIC_API static inline
#  elif defined(_MSC_VER)
#    define XXH_PUBLIC_API static __inline
#  else
     /* note: this version may generate warnings for unused static functions */
#    define XXH_PUBLIC_API static
#  endif

   /*
    * This part deals with the special case where a unit wants to inline xxHash,
    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
    * such as part of some previously included *.h header file.
    * Without further action, the new include would just be ignored,
    * and functions would effectively _not_ be inlined (silent failure).
    * The following macros solve this situation by prefixing all inlined names,
    * avoiding naming collision with previous inclusions.
    */
   /* Before that, we unconditionally #undef all symbols,
    * in case they were already defined with XXH_NAMESPACE.
    * They will then be redefined for XXH_INLINE_ALL
    */
#  undef XXH_versionNumber
   /* XXH32 */
#  undef XXH32
#  undef XXH32_createState
#  undef XXH32_freeState
#  undef XXH32_reset
#  undef XXH32_update
#  undef XXH32_digest
#  undef XXH32_copyState
#  undef XXH32_canonicalFromHash
#  undef XXH32_hashFromCanonical
   /* XXH64 */
#  undef XXH64
#  undef XXH64_createState
#  undef XXH64_freeState
#  undef XXH64_reset
#  undef XXH64_update
#  undef XXH64_digest
#  undef XXH64_copyState
#  undef XXH64_canonicalFromHash
#  undef XXH64_hashFromCanonical
   /* XXH3_64bits */
#  undef XXH3_64bits
#  undef XXH3_64bits_withSecret
#  undef XXH3_64bits_withSeed
#  undef XXH3_64bits_withSecretandSeed
#  undef XXH3_createState
#  undef XXH3_freeState
#  undef XXH3_copyState
#  undef XXH3_64bits_reset
#  undef XXH3_64bits_reset_withSeed
#  undef XXH3_64bits_reset_withSecret
#  undef XXH3_64bits_update
#  undef XXH3_64bits_digest
#  undef XXH3_generateSecret
   /* XXH3_128bits */
#  undef XXH128
#  undef XXH3_128bits
#  undef XXH3_128bits_withSeed
#  undef XXH3_128bits_withSecret
#  undef XXH3_128bits_reset
#  undef XXH3_128bits_reset_withSeed
#  undef XXH3_128bits_reset_withSecret
#  undef XXH3_128bits_reset_withSecretandSeed
#  undef XXH3_128bits_update
#  undef XXH3_128bits_digest
#  undef XXH128_isEqual
#  undef XXH128_cmp
#  undef XXH128_canonicalFromHash
#  undef XXH128_hashFromCanonical
   /* Finally, free the namespace itself */
#  undef XXH_NAMESPACE

   /* employ the namespace for XXH_INLINE_ALL */
#  define XXH_NAMESPACE XXH_INLINE_
   /*
    * Some identifiers (enums, type names) are not symbols,
    * but they must nonetheless be renamed to avoid redeclaration.
    * Alternative solution: do not redeclare them.
    * However, this requires some #ifdefs, and has a more dispersed impact.
    * Meanwhile, renaming can be achieved in a single place.
    */
#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
#  define XXH_OK XXH_IPREF(XXH_OK)
#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
   /* Ensure the header is parsed again, even if it was previously included */
#  undef XXHASH_H_5627135585666179
#  undef XXHASH_H_STATIC_13879238742
#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */


/* ****************************************************************
 *  Stable API
 *****************************************************************/
#ifndef XXHASH_H_5627135585666179
#define XXHASH_H_5627135585666179 1


/*!
 * @defgroup public Public API
 * Contains details on the public xxHash functions.
 * @{
 */
/* specific declaration modes for Windows */
#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
#    ifdef XXH_EXPORT
#      define XXH_PUBLIC_API __declspec(dllexport)
#    elif XXH_IMPORT
#      define XXH_PUBLIC_API __declspec(dllimport)
#    endif
#  else
#    define XXH_PUBLIC_API   /* do nothing */
#  endif
#endif

#ifdef XXH_DOXYGEN
/*!
 * @brief Emulate a namespace by transparently prefixing all symbols.
 *
 * If you want to include _and expose_ xxHash functions from within your own
 * library, but also want to avoid symbol collisions with other libraries which
 * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
 * any public symbol from xxhash library with the value of XXH_NAMESPACE
 * (therefore, avoid empty or numeric values).
 *
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: Regular symbol names will be automatically translated
 * by this header.
 */
#  define XXH_NAMESPACE /* YOUR NAME HERE */
#  undef XXH_NAMESPACE
#endif

#ifdef XXH_NAMESPACE
#  define XXH_CAT(A,B) A##B
#  define XXH_NAME2(A,B) XXH_CAT(A,B)
#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
/* XXH32 */
#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
/* XXH64 */
#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
/* XXH3_64bits */
#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
/* XXH3_128bits */
#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
#endif


/* *************************************
*  Version
***************************************/
#define XXH_VERSION_MAJOR    0
#define XXH_VERSION_MINOR    8
#define XXH_VERSION_RELEASE  1
#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)

/*!
 * @brief Obtains the xxHash version.
 *
 * This is mostly useful when xxHash is compiled as a shared library,
 * since the returned value comes from the library, as opposed to header file.
 *
 * @return `XXH_VERSION_NUMBER` of the invoked library.
 */
XXH_PUBLIC_API unsigned XXH_versionNumber (void);


/* ****************************
*  Common basic types
******************************/
#include <stddef.h>   /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;


/*-**********************************************************************
*  32-bit hash
************************************************************************/
#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
/*!
 * @brief An unsigned 32-bit integer.
 *
 * Not necessarily defined to `uint32_t` but functionally equivalent.
 */
typedef uint32_t XXH32_hash_t;

#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   include <stdint.h>
    typedef uint32_t XXH32_hash_t;

#else
#   include <limits.h>
#   if UINT_MAX == 0xFFFFFFFFUL
      typedef unsigned int XXH32_hash_t;
#   else
#     if ULONG_MAX == 0xFFFFFFFFUL
        typedef unsigned long XXH32_hash_t;
#     else
#       error "unsupported platform: need a 32-bit type"
#     endif
#   endif
#endif

/*!
 * @}
 *
 * @defgroup xxh32_family XXH32 family
 * @ingroup public
 * Contains functions used in the classic 32-bit xxHash algorithm.
 *
 * @note
 *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
 *   Note that @ref xxh3_family provides competitive speed
 *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
 *
 * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
 * @see @ref xxh32_impl for implementation details
 * @{
 */

/*!
 * @brief Calculates the 32-bit hash of @p input using xxHash32.
 *
 * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 32-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 32-bit hash value.
 *
 * @see
 *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
 *    Direct equivalents for the other variants of xxHash.
 * @see
 *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
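
/*
 * Example (illustrative sketch, not part of the upstream documentation):
 * minimal one-shot usage. With XXH_NAMESPACE set to ZSTD_ near the top of
 * this header, the call below transparently resolves to ZSTD_XXH32().
 * The input buffer and the seed value are arbitrary.
 *
 *     #include <stdio.h>
 *     #include <string.h>
 *     #include "xxhash.h"
 *
 *     int main(void)
 *     {
 *         const char data[] = "hello world";
 *         XXH32_hash_t const seed = 0;                       // 0 is a valid, common seed
 *         XXH32_hash_t const h = XXH32(data, strlen(data), seed);
 *         printf("%08x\n", (unsigned)h);                     // print the 32-bit hash in hex
 *         return 0;
 *     }
 */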

/*!
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * The function returns an error code, with 0 meaning OK, and any other value
 * meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bits hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 *
 * Example code for incrementally hashing a file:
 * @code{.c}
 *    #include <stdio.h>
 *    #include <xxhash.h>
 *    #define BUFFER_SIZE 256
 *
 *    // Note: XXH64 and XXH3 use the same interface.
 *    XXH32_hash_t
 *    hashFile(FILE* stream)
 *    {
 *        XXH32_state_t* state;
 *        unsigned char buf[BUFFER_SIZE];
 *        size_t amt;
 *        XXH32_hash_t hash;
 *
 *        state = XXH32_createState();       // Create a state
 *        assert(state != NULL);             // Error check here
 *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
 *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
 *            XXH32_update(state, buf, amt); // Hash the file in chunks
 *        }
 *        hash = XXH32_digest(state);        // Finalize the hash
 *        XXH32_freeState(state);            // Clean up
 *        return hash;
 *    }
 * @endcode
 */

/*!
 * @typedef struct XXH32_state_s XXH32_state_t
 * @brief The opaque state struct for the XXH32 streaming API.
 *
 * @see XXH32_state_s for details.
 */
typedef struct XXH32_state_s XXH32_state_t;

/*!
 * @brief Allocates an @ref XXH32_state_t.
 *
 * Must be freed with XXH32_freeState().
 * @return An allocated XXH32_state_t on success, `NULL` on failure.
 */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
/*!
 * @brief Frees an @ref XXH32_state_t.
 *
 * Must be allocated with XXH32_createState().
 * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
 * @return XXH_OK.
 */
XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
/*!
 * @brief Copies one @ref XXH32_state_t to another.
 *
 * @param dst_state The state to copy to.
 * @param src_state The state to copy from.
 * @pre
 *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
 */
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);

/*!
 * @brief Resets an @ref XXH32_state_t to begin a new hash.
 *
 * This function resets and seeds a state. Call it before @ref XXH32_update().
 *
 * @param statePtr The state struct to reset.
 * @param seed The 32-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
 */
XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);

/*!
 * @brief Consumes a block of @p input to an @ref XXH32_state_t.
 *
 * Call this to incrementally consume blocks of data.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
 */
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);

/*!
 * @brief Returns the calculated hash value from an @ref XXH32_state_t.
 *
 * @note
 *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return The calculated xxHash32 value from that state.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);

/*******   Canonical representation   *******/

/*
 * The default return values from XXH functions are unsigned 32 and 64 bit
 * integers.
 * This is the simplest and fastest format for further post-processing.
 *
 * However, this leaves open the question of what is the order on the byte level,
 * since little and big endian conventions will store the same number differently.
 *
 * The canonical representation settles this issue by mandating big-endian
 * convention, the same convention as human-readable numbers (large digits first).
 *
 * When writing hash values to storage, sending them over a network, or printing
 * them, it's highly recommended to use the canonical representation to ensure
 * portability across a wider range of systems, present and future.
 *
 * The following functions allow transformation of hash values to and from
 * canonical format.
 */

/*!
 * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
 */
typedef struct {
    unsigned char digest[4]; /*!< Hash bytes, big endian */
} XXH32_canonical_t;

/*!
 * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
 *
 * @param dst The @ref XXH32_canonical_t pointer to be stored to.
 * @param hash The @ref XXH32_hash_t to be converted.
 *
 * @pre
 *   @p dst must not be `NULL`.
 */
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);

/*!
 * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
 *
 * @param src The @ref XXH32_canonical_t to convert.
 *
 * @pre
 *   @p src must not be `NULL`.
 *
 * @return The converted hash.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);

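
/*
 * Example (illustrative sketch): serializing a hash in canonical big-endian
 * form before writing it out, so readers on any platform reconstruct the same
 * value. The file-output scenario is an assumption, not upstream guidance.
 *
 *     void writeHash(FILE* out, XXH32_hash_t hash)
 *     {
 *         XXH32_canonical_t canonical;
 *         XXH32_canonicalFromHash(&canonical, hash);   // big-endian byte order
 *         fwrite(canonical.digest, 1, sizeof(canonical.digest), out);
 *     }
 *
 * Reading back is symmetric: fread() 4 bytes into an XXH32_canonical_t,
 * then call XXH32_hashFromCanonical() to recover the native-endian value.
 */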

#ifdef __has_attribute
# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
# define XXH_HAS_ATTRIBUTE(x) 0
#endif

/* C-language Attributes are added in C23. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
#else
# define XXH_HAS_C_ATTRIBUTE(x) 0
#endif

#if defined(__cplusplus) && defined(__has_cpp_attribute)
# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
#else
# define XXH_HAS_CPP_ATTRIBUTE(x) 0
#endif

/*
Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
introduced in CPP17 and C23.
CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
*/
#if XXH_HAS_C_ATTRIBUTE(x)
# define XXH_FALLTHROUGH [[fallthrough]]
#elif XXH_HAS_CPP_ATTRIBUTE(x)
# define XXH_FALLTHROUGH [[fallthrough]]
#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
# define XXH_FALLTHROUGH __attribute__ ((fallthrough))
#else
# define XXH_FALLTHROUGH
#endif

/*!
 * @}
 * @ingroup public
 * @{
 */

#ifndef XXH_NO_LONG_LONG
/*-**********************************************************************
*  64-bit hash
************************************************************************/
#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
/*!
 * @brief An unsigned 64-bit integer.
 *
 * Not necessarily defined to `uint64_t` but functionally equivalent.
 */
typedef uint64_t XXH64_hash_t;
#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint64_t XXH64_hash_t;
#else
#  include <limits.h>
#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
     /* LP64 ABI says uint64_t is unsigned long */
     typedef unsigned long XXH64_hash_t;
#  else
     /* the following type must have a width of 64-bit */
     typedef unsigned long long XXH64_hash_t;
#  endif
#endif

/*!
 * @}
 *
 * @defgroup xxh64_family XXH64 family
 * @ingroup public
 * @{
 * Contains functions used in the classic 64-bit xxHash algorithm.
 *
 * @note
 *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
 *   and offers true 64/128 bit hash results.
 *   It provides better speed for systems with vector processing capabilities.
 */


/*!
 * @brief Calculates the 64-bit hash of @p input using xxHash64.
 *
 * This function usually runs faster on 64-bit systems, but slower on 32-bit
 * systems (see benchmark).
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 64-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 64-bit hash.
 *
 * @see
 *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
 *    Direct equivalents for the other variants of xxHash.
 * @see
 *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
 */
/* Begin FreeBSD - This symbol is needed by dll-linked CLI zstd(1). */
__attribute__((visibility ("default")))
/* End FreeBSD */
XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);

/*******   Streaming   *******/
/*!
 * @brief The opaque state struct for the XXH64 streaming API.
 *
 * @see XXH64_state_s for details.
 */
typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);

XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);

/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);

#ifndef XXH_NO_XXH3
/*!
 * @}
 * ************************************************************************
 * @defgroup xxh3_family XXH3 family
 * @ingroup public
 * @{
 *
 * XXH3 is a more recent hash algorithm featuring:
 *  - Improved speed for both small and large inputs
 *  - True 64-bit and 128-bit outputs
 *  - SIMD acceleration
 *  - Improved 32-bit viability
 *
 * Speed analysis methodology is explained here:
 *
 *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
 *
 * Compared to XXH64, expect XXH3 to run approximately
 * ~2x faster on large inputs and >3x faster on small ones,
 * exact differences vary depending on platform.
 *
 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
 * but does not require it.
 * Any 32-bit and 64-bit targets that can run XXH32 smoothly
 * can run XXH3 at competitive speeds, even without vector support.
 * Further details are explained in the implementation.
 *
 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
 * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
 *
 * XXH3 implementation is portable:
 * it has a generic C90 formulation that can be compiled on any platform,
 * all implementations generate exactly the same hash value on all platforms.
 * Starting from v0.8.0, it's also labelled "stable", meaning that
 * any future version will also generate the same hash value.
 *
 * XXH3 offers 2 variants, _64bits and _128bits.
 *
 * When only 64 bits are needed, prefer invoking the _64bits variant, as it
 * reduces the amount of mixing, resulting in faster speed on small inputs.
 * It's also generally simpler to manipulate a scalar return type than a struct.
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 */

/*-**********************************************************************
*  XXH3 64-bit variant
************************************************************************/

/* XXH3_64bits():
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);

/*
 * XXH3_64bits_withSeed():
 * This variant generates a custom secret on the fly
 * based on default secret altered using the `seed` value.
 * While this operation is decently fast, note that it's not completely free.
 * Note: seed==0 produces the same results as XXH3_64bits().
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);

/*!
 * The bare minimum size for a custom secret.
 *
 * @see
 *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
 *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
 */
#define XXH3_SECRET_SIZE_MIN 136

/*
 * XXH3_64bits_withSecret():
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
 * However, the quality of the secret impacts the dispersion of the hash algorithm.
 * Therefore, the secret _must_ look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
 * Whenever in doubt about the "randomness" of the blob of bytes,
 * consider employing "XXH3_generateSecret()" instead (see below).
 * It will generate a proper high entropy secret derived from the blob of bytes.
 * Another advantage of using XXH3_generateSecret() is that
 * it guarantees that all bits within the initial blob of bytes
 * will impact every bit of the output.
 * This is not necessarily the case when using the blob of bytes directly
 * because, when hashing _small_ inputs, only a portion of the secret is employed.
 */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

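
/*
 * Example (illustrative sketch): one-shot XXH3 hashing with the default,
 * seeded and secret-based variants. Note that this particular build defines
 * XXH_NO_XXH3 near the top of this header, so the sketch assumes a build
 * where XXH3 is enabled. The seed and secret shown are placeholders; a real
 * secret should come from XXH3_generateSecret() or a strong random source.
 *
 *     XXH64_hash_t hashRecord(const void* rec, size_t recSize,
 *                             const void* secret, size_t secretSize)
 *     {
 *         XXH64_hash_t h0 = XXH3_64bits(rec, recSize);                // default secret, seed 0
 *         XXH64_hash_t h1 = XXH3_64bits_withSeed(rec, recSize, 42);   // secret derived from seed
 *         (void)h0; (void)h1;
 *         // secretSize must be >= XXH3_SECRET_SIZE_MIN (136 bytes)
 *         return XXH3_64bits_withSecret(rec, recSize, secret, secretSize);
 *     }
 */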

/*******   Streaming   *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 */

/*!
 * @brief The state struct for the XXH3 streaming API.
 *
 * @see XXH3_state_s for details.
 */
typedef struct XXH3_state_s XXH3_state_t;
XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);

/*
 * XXH3_64bits_reset():
 * Initialize with default parameters.
 * digest will be equivalent to `XXH3_64bits()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
/*
 * XXH3_64bits_reset_withSeed():
 * Generate a custom secret from `seed`, and store it into `statePtr`.
 * digest will be equivalent to `XXH3_64bits_withSeed()`.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
/*
 * XXH3_64bits_reset_withSecret():
 * `secret` is referenced, it _must outlive_ the hash streaming session.
 * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
 * and the quality of produced hash values depends on secret's entropy
 * (secret's content should look like a bunch of random bytes).
 * When in doubt about the randomness of a candidate `secret`,
 * consider employing `XXH3_generateSecret()` instead (see below).
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);

/* note : canonical representation of XXH3 is the same as XXH64
 * since they both produce XXH64_hash_t values */


/*-**********************************************************************
*  XXH3 128-bit variant
************************************************************************/

/*!
 * @brief The return value from 128-bit hashes.
 *
 * Stored in little endian order, although the fields themselves are in native
 * endianness.
 */
typedef struct {
    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
    XXH64_hash_t high64;  /*!< `value >> 64` */
} XXH128_hash_t;

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

/*******   Streaming   *******/
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 *
 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
 * Use already declared XXH3_createState() and XXH3_freeState().
 *
 * All reset and streaming functions have same meaning as their 64-bit counterpart.
 */

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);

/* Following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */

/*!
 * XXH128_isEqual():
 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
 */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);

/*!
 * XXH128_cmp():
 *
 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
 *
 * return: >0 if *h128_1  > *h128_2
 *         =0 if *h128_1 == *h128_2
 *         <0 if *h128_1  < *h128_2
 */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);

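
/*
 * Example (illustrative sketch): sorting an array of 128-bit hashes with the
 * standard library, using XXH128_cmp() as the comparator. As with the other
 * XXH3 examples, this assumes a build where XXH_NO_XXH3 is not defined.
 *
 *     #include <stdlib.h>   // qsort
 *
 *     void sortHashes(XXH128_hash_t* hashes, size_t count)
 *     {
 *         // XXH128_cmp() takes const void* arguments, matching qsort's contract
 *         qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
 *     }
 */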

/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);


#endif  /* !XXH_NO_XXH3 */
#endif  /* XXH_NO_LONG_LONG */

/*!
 * @}
 */
#endif /* XXHASH_H_5627135585666179 */



#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
#define XXHASH_H_STATIC_13879238742
/* ****************************************************************************
 * This section contains declarations which are not guaranteed to remain stable.
 * They may change in future versions, becoming incompatible with a different
 * version of the library.
 * These declarations should only be used with static linking.
 * Never use them in association with dynamic linking!
 ***************************************************************************** */

/*
 * These definitions are only present to allow static allocation
 * of XXH states, on stack or in a struct, for example.
 * Never **ever** access their members directly.
 */

/*!
 * @internal
 * @brief Structure for XXH32 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH32_state_t.
 * Do not access the members of this struct directly.
 * @see XXH64_state_s, XXH3_state_s
 */
struct XXH32_state_s {
   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
   XXH32_hash_t v[4];         /*!< Accumulator lanes */
   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
};   /* typedef'd to XXH32_state_t */


#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */

/*!
 * @internal
 * @brief Structure for XXH64 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH64_state_t.
 * Do not access the members of this struct directly.
 * @see XXH32_state_s, XXH3_state_s
 */
struct XXH64_state_s {
   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
   XXH64_hash_t v[4];         /*!< Accumulator lanes */
   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
};   /* typedef'd to XXH64_state_t */


#ifndef XXH_NO_XXH3

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
#  include <stdalign.h>
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
/* In C++ alignas() is a keyword */
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__GNUC__)
#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#  define XXH_ALIGN(n)      __declspec(align(n))
#else
#  define XXH_ALIGN(n)   /* disabled */
#endif

/* Old GCC versions only accept the attribute after the type in structures. */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
    && defined(__GNUC__)
#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif

/*!
 * @brief The size of the internal XXH3 buffer.
 *
 * This is the optimal update size for incremental hashing.
 *
 * @see XXH3_64b_update(), XXH3_128b_update().
 */
#define XXH3_INTERNALBUFFER_SIZE 256

/*!
 * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
 *
 * This is the size used in @ref XXH3_kSecret and the seeded functions.
 *
 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
 */
#define XXH3_SECRET_DEFAULT_SIZE 192

/*!
 * @internal
 * @brief Structure for XXH3 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
 * Otherwise it is an opaque type.
 * Never use this definition in combination with dynamic library.
 * This allows fields to safely be changed in the future.
 *
 * @note ** This structure has a strict alignment requirement of 64 bytes!! **
 * Do not allocate this with `malloc()` or `new`,
 * it will not be sufficiently aligned.
 * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
 *
 * Typedef'd to @ref XXH3_state_t.
 * Never access the members of this struct directly.
 *
 * @see XXH3_INITSTATE() for stack initialization.
 * @see XXH3_createState(), XXH3_freeState().
 * @see XXH32_state_s, XXH64_state_s
 */
struct XXH3_state_s {
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
       /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
       /*!< The internal buffer. @see XXH32_state_s::mem32 */
   XXH32_hash_t bufferedSize;
       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
   XXH32_hash_t useSeed;
       /*!< Reserved field. Needed for padding on 64-bit. */
   size_t nbStripesSoFar;
       /*!< Number of stripes processed. */
   XXH64_hash_t totalLen;
       /*!< Total length hashed. 64-bit even on 32-bit targets. */
   size_t nbStripesPerBlock;
       /*!< Number of stripes per block. */
   size_t secretLimit;
       /*!< Size of @ref customSecret or @ref extSecret */
   XXH64_hash_t seed;
       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
   XXH64_hash_t reserved64;
       /*!< Reserved field. */
   const unsigned char* extSecret;
       /*!< Reference to an external secret for the _withSecret variants, NULL
        *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
}; /* typedef'd to XXH3_state_t */

#undef XXH_ALIGN_MEMBER

/*!
 * @brief Initializes a stack-allocated `XXH3_state_s`.
 *
 * When the @ref XXH3_state_t structure is merely emplaced on stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }

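
/*
 * Example (illustrative sketch): using a stack-allocated state together with
 * XXH3_INITSTATE() instead of XXH3_createState(). This relies on the struct
 * definition above, so it is only valid when XXH_STATIC_LINKING_ONLY (or
 * XXH_INLINE_ALL) is defined, and on a build where XXH3 is enabled.
 *
 *     XXH64_hash_t hashWithStackState(const void* data, size_t len, XXH64_hash_t seed)
 *     {
 *         XXH3_state_t state;       // stack allocation, no heap involved
 *         XXH3_INITSTATE(&state);   // required before a first _withSeed reset
 *         XXH3_64bits_reset_withSeed(&state, seed);
 *         XXH3_64bits_update(&state, data, len);
 *         return XXH3_64bits_digest(&state);   // no XXH3_freeState() needed
 *     }
 */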

/* XXH128() :
 * simple alias to pre-selected XXH3_128bits variant
 */
XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);


/* ===   Experimental API   === */
/* Symbols defined below must be considered tied to a specific library version. */

/*
 * XXH3_generateSecret():
 *
 * Derive a high-entropy secret from any user-defined content, named customSeed.
 * The generated secret can be used in combination with `*_withSecret()` functions.
 * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
 * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
 *
 * The function accepts as input a custom seed of any length and any content,
 * and derives from it a high-entropy secret of length @secretSize
 * into an already allocated buffer @secretBuffer.
 * @secretSize must be >= XXH3_SECRET_SIZE_MIN
 *
 * The generated secret can then be used with any `*_withSecret()` variant.
 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
 * are part of this list. They all accept a `secret` parameter
 * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
 * _and_ feature very high entropy (consist of random-looking bytes).
 * These conditions can be a high bar to meet, so
 * XXH3_generateSecret() can be employed to ensure proper quality.
 *
 * customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
 * The resulting `secret` will nonetheless provide all required qualities.
 *
 * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
 */
XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);

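
/*
 * Example (illustrative sketch): deriving a proper secret from a low-entropy
 * passphrase, then hashing with it. The passphrase and sizes are arbitrary;
 * the only hard requirement is secretSize >= XXH3_SECRET_SIZE_MIN. As above,
 * this assumes a build where XXH_NO_XXH3 is not defined.
 *
 *     XXH64_hash_t hashWithDerivedSecret(const void* data, size_t len)
 *     {
 *         static const char passphrase[] = "not very random";   // placeholder input
 *         unsigned char secret[XXH3_SECRET_SIZE_MIN];           // 136 bytes, the minimum
 *         if (XXH3_generateSecret(secret, sizeof(secret),
 *                                 passphrase, sizeof(passphrase)-1) != XXH_OK)
 *             return 0;   // generation failed (e.g. secretSize too small)
 *         return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 *     }
 */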

/*
 * XXH3_generateSecret_fromSeed():
 *
 * Generate the same secret as the _withSeed() variants.
 *
 * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
 * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
 *
 * The generated secret can be used in combination with
 * `*_withSecret()` and `_withSecretandSeed()` variants.
 * This generator is notably useful in combination with `_withSecretandSeed()`,
 * as a way to emulate a faster `_withSeed()` variant.
 */
XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);

/*
 * *_withSecretandSeed() :
 * These variants generate hash values using either
 * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
 * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
 *
 * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
 * `_withSeed()` has to generate the secret on the fly for "large" keys.
 * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
 * `_withSecret()` has to generate the masks on the fly for "small" keys,
 * which requires more instructions than _withSeed() variants.
 * Therefore, _withSecretandSeed variant combines the best of both worlds.
 *
 * When @secret has been generated by XXH3_generateSecret_fromSeed(),
 * this variant produces *exactly* the same results as `_withSeed()` variant,
 * hence offering only a pure speed benefit on "large" input,
 * by skipping the need to regenerate the secret for every large input.
 *
 * Another usage scenario is to hash the secret to a 64-bit hash value,
 * for example with XXH3_64bits(), which then becomes the seed,
 * and then employ both the seed and the secret in _withSecretandSeed().
 * On top of speed, an added benefit is that each bit in the secret
 * has a 50% chance to swap each bit in the output,
 * via its impact to the seed.
 * This is not guaranteed when using the secret directly in "small data" scenarios,
 * because only portions of the secret are employed for small data.
 */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecretandSeed(const void* data, size_t len,
                              const void* secret, size_t secretSize,
                              XXH64_hash_t seed);

XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(const void* data, size_t len,
                               const void* secret, size_t secretSize,
                               XXH64_hash_t seed64);

XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
                                    const void* secret, size_t secretSize,
                                    XXH64_hash_t seed64);

XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
                                     const void* secret, size_t secretSize,
                                     XXH64_hash_t seed64);


#endif  /* XXH_NO_XXH3 */
#endif  /* XXH_NO_LONG_LONG */
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#  define XXH_IMPLEMENTATION
#endif

#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */


/* ======================================================================== */
/* ======================================================================== */
/* ======================================================================== */


/*-**********************************************************************
 * xxHash implementation
 *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c.
 *
 * However, inlining requires implementation to be visible to the compiler,
 * hence be included alongside the header.
 * Previously, implementation was hosted inside xxhash.c,
 * which was then #included when inlining was activated.
 * This construction created issues with a few build and install systems,
 * as it required xxhash.c to be stored in /include directory.
 *
 * xxHash implementation is now directly integrated within xxhash.h.
 * As a consequence, xxhash.c is no longer needed in /include.
 *
 * xxhash.c is still available and is still useful.
 * In a "normal" setup, when xxhash is not inlined,
 * xxhash.h only exposes the prototypes and public symbols,
 * while xxhash.c can be built into an object file xxhash.o
 * which can then be linked into the final binary.
 ************************************************************************/

#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
#  define XXH_IMPLEM_13a8737387

/* *************************************
*  Tuning parameters
***************************************/

/*!
 * @defgroup tuning Tuning parameters
 * @{
 *
 * Various macros to control xxHash's behavior.
 */
#ifdef XXH_DOXYGEN
/*!
 * @brief Define this to disable 64-bit code.
 *
 * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
 */
#  define XXH_NO_LONG_LONG
#  undef XXH_NO_LONG_LONG /* don't actually */
/*!
 * @brief Controls how unaligned memory is accessed.
 *
 * By default, access to unaligned memory is controlled by `memcpy()`, which is
 * safe and portable.
 *
 * Unfortunately, on some target/compiler combinations, the generated assembly
 * is sub-optimal.
 *
 * The below switch allows selection of a different access method
 * in the search for improved performance.
 *
 * @par Possible options:
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
 *   @par
 *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
 *     eliminate the function call and treat it as an unaligned access.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
 *   @par
 *     Depends on compiler extensions and is therefore not portable.
 *     This method is safe _if_ your compiler supports it,
 *     and *generally* as fast or faster than `memcpy`.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
 *  @par
 *     Casts directly and dereferences. This method doesn't depend on the
 *     compiler, but it violates the C standard as it directly dereferences an
 *     unaligned pointer. It can generate buggy code on targets which do not
 *     support unaligned memory accesses, but in some circumstances, it's the
 *     only known way to get the most performance.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
 *  @par
 *     Also portable. This can generate the best code on old compilers which don't
 *     inline small `memcpy()` calls, and it might also be faster on big-endian
 *     systems which lack a native byteswap instruction. However, some compilers
 *     will emit literal byteshifts even if the target supports unaligned access.
 *  .
 *
 * @warning
 *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
 *   care, as what works on one compiler/platform/optimization level may cause
 *   another to read garbage data or even crash.
 *
 * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
 *
 * Prefer these methods in priority order (0 > 3 > 1 > 2)
 */
#  define XXH_FORCE_MEMORY_ACCESS 0

/*!
 * @def XXH_FORCE_ALIGN_CHECK
 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
 * and XXH64() only).
 *
 * This is an important performance trick for architectures without decent
 * unaligned memory access performance.
 *
 * It checks for input alignment, and when conditions are met, uses a "fast
 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
 * faster_ read speed.
 *
 * The check costs one initial branch per hash, which is generally negligible,
 * but not zero.
 *
 * Moreover, it's not useful to generate an additional code path if memory
 * access uses the same instruction for both aligned and unaligned
 * addresses (e.g. x86 and aarch64).
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * Align check is automatically disabled on x86, x64 & arm64,
 * which are platforms known to offer good unaligned memory accesses performance.
 *
 * This option does not affect XXH3 (only XXH32 and XXH64).
 */
#  define XXH_FORCE_ALIGN_CHECK 0

/*!
 * @def XXH_NO_INLINE_HINTS
 * @brief When non-zero, sets all functions to `static`.
 *
 * By default, xxHash tries to force the compiler to inline almost all internal
 * functions.
 *
 * This can usually improve performance due to reduced jumping and improved
 * constant folding, but significantly increases the size of the binary which
 * might not be favorable.
 *
 * Additionally, sometimes the forced inlining can be detrimental to performance,
 * depending on the architecture.
 *
 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
 * compiler full control on whether to inline or not.
 *
 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
 * -fno-inline with GCC or Clang, this will automatically be defined.
 */
#  define XXH_NO_INLINE_HINTS 0

/*!
 * @def XXH32_ENDJMP
 * @brief Whether to use a jump for `XXH32_finalize`.
 *
 * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
 * This is generally preferable for performance,
 * but depending on exact architecture, a jmp may be preferable.
 *
 * This setting is only possibly making a difference for very small inputs.
 */
#  define XXH32_ENDJMP 0

/*!
 * @internal
 * @brief Redefines old internal names.
 *
 * For compatibility with code that uses xxHash's internals before the names
 * were changed to improve namespacing. There is no other reason to use this.
 */
#  define XXH_OLD_NAMES
#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
#endif /* XXH_DOXYGEN */
/*!
 * @}
 */

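/*
 * Example (illustrative sketch): the tuning macros documented above are meant
 * to be set before this header is compiled, typically by the build system.
 * The values below are arbitrary illustrations, not recommendations.
 *
 *     // in a translation unit, before including the header:
 *     #define XXH_FORCE_MEMORY_ACCESS 3   // force the byteshift method
 *     #define XXH32_ENDJMP 1              // use a jump in XXH32_finalize
 *     #include "xxhash.h"
 *
 * or, equivalently, on the compiler command line:
 *
 *     cc -DXXH_FORCE_MEMORY_ACCESS=3 -DXXH32_ENDJMP=1 -c myunit.c
 */
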
#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
   /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
#  if !defined(__clang__) && \
( \
    (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
    ( \
        defined(__GNUC__) && ( \
            (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
            ( \
                defined(__mips__) && \
                (__mips <= 5 || __mips_isa_rev < 6) && \
                (!defined(__mips16) || defined(__mips_mips16e2)) \
            ) \
        ) \
    ) \
)
#    define XXH_FORCE_MEMORY_ACCESS 1
#  endif
#endif

#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
#    define XXH_FORCE_ALIGN_CHECK 0
#  else
#    define XXH_FORCE_ALIGN_CHECK 1
#  endif
#endif

#ifndef XXH_NO_INLINE_HINTS
#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
#    define XXH_NO_INLINE_HINTS 1
#  else
#    define XXH_NO_INLINE_HINTS 0
#  endif
#endif

#ifndef XXH32_ENDJMP
/* generally preferable for performance */
#  define XXH32_ENDJMP 0
#endif

/*!
 * @defgroup impl Implementation
 * @{
 */


/* *************************************
*  Includes & Memory related functions
***************************************/
/* Modify the local functions below should you wish to use some other memory routines */
/* for ZSTD_malloc(), ZSTD_free() */
#define ZSTD_DEPS_NEED_MALLOC
#include "zstd_deps.h"  /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */
static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); }
static void  XXH_free (void* p)  { ZSTD_free(p); }
static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest,src,size); }


/* *************************************
*  Compiler Specific Options
***************************************/
#ifdef _MSC_VER /* Visual Studio warning fix */
#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
#endif

#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
#  if defined(__GNUC__) || defined(__clang__)
#    define XXH_FORCE_INLINE static __attribute__((unused))
#  else
#    define XXH_FORCE_INLINE static
#  endif
#  define XXH_NO_INLINE static
/* enable inlining hints */
#elif defined(__GNUC__) || defined(__clang__)
#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
#  define XXH_NO_INLINE static __attribute__((noinline))
#elif defined(_MSC_VER)  /* Visual Studio */
#  define XXH_FORCE_INLINE static __forceinline
#  define XXH_NO_INLINE static __declspec(noinline)
#elif defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
#  define XXH_FORCE_INLINE static inline
#  define XXH_NO_INLINE static
#else
#  define XXH_FORCE_INLINE static
#  define XXH_NO_INLINE static
#endif



/* *************************************
*  Debug
***************************************/
/*!
 * @ingroup tuning
 * @def XXH_DEBUGLEVEL
 * @brief Sets the debugging level.
 *
 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
 * compiler's command line options. The value must be a number.
 */
#ifndef XXH_DEBUGLEVEL
#  ifdef DEBUGLEVEL /* backwards compat */
#    define XXH_DEBUGLEVEL DEBUGLEVEL
#  else
#    define XXH_DEBUGLEVEL 0
#  endif
#endif
The value must be a number.1504*/1505#ifndef XXH_DEBUGLEVEL1506# ifdef DEBUGLEVEL /* backwards compat */1507# define XXH_DEBUGLEVEL DEBUGLEVEL1508# else1509# define XXH_DEBUGLEVEL 01510# endif1511#endif15121513#if (XXH_DEBUGLEVEL>=1)1514# include <assert.h> /* note: can still be disabled with NDEBUG */1515# define XXH_ASSERT(c) assert(c)1516#else1517# define XXH_ASSERT(c) ((void)0)1518#endif15191520/* note: use after variable declarations */1521#ifndef XXH_STATIC_ASSERT1522# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */1523# include <assert.h>1524# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)1525# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */1526# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)1527# else1528# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)1529# endif1530# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)1531#endif15321533/*!1534* @internal1535* @def XXH_COMPILER_GUARD(var)1536* @brief Used to prevent unwanted optimizations for @p var.1537*1538* It uses an empty GCC inline assembly statement with a register constraint1539* which forces @p var into a general purpose register (eg eax, ebx, ecx1540* on x86) and marks it as modified.1541*1542* This is used in a few places to avoid unwanted autovectorization (e.g.1543* XXH32_round()). All vectorization we want is explicit via intrinsics,1544* and _usually_ isn't wanted elsewhere.1545*1546* We also use it to prevent unwanted constant folding for AArch64 in1547* XXH3_initCustomSecret_scalar().1548*/1549#if defined(__GNUC__) || defined(__clang__)1550# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))1551#else1552# define XXH_COMPILER_GUARD(var) ((void)0)1553#endif15541555/* *************************************1556* Basic Types1557***************************************/1558#if !defined (__VMS) \1559&& (defined (__cplusplus) \1560|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )1561# include <stdint.h>1562typedef uint8_t xxh_u8;1563#else1564typedef unsigned char xxh_u8;1565#endif1566typedef XXH32_hash_t xxh_u32;15671568#ifdef XXH_OLD_NAMES1569# define BYTE xxh_u81570# define U8 xxh_u81571# define U32 xxh_u321572#endif15731574/* *** Memory access *** */15751576/*!1577* @internal1578* @fn xxh_u32 XXH_read32(const void* ptr)1579* @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.1580*1581* Affected by @ref XXH_FORCE_MEMORY_ACCESS.1582*1583* @param ptr The pointer to read from.1584* @return The 32-bit native endian integer from the bytes at @p ptr.1585*/15861587/*!1588* @internal1589* @fn xxh_u32 XXH_readLE32(const void* ptr)1590* @brief Reads an unaligned 32-bit little endian integer from @p ptr.1591*1592* Affected by @ref XXH_FORCE_MEMORY_ACCESS.1593*1594* @param ptr The pointer to read from.1595* @return The 32-bit little endian integer from the bytes at @p ptr.1596*/15971598/*!1599* @internal1600* @fn xxh_u32 XXH_readBE32(const void* ptr)1601* @brief Reads an unaligned 32-bit big endian integer from @p ptr.1602*1603* Affected by @ref XXH_FORCE_MEMORY_ACCESS.1604*1605* @param ptr The pointer to read from.1606* @return The 32-bit big endian integer from the bytes at @p ptr.1607*/16081609/*!1610* @internal1611* @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)1612* @brief Like @ref XXH_readLE32(), but has an option for aligned reads.1613*1614* Affected by @ref 
XXH_FORCE_MEMORY_ACCESS.1615* Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is1616* always @ref XXH_alignment::XXH_unaligned.1617*1618* @param ptr The pointer to read from.1619* @param align Whether @p ptr is aligned.1620* @pre1621* If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte1622* aligned.1623* @return The 32-bit little endian integer from the bytes at @p ptr.1624*/16251626#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))1627/*1628* Manual byteshift. Best for old compilers which don't inline memcpy.1629* We actually directly use XXH_readLE32 and XXH_readBE32.1630*/1631#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))16321633/*1634* Force direct memory access. Only works on CPU which support unaligned memory1635* access in hardware.1636*/1637static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }16381639#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))16401641/*1642* __pack instructions are safer but compiler specific, hence potentially1643* problematic for some compilers.1644*1645* Currently only defined for GCC and ICC.1646*/1647#ifdef XXH_OLD_NAMES1648typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;1649#endif1650static xxh_u32 XXH_read32(const void* ptr)1651{1652typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;1653return ((const xxh_unalign*)ptr)->u32;1654}16551656#else16571658/*1659* Portable and safe solution. Generally efficient.1660* see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html1661*/1662static xxh_u32 XXH_read32(const void* memPtr)1663{1664xxh_u32 val;1665XXH_memcpy(&val, memPtr, sizeof(val));1666return val;1667}16681669#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */167016711672/* *** Endianness *** */16731674/*!1675* @ingroup tuning1676* @def XXH_CPU_LITTLE_ENDIAN1677* @brief Whether the target is little endian.1678*1679* Defined to 1 if the target is little endian, or 0 if it is big endian.1680* It can be defined externally, for example on the compiler command line.1681*1682* If it is not defined,1683* a runtime check (which is usually constant folded) is used instead.1684*1685* @note1686* This is not necessarily defined to an integer constant.1687*1688* @see XXH_isLittleEndian() for the runtime check.1689*/1690#ifndef XXH_CPU_LITTLE_ENDIAN1691/*1692* Try to detect endianness automatically, to avoid the nonstandard behavior1693* in `XXH_isLittleEndian()`1694*/1695# if defined(_WIN32) /* Windows is always little endian */ \1696|| defined(__LITTLE_ENDIAN__) \1697|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)1698# define XXH_CPU_LITTLE_ENDIAN 11699# elif defined(__BIG_ENDIAN__) \1700|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)1701# define XXH_CPU_LITTLE_ENDIAN 01702# else1703/*!1704* @internal1705* @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.1706*1707* Most compilers will constant fold this.1708*/1709static int XXH_isLittleEndian(void)1710{1711/*1712* Portable and well-defined behavior.1713* Don't use static: it is detrimental to performance.1714*/1715const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };1716return one.c[0];1717}1718# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()1719# endif1720#endif17211722172317241725/* ****************************************1726* Compiler-specific Functions and Macros1727******************************************/1728#define XXH_GCC_VERSION (__GNUC__ * 100 + 
__GNUC_MINOR__)17291730#ifdef __has_builtin1731# define XXH_HAS_BUILTIN(x) __has_builtin(x)1732#else1733# define XXH_HAS_BUILTIN(x) 01734#endif17351736/*!1737* @internal1738* @def XXH_rotl32(x,r)1739* @brief 32-bit rotate left.1740*1741* @param x The 32-bit integer to be rotated.1742* @param r The number of bits to rotate.1743* @pre1744* @p r > 0 && @p r < 321745* @note1746* @p x and @p r may be evaluated multiple times.1747* @return The rotated result.1748*/1749#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \1750&& XXH_HAS_BUILTIN(__builtin_rotateleft64)1751# define XXH_rotl32 __builtin_rotateleft321752# define XXH_rotl64 __builtin_rotateleft641753/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */1754#elif defined(_MSC_VER)1755# define XXH_rotl32(x,r) _rotl(x,r)1756# define XXH_rotl64(x,r) _rotl64(x,r)1757#else1758# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))1759# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))1760#endif17611762/*!1763* @internal1764* @fn xxh_u32 XXH_swap32(xxh_u32 x)1765* @brief A 32-bit byteswap.1766*1767* @param x The 32-bit integer to byteswap.1768* @return @p x, byteswapped.1769*/1770#if defined(_MSC_VER) /* Visual Studio */1771# define XXH_swap32 _byteswap_ulong1772#elif XXH_GCC_VERSION >= 4031773# define XXH_swap32 __builtin_bswap321774#else1775static xxh_u32 XXH_swap32 (xxh_u32 x)1776{1777return ((x << 24) & 0xff000000 ) |1778((x << 8) & 0x00ff0000 ) |1779((x >> 8) & 0x0000ff00 ) |1780((x >> 24) & 0x000000ff );1781}1782#endif178317841785/* ***************************1786* Memory reads1787*****************************/17881789/*!1790* @internal1791* @brief Enum to indicate whether a pointer is aligned.1792*/1793typedef enum {1794XXH_aligned, /*!< Aligned */1795XXH_unaligned /*!< Possibly unaligned */1796} XXH_alignment;17971798/*1799* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.1800*1801* This is ideal for older compilers which don't inline memcpy.1802*/1803#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))18041805XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)1806{1807const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;1808return bytePtr[0]1809| ((xxh_u32)bytePtr[1] << 8)1810| ((xxh_u32)bytePtr[2] << 16)1811| ((xxh_u32)bytePtr[3] << 24);1812}18131814XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)1815{1816const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;1817return bytePtr[3]1818| ((xxh_u32)bytePtr[2] << 8)1819| ((xxh_u32)bytePtr[1] << 16)1820| ((xxh_u32)bytePtr[0] << 24);1821}18221823#else1824XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)1825{1826return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));1827}18281829static xxh_u32 XXH_readBE32(const void* ptr)1830{1831return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);1832}1833#endif18341835XXH_FORCE_INLINE xxh_u321836XXH_readLE32_align(const void* ptr, XXH_alignment align)1837{1838if (align==XXH_unaligned) {1839return XXH_readLE32(ptr);1840} else {1841return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);1842}1843}184418451846/* *************************************1847* Misc1848***************************************/1849/*! 
@ingroup public */1850XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }185118521853/* *******************************************************************1854* 32-bit hash functions1855*********************************************************************/1856/*!1857* @}1858* @defgroup xxh32_impl XXH32 implementation1859* @ingroup impl1860* @{1861*/1862/* #define instead of static const, to be used as initializers */1863#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */1864#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */1865#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */1866#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */1867#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */18681869#ifdef XXH_OLD_NAMES1870# define PRIME32_1 XXH_PRIME32_11871# define PRIME32_2 XXH_PRIME32_21872# define PRIME32_3 XXH_PRIME32_31873# define PRIME32_4 XXH_PRIME32_41874# define PRIME32_5 XXH_PRIME32_51875#endif18761877/*!1878* @internal1879* @brief Normal stripe processing routine.1880*1881* This shuffles the bits so that any bit from @p input impacts several bits in1882* @p acc.1883*1884* @param acc The accumulator lane.1885* @param input The stripe of input to mix.1886* @return The mixed accumulator lane.1887*/1888static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)1889{1890acc += input * XXH_PRIME32_2;1891acc = XXH_rotl32(acc, 13);1892acc *= XXH_PRIME32_1;1893#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)1894/*1895* UGLY HACK:1896* A compiler fence is the only thing that prevents GCC and Clang from1897* autovectorizing the XXH32 loop (pragmas and attributes don't work for some1898* reason) without globally disabling SSE4.1.1899*1900* The reason we want to avoid vectorization is because despite working on1901* 4 integers at a time, there are multiple factors slowing XXH32 down on1902* SSE4:1903* - There's a ridiculous amount of lag from pmulld (10 cycles of latency on1904* newer chips!) making it slightly slower to multiply four integers at1905* once compared to four integers independently. Even when pmulld was1906* fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE1907* just to multiply unless doing a long operation.1908*1909* - Four instructions are required to rotate,1910* movqda tmp, v // not required with VEX encoding1911* pslld tmp, 13 // tmp <<= 131912* psrld v, 19 // x >>= 191913* por v, tmp // x |= tmp1914* compared to one for scalar:1915* roll v, 13 // reliably fast across the board1916* shldl v, v, 13 // Sandy Bridge and later prefer this for some reason1917*1918* - Instruction level parallelism is actually more beneficial here because1919* the SIMD actually serializes this operation: While v1 is rotating, v21920* can load data, while v3 can multiply. 
SSE forces them to operate1921* together.1922*1923* This is also enabled on AArch64, as Clang autovectorizes it incorrectly1924* and it is pointless writing a NEON implementation that is basically the1925* same speed as scalar for XXH32.1926*/1927XXH_COMPILER_GUARD(acc);1928#endif1929return acc;1930}19311932/*!1933* @internal1934* @brief Mixes all bits to finalize the hash.1935*1936* The final mix ensures that all input bits have a chance to impact any bit in1937* the output digest, resulting in an unbiased distribution.1938*1939* @param h32 The hash to avalanche.1940* @return The avalanched hash.1941*/1942static xxh_u32 XXH32_avalanche(xxh_u32 h32)1943{1944h32 ^= h32 >> 15;1945h32 *= XXH_PRIME32_2;1946h32 ^= h32 >> 13;1947h32 *= XXH_PRIME32_3;1948h32 ^= h32 >> 16;1949return(h32);1950}19511952#define XXH_get32bits(p) XXH_readLE32_align(p, align)19531954/*!1955* @internal1956* @brief Processes the last 0-15 bytes of @p ptr.1957*1958* There may be up to 15 bytes remaining to consume from the input.1959* This final stage will digest them to ensure that all input bytes are present1960* in the final mix.1961*1962* @param h32 The hash to finalize.1963* @param ptr The pointer to the remaining input.1964* @param len The remaining length, modulo 16.1965* @param align Whether @p ptr is aligned.1966* @return The finalized hash.1967*/1968static xxh_u321969XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)1970{1971#define XXH_PROCESS1 do { \1972h32 += (*ptr++) * XXH_PRIME32_5; \1973h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \1974} while (0)19751976#define XXH_PROCESS4 do { \1977h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \1978ptr += 4; \1979h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \1980} while (0)19811982if (ptr==NULL) XXH_ASSERT(len == 0);19831984/* Compact rerolled version; generally faster */1985if (!XXH32_ENDJMP) {1986len &= 15;1987while (len >= 4) {1988XXH_PROCESS4;1989len -= 4;1990}1991while (len > 0) {1992XXH_PROCESS1;1993--len;1994}1995return XXH32_avalanche(h32);1996} else {1997switch(len&15) /* or switch(bEnd - p) */ {1998case 12: XXH_PROCESS4;1999XXH_FALLTHROUGH;2000case 8: XXH_PROCESS4;2001XXH_FALLTHROUGH;2002case 4: XXH_PROCESS4;2003return XXH32_avalanche(h32);20042005case 13: XXH_PROCESS4;2006XXH_FALLTHROUGH;2007case 9: XXH_PROCESS4;2008XXH_FALLTHROUGH;2009case 5: XXH_PROCESS4;2010XXH_PROCESS1;2011return XXH32_avalanche(h32);20122013case 14: XXH_PROCESS4;2014XXH_FALLTHROUGH;2015case 10: XXH_PROCESS4;2016XXH_FALLTHROUGH;2017case 6: XXH_PROCESS4;2018XXH_PROCESS1;2019XXH_PROCESS1;2020return XXH32_avalanche(h32);20212022case 15: XXH_PROCESS4;2023XXH_FALLTHROUGH;2024case 11: XXH_PROCESS4;2025XXH_FALLTHROUGH;2026case 7: XXH_PROCESS4;2027XXH_FALLTHROUGH;2028case 3: XXH_PROCESS1;2029XXH_FALLTHROUGH;2030case 2: XXH_PROCESS1;2031XXH_FALLTHROUGH;2032case 1: XXH_PROCESS1;2033XXH_FALLTHROUGH;2034case 0: return XXH32_avalanche(h32);2035}2036XXH_ASSERT(0);2037return h32; /* reaching this point is deemed impossible */2038}2039}20402041#ifdef XXH_OLD_NAMES2042# define PROCESS1 XXH_PROCESS12043# define PROCESS4 XXH_PROCESS42044#else2045# undef XXH_PROCESS12046# undef XXH_PROCESS42047#endif20482049/*!2050* @internal2051* @brief The implementation for @ref XXH32().2052*2053* @param input , len , seed Directly passed from @ref XXH32().2054* @param align Whether @p input is aligned.2055* @return The calculated hash.2056*/2057XXH_FORCE_INLINE xxh_u322058XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)2059{2060xxh_u32 h32;20612062if 
(input==NULL) XXH_ASSERT(len == 0);20632064if (len>=16) {2065const xxh_u8* const bEnd = input + len;2066const xxh_u8* const limit = bEnd - 15;2067xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;2068xxh_u32 v2 = seed + XXH_PRIME32_2;2069xxh_u32 v3 = seed + 0;2070xxh_u32 v4 = seed - XXH_PRIME32_1;20712072do {2073v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;2074v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;2075v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;2076v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;2077} while (input < limit);20782079h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)2080+ XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);2081} else {2082h32 = seed + XXH_PRIME32_5;2083}20842085h32 += (xxh_u32)len;20862087return XXH32_finalize(h32, input, len&15, align);2088}20892090/*! @ingroup xxh32_family */2091XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)2092{2093#if 02094/* Simple version, good for code maintenance, but unfortunately slow for small inputs */2095XXH32_state_t state;2096XXH32_reset(&state, seed);2097XXH32_update(&state, (const xxh_u8*)input, len);2098return XXH32_digest(&state);2099#else2100if (XXH_FORCE_ALIGN_CHECK) {2101if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */2102return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);2103} }21042105return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);2106#endif2107}2108210921102111/******* Hash streaming *******/2112/*!2113* @ingroup xxh32_family2114*/2115XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)2116{2117return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));2118}2119/*! @ingroup xxh32_family */2120XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)2121{2122XXH_free(statePtr);2123return XXH_OK;2124}21252126/*! @ingroup xxh32_family */2127XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)2128{2129XXH_memcpy(dstState, srcState, sizeof(*dstState));2130}21312132/*! @ingroup xxh32_family */2133XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)2134{2135XXH_ASSERT(statePtr != NULL);2136memset(statePtr, 0, sizeof(*statePtr));2137statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;2138statePtr->v[1] = seed + XXH_PRIME32_2;2139statePtr->v[2] = seed + 0;2140statePtr->v[3] = seed - XXH_PRIME32_1;2141return XXH_OK;2142}214321442145/*! 
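 * Typical streaming usage (an illustrative sketch mirroring the "simple version"
 * shown inside XXH32() above; `seed`, `buf`, `bufLen` and `more_data` are
 * hypothetical caller-side names):
 *
 *   XXH32_state_t* const st = XXH32_createState();
 *   XXH32_reset(st, seed);
 *   while (more_data) XXH32_update(st, buf, bufLen);
 *   hash = XXH32_digest(st);
 *   XXH32_freeState(st);
 *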
@ingroup xxh32_family */2146XXH_PUBLIC_API XXH_errorcode2147XXH32_update(XXH32_state_t* state, const void* input, size_t len)2148{2149if (input==NULL) {2150XXH_ASSERT(len == 0);2151return XXH_OK;2152}21532154{ const xxh_u8* p = (const xxh_u8*)input;2155const xxh_u8* const bEnd = p + len;21562157state->total_len_32 += (XXH32_hash_t)len;2158state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));21592160if (state->memsize + len < 16) { /* fill in tmp buffer */2161XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);2162state->memsize += (XXH32_hash_t)len;2163return XXH_OK;2164}21652166if (state->memsize) { /* some data left from previous update */2167XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);2168{ const xxh_u32* p32 = state->mem32;2169state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;2170state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;2171state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;2172state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));2173}2174p += 16-state->memsize;2175state->memsize = 0;2176}21772178if (p <= bEnd-16) {2179const xxh_u8* const limit = bEnd - 16;21802181do {2182state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;2183state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;2184state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;2185state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;2186} while (p<=limit);21872188}21892190if (p < bEnd) {2191XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));2192state->memsize = (unsigned)(bEnd-p);2193}2194}21952196return XXH_OK;2197}219821992200/*! @ingroup xxh32_family */2201XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)2202{2203xxh_u32 h32;22042205if (state->large_len) {2206h32 = XXH_rotl32(state->v[0], 1)2207+ XXH_rotl32(state->v[1], 7)2208+ XXH_rotl32(state->v[2], 12)2209+ XXH_rotl32(state->v[3], 18);2210} else {2211h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;2212}22132214h32 += state->total_len_32;22152216return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);2217}221822192220/******* Canonical representation *******/22212222/*!2223* @ingroup xxh32_family2224* The default return values from XXH functions are unsigned 32 and 64 bit2225* integers.2226*2227* The canonical representation uses big endian convention, the same convention2228* as human-readable numbers (large digits first).2229*2230* This way, hash values can be written into a file or buffer, remaining2231* comparable across different systems.2232*2233* The following functions allow transformation of hash values to and from their2234* canonical format.2235*/2236XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)2237{2238/* XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); */2239if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);2240XXH_memcpy(dst, &hash, sizeof(*dst));2241}2242/*! 
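 * Round-trip sketch (illustrative, not part of the original source; `h` is a
 * hypothetical XXH32_hash_t value):
 *
 *   XXH32_canonical_t c;
 *   XXH32_canonicalFromHash(&c, h);               // stores h as 4 big-endian bytes
 *   XXH_ASSERT(XXH32_hashFromCanonical(&c) == h); // recovers the same value
 *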
@ingroup xxh32_family */2243XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)2244{2245return XXH_readBE32(src);2246}224722482249#ifndef XXH_NO_LONG_LONG22502251/* *******************************************************************2252* 64-bit hash functions2253*********************************************************************/2254/*!2255* @}2256* @ingroup impl2257* @{2258*/2259/******* Memory access *******/22602261typedef XXH64_hash_t xxh_u64;22622263#ifdef XXH_OLD_NAMES2264# define U64 xxh_u642265#endif22662267#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))2268/*2269* Manual byteshift. Best for old compilers which don't inline memcpy.2270* We actually directly use XXH_readLE64 and XXH_readBE64.2271*/2272#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))22732274/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */2275static xxh_u64 XXH_read64(const void* memPtr)2276{2277return *(const xxh_u64*) memPtr;2278}22792280#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))22812282/*2283* __pack instructions are safer, but compiler specific, hence potentially2284* problematic for some compilers.2285*2286* Currently only defined for GCC and ICC.2287*/2288#ifdef XXH_OLD_NAMES2289typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;2290#endif2291static xxh_u64 XXH_read64(const void* ptr)2292{2293typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;2294return ((const xxh_unalign64*)ptr)->u64;2295}22962297#else22982299/*2300* Portable and safe solution. Generally efficient.2301* see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html2302*/2303static xxh_u64 XXH_read64(const void* memPtr)2304{2305xxh_u64 val;2306XXH_memcpy(&val, memPtr, sizeof(val));2307return val;2308}23092310#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */23112312#if defined(_MSC_VER) /* Visual Studio */2313# define XXH_swap64 _byteswap_uint642314#elif XXH_GCC_VERSION >= 4032315# define XXH_swap64 __builtin_bswap642316#else2317static xxh_u64 XXH_swap64(xxh_u64 x)2318{2319return ((x << 56) & 0xff00000000000000ULL) |2320((x << 40) & 0x00ff000000000000ULL) |2321((x << 24) & 0x0000ff0000000000ULL) |2322((x << 8) & 0x000000ff00000000ULL) |2323((x >> 8) & 0x00000000ff000000ULL) |2324((x >> 24) & 0x0000000000ff0000ULL) |2325((x >> 40) & 0x000000000000ff00ULL) |2326((x >> 56) & 0x00000000000000ffULL);2327}2328#endif232923302331/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */2332#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))23332334XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)2335{2336const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2337return bytePtr[0]2338| ((xxh_u64)bytePtr[1] << 8)2339| ((xxh_u64)bytePtr[2] << 16)2340| ((xxh_u64)bytePtr[3] << 24)2341| ((xxh_u64)bytePtr[4] << 32)2342| ((xxh_u64)bytePtr[5] << 40)2343| ((xxh_u64)bytePtr[6] << 48)2344| ((xxh_u64)bytePtr[7] << 56);2345}23462347XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)2348{2349const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2350return bytePtr[7]2351| ((xxh_u64)bytePtr[6] << 8)2352| ((xxh_u64)bytePtr[5] << 16)2353| ((xxh_u64)bytePtr[4] << 24)2354| ((xxh_u64)bytePtr[3] << 32)2355| ((xxh_u64)bytePtr[2] << 40)2356| ((xxh_u64)bytePtr[1] << 48)2357| ((xxh_u64)bytePtr[0] << 56);2358}23592360#else2361XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)2362{2363return XXH_CPU_LITTLE_ENDIAN ? 
XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));2364}23652366static xxh_u64 XXH_readBE64(const void* ptr)2367{2368return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);2369}2370#endif23712372XXH_FORCE_INLINE xxh_u642373XXH_readLE64_align(const void* ptr, XXH_alignment align)2374{2375if (align==XXH_unaligned)2376return XXH_readLE64(ptr);2377else2378return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);2379}238023812382/******* xxh64 *******/2383/*!2384* @}2385* @defgroup xxh64_impl XXH64 implementation2386* @ingroup impl2387* @{2388*/2389/* #define rather that static const, to be used as initializers */2390#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */2391#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */2392#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */2393#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */2394#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */23952396#ifdef XXH_OLD_NAMES2397# define PRIME64_1 XXH_PRIME64_12398# define PRIME64_2 XXH_PRIME64_22399# define PRIME64_3 XXH_PRIME64_32400# define PRIME64_4 XXH_PRIME64_42401# define PRIME64_5 XXH_PRIME64_52402#endif24032404static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)2405{2406acc += input * XXH_PRIME64_2;2407acc = XXH_rotl64(acc, 31);2408acc *= XXH_PRIME64_1;2409return acc;2410}24112412static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)2413{2414val = XXH64_round(0, val);2415acc ^= val;2416acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;2417return acc;2418}24192420static xxh_u64 XXH64_avalanche(xxh_u64 h64)2421{2422h64 ^= h64 >> 33;2423h64 *= XXH_PRIME64_2;2424h64 ^= h64 >> 29;2425h64 *= XXH_PRIME64_3;2426h64 ^= h64 >> 32;2427return h64;2428}242924302431#define XXH_get64bits(p) XXH_readLE64_align(p, align)24322433static xxh_u642434XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)2435{2436if (ptr==NULL) XXH_ASSERT(len == 0);2437len &= 31;2438while (len >= 8) {2439xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));2440ptr += 8;2441h64 ^= k1;2442h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;2443len -= 8;2444}2445if (len >= 4) {2446h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;2447ptr += 4;2448h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;2449len -= 4;2450}2451while (len > 0) {2452h64 ^= (*ptr++) * XXH_PRIME64_5;2453h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;2454--len;2455}2456return XXH64_avalanche(h64);2457}24582459#ifdef XXH_OLD_NAMES2460# define PROCESS1_64 XXH_PROCESS1_642461# define PROCESS4_64 XXH_PROCESS4_642462# define PROCESS8_64 XXH_PROCESS8_642463#else2464# undef XXH_PROCESS1_642465# undef XXH_PROCESS4_642466# undef XXH_PROCESS8_642467#endif24682469XXH_FORCE_INLINE xxh_u642470XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)2471{2472xxh_u64 h64;2473if (input==NULL) XXH_ASSERT(len == 0);24742475if (len>=32) {2476const xxh_u8* const bEnd = input + len;2477const xxh_u8* const limit = bEnd - 31;2478xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;2479xxh_u64 v2 = seed + XXH_PRIME64_2;2480xxh_u64 v3 = seed + 0;2481xxh_u64 v4 = seed - XXH_PRIME64_1;24822483do {2484v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;2485v2 = 
XXH64_round(v2, XXH_get64bits(input)); input+=8;2486v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;2487v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;2488} while (input<limit);24892490h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);2491h64 = XXH64_mergeRound(h64, v1);2492h64 = XXH64_mergeRound(h64, v2);2493h64 = XXH64_mergeRound(h64, v3);2494h64 = XXH64_mergeRound(h64, v4);24952496} else {2497h64 = seed + XXH_PRIME64_5;2498}24992500h64 += (xxh_u64) len;25012502return XXH64_finalize(h64, input, len, align);2503}250425052506/*! @ingroup xxh64_family */2507XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)2508{2509#if 02510/* Simple version, good for code maintenance, but unfortunately slow for small inputs */2511XXH64_state_t state;2512XXH64_reset(&state, seed);2513XXH64_update(&state, (const xxh_u8*)input, len);2514return XXH64_digest(&state);2515#else2516if (XXH_FORCE_ALIGN_CHECK) {2517if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */2518return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);2519} }25202521return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);25222523#endif2524}25252526/******* Hash Streaming *******/25272528/*! @ingroup xxh64_family*/2529XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)2530{2531return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));2532}2533/*! @ingroup xxh64_family */2534XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)2535{2536XXH_free(statePtr);2537return XXH_OK;2538}25392540/*! @ingroup xxh64_family */2541XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)2542{2543XXH_memcpy(dstState, srcState, sizeof(*dstState));2544}25452546/*! @ingroup xxh64_family */2547XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)2548{2549XXH_ASSERT(statePtr != NULL);2550memset(statePtr, 0, sizeof(*statePtr));2551statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;2552statePtr->v[1] = seed + XXH_PRIME64_2;2553statePtr->v[2] = seed + 0;2554statePtr->v[3] = seed - XXH_PRIME64_1;2555return XXH_OK;2556}25572558/*! 
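 * Feeding the input in arbitrary chunks yields the same digest as a single
 * one-shot call (illustrative sketch; `st`, `part1`, `part2`, `len1`, `len2` and
 * `whole_buffer` are hypothetical, with `whole_buffer` being the concatenation
 * of `part1` and `part2`):
 *
 *   XXH64_reset(st, seed);
 *   XXH64_update(st, part1, len1);
 *   XXH64_update(st, part2, len2);
 *   // XXH64_digest(st) == XXH64(whole_buffer, len1 + len2, seed)
 *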
@ingroup xxh64_family */2559XXH_PUBLIC_API XXH_errorcode2560XXH64_update (XXH64_state_t* state, const void* input, size_t len)2561{2562if (input==NULL) {2563XXH_ASSERT(len == 0);2564return XXH_OK;2565}25662567{ const xxh_u8* p = (const xxh_u8*)input;2568const xxh_u8* const bEnd = p + len;25692570state->total_len += len;25712572if (state->memsize + len < 32) { /* fill in tmp buffer */2573XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);2574state->memsize += (xxh_u32)len;2575return XXH_OK;2576}25772578if (state->memsize) { /* tmp buffer is full */2579XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);2580state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));2581state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));2582state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));2583state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));2584p += 32 - state->memsize;2585state->memsize = 0;2586}25872588if (p+32 <= bEnd) {2589const xxh_u8* const limit = bEnd - 32;25902591do {2592state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;2593state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;2594state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;2595state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;2596} while (p<=limit);25972598}25992600if (p < bEnd) {2601XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));2602state->memsize = (unsigned)(bEnd-p);2603}2604}26052606return XXH_OK;2607}260826092610/*! @ingroup xxh64_family */2611XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)2612{2613xxh_u64 h64;26142615if (state->total_len >= 32) {2616h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);2617h64 = XXH64_mergeRound(h64, state->v[0]);2618h64 = XXH64_mergeRound(h64, state->v[1]);2619h64 = XXH64_mergeRound(h64, state->v[2]);2620h64 = XXH64_mergeRound(h64, state->v[3]);2621} else {2622h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;2623}26242625h64 += (xxh_u64) state->total_len;26262627return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);2628}262926302631/******* Canonical representation *******/26322633/*! @ingroup xxh64_family */2634XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)2635{2636/* XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); */2637if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);2638XXH_memcpy(dst, &hash, sizeof(*dst));2639}26402641/*! @ingroup xxh64_family */2642XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)2643{2644return XXH_readBE64(src);2645}26462647#ifndef XXH_NO_XXH326482649/* *********************************************************************2650* XXH32651* New generation hash designed for speed on small keys and vectorization2652************************************************************************ */2653/*!2654* @}2655* @defgroup xxh3_impl XXH3 implementation2656* @ingroup impl2657* @{2658*/26592660/* === Compiler specifics === */26612662#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. 
Tested with GCC 5.5 */2663# define XXH_RESTRICT /* disable */2664#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */2665# define XXH_RESTRICT restrict2666#else2667/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */2668# define XXH_RESTRICT /* disable */2669#endif26702671#if (defined(__GNUC__) && (__GNUC__ >= 3)) \2672|| (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \2673|| defined(__clang__)2674# define XXH_likely(x) __builtin_expect(x, 1)2675# define XXH_unlikely(x) __builtin_expect(x, 0)2676#else2677# define XXH_likely(x) (x)2678# define XXH_unlikely(x) (x)2679#endif26802681#if defined(__GNUC__) || defined(__clang__)2682# if defined(__ARM_NEON__) || defined(__ARM_NEON) \2683|| defined(__aarch64__) || defined(_M_ARM) \2684|| defined(_M_ARM64) || defined(_M_ARM64EC)2685# define inline __inline__ /* circumvent a clang bug */2686# include <arm_neon.h>2687# undef inline2688# elif defined(__AVX2__)2689# include <immintrin.h>2690# elif defined(__SSE2__)2691# include <emmintrin.h>2692# endif2693#endif26942695#if defined(_MSC_VER)2696# include <intrin.h>2697#endif26982699/*2700* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while2701* remaining a true 64-bit/128-bit hash function.2702*2703* This is done by prioritizing a subset of 64-bit operations that can be2704* emulated without too many steps on the average 32-bit machine.2705*2706* For example, these two lines seem similar, and run equally fast on 64-bit:2707*2708* xxh_u64 x;2709* x ^= (x >> 47); // good2710* x ^= (x >> 13); // bad2711*2712* However, to a 32-bit machine, there is a major difference.2713*2714* x ^= (x >> 47) looks like this:2715*2716* x.lo ^= (x.hi >> (47 - 32));2717*2718* while x ^= (x >> 13) looks like this:2719*2720* // note: funnel shifts are not usually cheap.2721* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));2722* x.hi ^= (x.hi >> 13);2723*2724* The first one is significantly faster than the second, simply because the2725* shift is larger than 32. This means:2726* - All the bits we need are in the upper 32 bits, so we can ignore the lower2727* 32 bits in the shift.2728* - The shift result will always fit in the lower 32 bits, and therefore,2729* we can ignore the upper 32 bits in the xor.2730*2731* Thanks to this optimization, XXH3 only requires these features to be efficient:2732*2733* - Usable unaligned access2734* - A 32-bit or 64-bit ALU2735* - If 32-bit, a decent ADC instruction2736* - A 32 or 64-bit multiply with a 64-bit result2737* - For the 128-bit variant, a decent byteswap helps short inputs.2738*2739* The first two are already required by XXH32, and almost all 32-bit and 64-bit2740* platforms which can run XXH32 can run XXH3 efficiently.2741*2742* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one2743* notable exception.2744*2745* First of all, Thumb-1 lacks support for the UMULL instruction which2746* performs the important long multiply. This means numerous __aeabi_lmul2747* calls.2748*2749* Second of all, the 8 functional registers are just not enough.2750* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need2751* Lo registers, and this shuffling results in thousands more MOVs than A32.2752*2753* A32 and T32 don't have this limitation. 
They can access all 14 registers,2754* do a 32->64 multiply with UMULL, and the flexible operand allowing free2755* shifts is helpful, too.2756*2757* Therefore, we do a quick sanity check.2758*2759* If compiling Thumb-1 for a target which supports ARM instructions, we will2760* emit a warning, as it is not a "sane" platform to compile for.2761*2762* Usually, if this happens, it is because of an accident and you probably need2763* to specify -march, as you likely meant to compile for a newer architecture.2764*2765* Credit: large sections of the vectorial and asm source code paths2766* have been contributed by @easyaspi3142767*/2768#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)2769# warning "XXH3 is highly inefficient without ARM or Thumb-2."2770#endif27712772/* ==========================================2773* Vectorization detection2774* ========================================== */27752776#ifdef XXH_DOXYGEN2777/*!2778* @ingroup tuning2779* @brief Overrides the vectorization implementation chosen for XXH3.2780*2781* Can be defined to 0 to disable SIMD or any of the values mentioned in2782* @ref XXH_VECTOR_TYPE.2783*2784* If this is not defined, it uses predefined macros to determine the best2785* implementation.2786*/2787# define XXH_VECTOR XXH_SCALAR2788/*!2789* @ingroup tuning2790* @brief Possible values for @ref XXH_VECTOR.2791*2792* Note that these are actually implemented as macros.2793*2794* If this is not defined, it is detected automatically.2795* @ref XXH_X86DISPATCH overrides this.2796*/2797enum XXH_VECTOR_TYPE /* fake enum */ {2798XXH_SCALAR = 0, /*!< Portable scalar version */2799XXH_SSE2 = 1, /*!<2800* SSE2 for Pentium 4, Opteron, all x86_64.2801*2802* @note SSE2 is also guaranteed on Windows 10, macOS, and2803* Android x86.2804*/2805XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */2806XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */2807XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */2808XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */2809};2810/*!2811* @ingroup tuning2812* @brief Selects the minimum alignment for XXH3's accumulators.2813*2814* When using SIMD, this should match the alignment reqired for said vector2815* type, so, for example, 32 for AVX2.2816*2817* Default: Auto detected.2818*/2819# define XXH_ACC_ALIGN 82820#endif28212822/* Actual definition */2823#ifndef XXH_DOXYGEN2824# define XXH_SCALAR 02825# define XXH_SSE2 12826# define XXH_AVX2 22827# define XXH_AVX512 32828# define XXH_NEON 42829# define XXH_VSX 52830#endif28312832#ifndef XXH_VECTOR /* can be defined on command line */2833# if ( \2834defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \2835|| defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \2836) && ( \2837defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \2838|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \2839)2840# define XXH_VECTOR XXH_NEON2841# elif defined(__AVX512F__)2842# define XXH_VECTOR XXH_AVX5122843# elif defined(__AVX2__)2844# define XXH_VECTOR XXH_AVX22845# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))2846# define XXH_VECTOR XXH_SSE22847# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \2848|| (defined(__s390x__) && defined(__VEC__)) \2849&& defined(__GNUC__) /* TODO: IBM XL */2850# define XXH_VECTOR XXH_VSX2851# else2852# define XXH_VECTOR XXH_SCALAR2853# endif2854#endif28552856/*2857* Controls the alignment of the accumulator,2858* 
for compatibility with aligned vector loads, which are usually faster.2859*/2860#ifndef XXH_ACC_ALIGN2861# if defined(XXH_X86DISPATCH)2862# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */2863# elif XXH_VECTOR == XXH_SCALAR /* scalar */2864# define XXH_ACC_ALIGN 82865# elif XXH_VECTOR == XXH_SSE2 /* sse2 */2866# define XXH_ACC_ALIGN 162867# elif XXH_VECTOR == XXH_AVX2 /* avx2 */2868# define XXH_ACC_ALIGN 322869# elif XXH_VECTOR == XXH_NEON /* neon */2870# define XXH_ACC_ALIGN 162871# elif XXH_VECTOR == XXH_VSX /* vsx */2872# define XXH_ACC_ALIGN 162873# elif XXH_VECTOR == XXH_AVX512 /* avx512 */2874# define XXH_ACC_ALIGN 642875# endif2876#endif28772878#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \2879|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX5122880# define XXH_SEC_ALIGN XXH_ACC_ALIGN2881#else2882# define XXH_SEC_ALIGN 82883#endif28842885/*2886* UGLY HACK:2887* GCC usually generates the best code with -O3 for xxHash.2888*2889* However, when targeting AVX2, it is overzealous in its unrolling resulting2890* in code roughly 3/4 the speed of Clang.2891*2892* There are other issues, such as GCC splitting _mm256_loadu_si256 into2893* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which2894* only applies to Sandy and Ivy Bridge... which don't even support AVX2.2895*2896* That is why when compiling the AVX2 version, it is recommended to use either2897* -O2 -mavx2 -march=haswell2898* or2899* -O2 -mavx2 -mno-avx256-split-unaligned-load2900* for decent performance, or to use Clang instead.2901*2902* Fortunately, we can control the first one with a pragma that forces GCC into2903* -O2, but the other one we can't control without "failed to inline always2904* inline function due to target mismatch" warnings.2905*/2906#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \2907&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \2908&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */2909# pragma GCC push_options2910# pragma GCC optimize("-O2")2911#endif291229132914#if XXH_VECTOR == XXH_NEON2915/*2916* NEON's setup for vmlal_u32 is a little more complicated than it is on2917* SSE2, AVX2, and VSX.2918*2919* While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.2920*2921* To do the same operation, the 128-bit 'Q' register needs to be split into2922* two 64-bit 'D' registers, performing this operation::2923*2924* [ a | b ]2925* | '---------. .--------' |2926* | x |2927* | .---------' '--------. |2928* [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]2929*2930* Due to significant changes in aarch64, the fastest method for aarch64 is2931* completely different than the fastest method for ARMv7-A.2932*2933* ARMv7-A treats D registers as unions overlaying Q registers, so modifying2934* D11 will modify the high half of Q5. 
This is similar to how modifying AH2935* will only affect bits 8-15 of AX on x86.2936*2937* VZIP takes two registers, and puts even lanes in one register and odd lanes2938* in the other.2939*2940* On ARMv7-A, this strangely modifies both parameters in place instead of2941* taking the usual 3-operand form.2942*2943* Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the2944* lower and upper halves of the Q register to end up with the high and low2945* halves where we want - all in one instruction.2946*2947* vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }2948*2949* Unfortunately we need inline assembly for this: Instructions modifying two2950* registers at once is not possible in GCC or Clang's IR, and they have to2951* create a copy.2952*2953* aarch64 requires a different approach.2954*2955* In order to make it easier to write a decent compiler for aarch64, many2956* quirks were removed, such as conditional execution.2957*2958* NEON was also affected by this.2959*2960* aarch64 cannot access the high bits of a Q-form register, and writes to a2961* D-form register zero the high bits, similar to how writes to W-form scalar2962* registers (or DWORD registers on x86_64) work.2963*2964* The formerly free vget_high intrinsics now require a vext (with a few2965* exceptions)2966*2967* Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent2968* of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one2969* operand.2970*2971* The equivalent of the VZIP.32 on the lower and upper halves would be this2972* mess:2973*2974* ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }2975* zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }2976* zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }2977*2978* Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):2979*2980* shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);2981* xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);2982*2983* This is available on ARMv7-A, but is less efficient than a single VZIP.32.2984*/29852986/*!2987* Function-like macro:2988* void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)2989* {2990* outLo = (uint32x2_t)(in & 0xFFFFFFFF);2991* outHi = (uint32x2_t)(in >> 32);2992* in = UNDEFINED;2993* }2994*/2995# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \2996&& (defined(__GNUC__) || defined(__clang__)) \2997&& (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))2998# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \2999do { \3000/* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \3001/* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \3002/* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \3003__asm__("vzip.32 %e0, %f0" : "+w" (in)); \3004(outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \3005(outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \3006} while (0)3007# else3008# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \3009do { \3010(outLo) = vmovn_u64 (in); \3011(outHi) = vshrn_n_u64 ((in), 32); \3012} while (0)3013# endif30143015/*!3016* @ingroup tuning3017* @brief Controls the NEON to scalar ratio for XXH33018*3019* On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and3020* 2 lanes on scalar by default.3021*3022* This can be set to 2, 4, 6, or 8. 
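 * For example, a build could pass -DXXH3_NEON_LANES=8 to run every lane on NEON
 * (an illustrative override; the defaults selected below are usually best).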
ARMv7 will default to all 8 NEON lanes, as the3023* emulated 64-bit arithmetic is too slow.3024*3025* Modern ARM CPUs are _very_ sensitive to how their pipelines are used.3026*3027* For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't3028* have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,3029* you are only using 2/3 of the CPU bandwidth.3030*3031* This is even more noticable on the more advanced cores like the A76 which3032* can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.3033*3034* Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the3035* remaining lanes will use scalar instructions. This improves the bandwidth3036* and also gives the integer pipelines something to do besides twiddling loop3037* counters and pointers.3038*3039* This change benefits CPUs with large micro-op buffers without negatively affecting3040* other CPUs:3041*3042* | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |3043* |:----------------------|:--------------------|----------:|-----------:|------:|3044* | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |3045* | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |3046* | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |3047*3048* It also seems to fix some bad codegen on GCC, making it almost as fast as clang.3049*3050* @see XXH3_accumulate_512_neon()3051*/3052# ifndef XXH3_NEON_LANES3053# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \3054&& !defined(__OPTIMIZE_SIZE__)3055# define XXH3_NEON_LANES 63056# else3057# define XXH3_NEON_LANES XXH_ACC_NB3058# endif3059# endif3060#endif /* XXH_VECTOR == XXH_NEON */30613062/*3063* VSX and Z Vector helpers.3064*3065* This is very messy, and any pull requests to clean this up are welcome.3066*3067* There are a lot of problems with supporting VSX and s390x, due to3068* inconsistent intrinsics, spotty coverage, and multiple endiannesses.3069*/3070#if XXH_VECTOR == XXH_VSX3071# if defined(__s390x__)3072# include <s390intrin.h>3073# else3074/* gcc's altivec.h can have the unwanted consequence to unconditionally3075* #define bool, vector, and pixel keywords,3076* with bad consequences for programs already using these keywords for other purposes.3077* The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.3078* __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,3079* but it seems that, in some cases, it isn't.3080* Force the build macro to be defined, so that keywords are not altered.3081*/3082# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)3083# define __APPLE_ALTIVEC__3084# endif3085# include <altivec.h>3086# endif30873088typedef __vector unsigned long long xxh_u64x2;3089typedef __vector unsigned char xxh_u8x16;3090typedef __vector unsigned xxh_u32x4;30913092# ifndef XXH_VSX_BE3093# if defined(__BIG_ENDIAN__) \3094|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)3095# define XXH_VSX_BE 13096# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__3097# warning "-maltivec=be is not recommended. 
Please use native endianness."3098# define XXH_VSX_BE 13099# else3100# define XXH_VSX_BE 03101# endif3102# endif /* !defined(XXH_VSX_BE) */31033104# if XXH_VSX_BE3105# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))3106# define XXH_vec_revb vec_revb3107# else3108/*!3109* A polyfill for POWER9's vec_revb().3110*/3111XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)3112{3113xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,31140x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };3115return vec_perm(val, val, vByteSwap);3116}3117# endif3118# endif /* XXH_VSX_BE */31193120/*!3121* Performs an unaligned vector load and byte swaps it on big endian.3122*/3123XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)3124{3125xxh_u64x2 ret;3126XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));3127# if XXH_VSX_BE3128ret = XXH_vec_revb(ret);3129# endif3130return ret;3131}31323133/*3134* vec_mulo and vec_mule are very problematic intrinsics on PowerPC3135*3136* These intrinsics weren't added until GCC 8, despite existing for a while,3137* and they are endian dependent. Also, their meaning swap depending on version.3138* */3139# if defined(__s390x__)3140/* s390x is always big endian, no issue on this platform */3141# define XXH_vec_mulo vec_mulo3142# define XXH_vec_mule vec_mule3143# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)3144/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */3145# define XXH_vec_mulo __builtin_altivec_vmulouw3146# define XXH_vec_mule __builtin_altivec_vmuleuw3147# else3148/* gcc needs inline assembly */3149/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */3150XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)3151{3152xxh_u64x2 result;3153__asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));3154return result;3155}3156XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)3157{3158xxh_u64x2 result;3159__asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));3160return result;3161}3162# endif /* XXH_vec_mulo, XXH_vec_mule */3163#endif /* XXH_VECTOR == XXH_VSX */316431653166/* prefetch3167* can be disabled, by declaring XXH_NO_PREFETCH build macro */3168#if defined(XXH_NO_PREFETCH)3169# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */3170#else3171# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */3172# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */3173# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)3174# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )3175# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)3176# else3177# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */3178# endif3179#endif /* XXH_NO_PREFETCH */318031813182/* ==========================================3183* XXH3 default settings3184* ========================================== */31853186#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */31873188#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)3189# error "default keyset is not large enough"3190#endif31913192/*! Pseudorandom secret taken directly from FARSH. 
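 *  (Descriptive note: XXH_SECRET_DEFAULT_SIZE = 192 bytes, 64-byte aligned below;
 *  it is XXH3's default keying material, used directly or mixed with the seed
 *  when no custom secret is provided.)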
*/3193XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {31940xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,31950xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,31960xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,31970xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,31980x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,31990x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,32000xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,32010x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,32020xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,32030x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,32040x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,32050x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,3206};320732083209#ifdef XXH_OLD_NAMES3210# define kSecret XXH3_kSecret3211#endif32123213#ifdef XXH_DOXYGEN3214/*!3215* @brief Calculates a 32-bit to 64-bit long multiply.3216*3217* Implemented as a macro.3218*3219* Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't3220* need to (but it shouldn't need to anyways, it is about 7 instructions to do3221* a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we3222* use that instead of the normal method.3223*3224* If you are compiling for platforms like Thumb-1 and don't have a better option,3225* you may also want to write your own long multiply routine here.3226*3227* @param x, y Numbers to be multiplied3228* @return 64-bit product of the low 32 bits of @p x and @p y.3229*/3230XXH_FORCE_INLINE xxh_u643231XXH_mult32to64(xxh_u64 x, xxh_u64 y)3232{3233return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);3234}3235#elif defined(_MSC_VER) && defined(_M_IX86)3236# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))3237#else3238/*3239* Downcast + upcast is usually better than masking on older compilers like3240* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.3241*3242* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands3243* and perform a full 64x64 multiply -- entirely redundant on 32-bit.3244*/3245# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))3246#endif32473248/*!3249* @brief Calculates a 64->128-bit long multiply.3250*3251* Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar3252* version.3253*3254* @param lhs , rhs The 64-bit integers to be multiplied3255* @return The 128-bit result represented in an @ref XXH128_hash_t.3256*/3257static XXH128_hash_t3258XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)3259{3260/*3261* GCC/Clang __uint128_t method.3262*3263* On most 64-bit targets, GCC and Clang define a __uint128_t type.3264* This is usually the best way as it usually uses a native long 64-bit3265* multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.3266*3267* Usually.3268*3269* Despite being a 32-bit platform, Clang (and emscripten) define this type3270* despite not having the arithmetic for it. 
This results in a laggy3271* compiler builtin call which calculates a full 128-bit multiply.3272* In that case it is best to use the portable one.3273* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-5155756773274*/3275#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \3276&& defined(__SIZEOF_INT128__) \3277|| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)32783279__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;3280XXH128_hash_t r128;3281r128.low64 = (xxh_u64)(product);3282r128.high64 = (xxh_u64)(product >> 64);3283return r128;32843285/*3286* MSVC for x64's _umul128 method.3287*3288* xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);3289*3290* This compiles to single operand MUL on x64.3291*/3292#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)32933294#ifndef _MSC_VER3295# pragma intrinsic(_umul128)3296#endif3297xxh_u64 product_high;3298xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);3299XXH128_hash_t r128;3300r128.low64 = product_low;3301r128.high64 = product_high;3302return r128;33033304/*3305* MSVC for ARM64's __umulh method.3306*3307* This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.3308*/3309#elif defined(_M_ARM64) || defined(_M_ARM64EC)33103311#ifndef _MSC_VER3312# pragma intrinsic(__umulh)3313#endif3314XXH128_hash_t r128;3315r128.low64 = lhs * rhs;3316r128.high64 = __umulh(lhs, rhs);3317return r128;33183319#else3320/*3321* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.3322*3323* This is a fast and simple grade school multiply, which is shown below3324* with base 10 arithmetic instead of base 0x100000000.3325*3326* 9 3 // D2 lhs = 933327* x 7 5 // D2 rhs = 753328* ----------3329* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 153330* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 453331* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 213332* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 633333* ---------3334* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 273335* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 673336* ---------3337* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 69753338*3339* The reasons for adding the products like this are:3340* 1. It avoids manual carry tracking. Just like how3341* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.3342* This avoids a lot of complexity.3343*3344* 2. It hints for, and on Clang, compiles to, the powerful UMAAL3345* instruction available in ARM's Digital Signal Processing extension3346* in 32-bit ARMv6 and later, which is shown below:3347*3348* void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)3349* {3350* xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;3351* *RdLo = (xxh_u32)(product & 0xFFFFFFFF);3352* *RdHi = (xxh_u32)(product >> 32);3353* }3354*3355* This instruction was designed for efficient long multiplication, and3356* allows this to be calculated in only 4 instructions at speeds3357* comparable to some 64-bit ALUs.3358*3359* 3. It isn't terrible on other platforms. Usually this will be a couple3360* of 32-bit ADD/ADCs.3361*/33623363/* First calculate all of the cross products. */3364xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);3365xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);3366xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);3367xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);33683369/* Now add the products together. These will never overflow. 
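 * As a quick sanity check on that claim: each 32x32 partial product is at most
 * (2^32 - 1)^2 = 2^64 - 2^33 + 1, which leaves 2^33 - 1 of headroom in a 64-bit
 * value, so adding two more 32-bit quantities (each at most 2^32 - 1) cannot wrap:
 *   cross <= (2^64 - 2^33 + 1) + (2^32 - 1) + (2^32 - 1) <= 2^64 - 1
 *   upper <= (2^64 - 2^33 + 1) + (2^32 - 1) + (2^32 - 1) <= 2^64 - 1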
*/3370xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;3371xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;3372xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);33733374XXH128_hash_t r128;3375r128.low64 = lower;3376r128.high64 = upper;3377return r128;3378#endif3379}33803381/*!3382* @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.3383*3384* The reason for the separate function is to prevent passing too many structs3385* around by value. This will hopefully inline the multiply, but we don't force it.3386*3387* @param lhs , rhs The 64-bit integers to multiply3388* @return The low 64 bits of the product XOR'd by the high 64 bits.3389* @see XXH_mult64to128()3390*/3391static xxh_u643392XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)3393{3394XXH128_hash_t product = XXH_mult64to128(lhs, rhs);3395return product.low64 ^ product.high64;3396}33973398/*! Seems to produce slightly better code on GCC for some reason. */3399XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)3400{3401XXH_ASSERT(0 <= shift && shift < 64);3402return v64 ^ (v64 >> shift);3403}34043405/*3406* This is a fast avalanche stage,3407* suitable when input bits are already partially mixed3408*/3409static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)3410{3411h64 = XXH_xorshift64(h64, 37);3412h64 *= 0x165667919E3779F9ULL;3413h64 = XXH_xorshift64(h64, 32);3414return h64;3415}34163417/*3418* This is a stronger avalanche,3419* inspired by Pelle Evensen's rrmxmx3420* preferable when input has not been previously mixed3421*/3422static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)3423{3424/* this mix is inspired by Pelle Evensen's rrmxmx */3425h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);3426h64 *= 0x9FB21C651E98DF25ULL;3427h64 ^= (h64 >> 35) + len ;3428h64 *= 0x9FB21C651E98DF25ULL;3429return XXH_xorshift64(h64, 28);3430}343134323433/* ==========================================3434* Short keys3435* ==========================================3436* One of the shortcomings of XXH32 and XXH64 was that their performance was3437* sub-optimal on short lengths. It used an iterative algorithm which strongly3438* favored lengths that were a multiple of 4 or 8.3439*3440* Instead of iterating over individual inputs, we use a set of single shot3441* functions which piece together a range of lengths and operate in constant time.3442*3443* Additionally, the number of multiplies has been significantly reduced. This3444* reduces latency, especially when emulating 64-bit multiplies on 32-bit.3445*3446* Depending on the platform, this may or may not be faster than XXH32, but it3447* is almost guaranteed to be faster than XXH64.3448*/34493450/*3451* At very short lengths, there isn't enough input to fully hide secrets, or use3452* the entire secret.3453*3454* There is also only a limited amount of mixing we can do before significantly3455* impacting performance.3456*3457* Therefore, we use different sections of the secret and always mix two secret3458* samples with an XOR. 
This should have no effect on performance on the3459* seedless or withSeed variants because everything _should_ be constant folded3460* by modern compilers.3461*3462* The XOR mixing hides individual parts of the secret and increases entropy.3463*3464* This adds an extra layer of strength for custom secrets.3465*/3466XXH_FORCE_INLINE XXH64_hash_t3467XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3468{3469XXH_ASSERT(input != NULL);3470XXH_ASSERT(1 <= len && len <= 3);3471XXH_ASSERT(secret != NULL);3472/*3473* len = 1: combined = { input[0], 0x01, input[0], input[0] }3474* len = 2: combined = { input[1], 0x02, input[0], input[1] }3475* len = 3: combined = { input[2], 0x03, input[0], input[1] }3476*/3477{ xxh_u8 const c1 = input[0];3478xxh_u8 const c2 = input[len >> 1];3479xxh_u8 const c3 = input[len - 1];3480xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)3481| ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);3482xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;3483xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;3484return XXH64_avalanche(keyed);3485}3486}34873488XXH_FORCE_INLINE XXH64_hash_t3489XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3490{3491XXH_ASSERT(input != NULL);3492XXH_ASSERT(secret != NULL);3493XXH_ASSERT(4 <= len && len <= 8);3494seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;3495{ xxh_u32 const input1 = XXH_readLE32(input);3496xxh_u32 const input2 = XXH_readLE32(input + len - 4);3497xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;3498xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);3499xxh_u64 const keyed = input64 ^ bitflip;3500return XXH3_rrmxmx(keyed, len);3501}3502}35033504XXH_FORCE_INLINE XXH64_hash_t3505XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3506{3507XXH_ASSERT(input != NULL);3508XXH_ASSERT(secret != NULL);3509XXH_ASSERT(9 <= len && len <= 16);3510{ xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;3511xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;3512xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;3513xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;3514xxh_u64 const acc = len3515+ XXH_swap64(input_lo) + input_hi3516+ XXH3_mul128_fold64(input_lo, input_hi);3517return XXH3_avalanche(acc);3518}3519}35203521XXH_FORCE_INLINE XXH64_hash_t3522XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)3523{3524XXH_ASSERT(len <= 16);3525{ if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);3526if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);3527if (len) return XXH3_len_1to3_64b(input, len, secret, seed);3528return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));3529}3530}35313532/*3533* DISCLAIMER: There are known *seed-dependent* multicollisions here due to3534* multiplication by zero, affecting hashes of lengths 17 to 240.3535*3536* However, they are very unlikely.3537*3538* Keep this in mind when using the unseeded XXH3_64bits() variant: As with all3539* unseeded non-cryptographic hashes, it does not attempt to defend itself3540* against specially crafted inputs, only random inputs.3541*3542* Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes3543* cancelling out the secret is taken an arbitrary number of times 
(addressed
 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
 * and/or proper seeding:
 *
 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
 * function that is only called up to 16 times per hash with up to 240 bytes of
 * input.
 *
 * This is not too bad for a non-cryptographic hash function, especially with
 * only 64 bit outputs.
 *
 * The 128-bit variant (which trades some speed for strength) is NOT affected
 * by this, although it is always a good idea to use a proper seed if you care
 * about strength.
 */
XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
    /*
     * UGLY HACK:
     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
     * slower code.
     *
     * By forcing seed64 into a register, we disrupt the cost model and
     * cause it to scalarize. See `XXH32_round()`.
     *
     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
     * GCC 9.2, despite both emitting scalar code.
     *
     * GCC generates much better scalar code than Clang for the rest of XXH3,
     * which is why finding a more optimal codepath here remains of interest.
     */
    XXH_COMPILER_GUARD(seed64);
#endif
    {   xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64 const input_hi = XXH_readLE64(input+8);
        return XXH3_mul128_fold64(
            input_lo ^ (XXH_readLE64(secret)   + seed64),
            input_hi ^ (XXH_readLE64(secret+8) - seed64)
        );
    }
}

/* For mid-range keys, XXH3 uses a Mum-hash variant. 
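 * Concretely, each 16-byte lane of input is folded against 16 bytes of secret
 * (see XXH3_mix16B() above); in pseudo-code:
 *
 *   acc += fold64( (input_lo ^ (secret_lo + seed)) * (input_hi ^ (secret_hi - seed)) )
 *
 * where fold64() XORs the high and low 64-bit halves of the 128-bit product
 * (XXH3_mul128_fold64), and the lanes are summed into a single accumulator
 * before a final XXH3_avalanche().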
*/3591XXH_FORCE_INLINE XXH64_hash_t3592XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,3593const xxh_u8* XXH_RESTRICT secret, size_t secretSize,3594XXH64_hash_t seed)3595{3596XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;3597XXH_ASSERT(16 < len && len <= 128);35983599{ xxh_u64 acc = len * XXH_PRIME64_1;3600if (len > 32) {3601if (len > 64) {3602if (len > 96) {3603acc += XXH3_mix16B(input+48, secret+96, seed);3604acc += XXH3_mix16B(input+len-64, secret+112, seed);3605}3606acc += XXH3_mix16B(input+32, secret+64, seed);3607acc += XXH3_mix16B(input+len-48, secret+80, seed);3608}3609acc += XXH3_mix16B(input+16, secret+32, seed);3610acc += XXH3_mix16B(input+len-32, secret+48, seed);3611}3612acc += XXH3_mix16B(input+0, secret+0, seed);3613acc += XXH3_mix16B(input+len-16, secret+16, seed);36143615return XXH3_avalanche(acc);3616}3617}36183619#define XXH3_MIDSIZE_MAX 24036203621XXH_NO_INLINE XXH64_hash_t3622XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,3623const xxh_u8* XXH_RESTRICT secret, size_t secretSize,3624XXH64_hash_t seed)3625{3626XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;3627XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);36283629#define XXH3_MIDSIZE_STARTOFFSET 33630#define XXH3_MIDSIZE_LASTOFFSET 1736313632{ xxh_u64 acc = len * XXH_PRIME64_1;3633int const nbRounds = (int)len / 16;3634int i;3635for (i=0; i<8; i++) {3636acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);3637}3638acc = XXH3_avalanche(acc);3639XXH_ASSERT(nbRounds >= 8);3640#if defined(__clang__) /* Clang */ \3641&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \3642&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */3643/*3644* UGLY HACK:3645* Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.3646* In everywhere else, it uses scalar code.3647*3648* For 64->128-bit multiplies, even if the NEON was 100% optimal, it3649* would still be slower than UMAAL (see XXH_mult64to128).3650*3651* Unfortunately, Clang doesn't handle the long multiplies properly and3652* converts them to the nonexistent "vmulq_u64" intrinsic, which is then3653* scalarized into an ugly mess of VMOV.32 instructions.3654*3655* This mess is difficult to avoid without turning autovectorization3656* off completely, but they are usually relatively minor and/or not3657* worth it to fix.3658*3659* This loop is the easiest to fix, as unlike XXH32, this pragma3660* _actually works_ because it is a loop vectorization instead of an3661* SLP vectorization.3662*/3663#pragma clang loop vectorize(disable)3664#endif3665for (i=8 ; i < nbRounds; i++) {3666acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);3667}3668/* last bytes */3669acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);3670return XXH3_avalanche(acc);3671}3672}367336743675/* ======= Long Keys ======= */36763677#define XXH_STRIPE_LEN 643678#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */3679#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))36803681#ifdef XXH_OLD_NAMES3682# define STRIPE_LEN XXH_STRIPE_LEN3683# define ACC_NB XXH_ACC_NB3684#endif36853686XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)3687{3688if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);3689XXH_memcpy(dst, &v64, sizeof(v64));3690}36913692/* Several intrinsic functions below are supposed to accept __int64 as argument,3693* as documented in 
https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .3694* However, several environments do not define __int64 type,3695* requiring a workaround.3696*/3697#if !defined (__VMS) \3698&& (defined (__cplusplus) \3699|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )3700typedef int64_t xxh_i64;3701#else3702/* the following type must have a width of 64-bit */3703typedef long long xxh_i64;3704#endif370537063707/*3708* XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.3709*3710* It is a hardened version of UMAC, based off of FARSH's implementation.3711*3712* This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD3713* implementations, and it is ridiculously fast.3714*3715* We harden it by mixing the original input to the accumulators as well as the product.3716*3717* This means that in the (relatively likely) case of a multiply by zero, the3718* original input is preserved.3719*3720* On 128-bit inputs, we swap 64-bit pairs when we add the input to improve3721* cross-pollination, as otherwise the upper and lower halves would be3722* essentially independent.3723*3724* This doesn't matter on 64-bit hashes since they all get merged together in3725* the end, so we skip the extra step.3726*3727* Both XXH3_64bits and XXH3_128bits use this subroutine.3728*/37293730#if (XXH_VECTOR == XXH_AVX512) \3731|| (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)37323733#ifndef XXH_TARGET_AVX5123734# define XXH_TARGET_AVX512 /* disable attribute target */3735#endif37363737XXH_FORCE_INLINE XXH_TARGET_AVX512 void3738XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,3739const void* XXH_RESTRICT input,3740const void* XXH_RESTRICT secret)3741{3742__m512i* const xacc = (__m512i *) acc;3743XXH_ASSERT((((size_t)acc) & 63) == 0);3744XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));37453746{3747/* data_vec = input[0]; */3748__m512i const data_vec = _mm512_loadu_si512 (input);3749/* key_vec = secret[0]; */3750__m512i const key_vec = _mm512_loadu_si512 (secret);3751/* data_key = data_vec ^ key_vec; */3752__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);3753/* data_key_lo = data_key >> 32; */3754__m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));3755/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3756__m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);3757/* xacc[0] += swap(data_vec); */3758__m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));3759__m512i const sum = _mm512_add_epi64(*xacc, data_swap);3760/* xacc[0] += product; */3761*xacc = _mm512_add_epi64(product, sum);3762}3763}37643765/*3766* XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.3767*3768* Multiplication isn't perfect, as explained by Google in HighwayHash:3769*3770* // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to3771* // varying degrees. 
In descending order of goodness, bytes3772* // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.3773* // As expected, the upper and lower bytes are much worse.3774*3775* Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L2913776*3777* Since our algorithm uses a pseudorandom secret to add some variance into the3778* mix, we don't need to (or want to) mix as often or as much as HighwayHash does.3779*3780* This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid3781* extraction.3782*3783* Both XXH3_64bits and XXH3_128bits use this subroutine.3784*/37853786XXH_FORCE_INLINE XXH_TARGET_AVX512 void3787XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3788{3789XXH_ASSERT((((size_t)acc) & 63) == 0);3790XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));3791{ __m512i* const xacc = (__m512i*) acc;3792const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);37933794/* xacc[0] ^= (xacc[0] >> 47) */3795__m512i const acc_vec = *xacc;3796__m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);3797__m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);3798/* xacc[0] ^= secret; */3799__m512i const key_vec = _mm512_loadu_si512 (secret);3800__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);38013802/* xacc[0] *= XXH_PRIME32_1; */3803__m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));3804__m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);3805__m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);3806*xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));3807}3808}38093810XXH_FORCE_INLINE XXH_TARGET_AVX512 void3811XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3812{3813XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);3814XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);3815XXH_ASSERT(((size_t)customSecret & 63) == 0);3816(void)(&XXH_writeLE64);3817{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);3818__m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));38193820const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);3821__m512i* const dest = ( __m512i*) customSecret;3822int i;3823XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */3824XXH_ASSERT(((size_t)dest & 63) == 0);3825for (i=0; i < nbRounds; ++i) {3826/* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',3827* this will warn "discards 'const' qualifier". */3828union {3829const __m512i* cp;3830void* p;3831} remote_const_void;3832remote_const_void.cp = src + i;3833dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);3834} }3835}38363837#endif38383839#if (XXH_VECTOR == XXH_AVX2) \3840|| (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)38413842#ifndef XXH_TARGET_AVX23843# define XXH_TARGET_AVX2 /* disable attribute target */3844#endif38453846XXH_FORCE_INLINE XXH_TARGET_AVX2 void3847XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,3848const void* XXH_RESTRICT input,3849const void* XXH_RESTRICT secret)3850{3851XXH_ASSERT((((size_t)acc) & 31) == 0);3852{ __m256i* const xacc = (__m256i *) acc;3853/* Unaligned. This is mainly for pointer arithmetic, and because3854* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3855const __m256i* const xinput = (const __m256i *) input;3856/* Unaligned. 
This is mainly for pointer arithmetic, and because3857* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3858const __m256i* const xsecret = (const __m256i *) secret;38593860size_t i;3861for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {3862/* data_vec = xinput[i]; */3863__m256i const data_vec = _mm256_loadu_si256 (xinput+i);3864/* key_vec = xsecret[i]; */3865__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);3866/* data_key = data_vec ^ key_vec; */3867__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);3868/* data_key_lo = data_key >> 32; */3869__m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3870/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3871__m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);3872/* xacc[i] += swap(data_vec); */3873__m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));3874__m256i const sum = _mm256_add_epi64(xacc[i], data_swap);3875/* xacc[i] += product; */3876xacc[i] = _mm256_add_epi64(product, sum);3877} }3878}38793880XXH_FORCE_INLINE XXH_TARGET_AVX2 void3881XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3882{3883XXH_ASSERT((((size_t)acc) & 31) == 0);3884{ __m256i* const xacc = (__m256i*) acc;3885/* Unaligned. This is mainly for pointer arithmetic, and because3886* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */3887const __m256i* const xsecret = (const __m256i *) secret;3888const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);38893890size_t i;3891for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {3892/* xacc[i] ^= (xacc[i] >> 47) */3893__m256i const acc_vec = xacc[i];3894__m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);3895__m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);3896/* xacc[i] ^= xsecret; */3897__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);3898__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);38993900/* xacc[i] *= XXH_PRIME32_1; */3901__m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3902__m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);3903__m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);3904xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));3905}3906}3907}39083909XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)3910{3911XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);3912XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);3913XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);3914(void)(&XXH_writeLE64);3915XXH_PREFETCH(customSecret);3916{ __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);39173918const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret);3919__m256i* dest = ( __m256i*) customSecret;39203921# if defined(__GNUC__) || defined(__clang__)3922/*3923* On GCC & Clang, marking 'dest' as modified will cause the compiler:3924* - do not extract the secret from sse registers in the internal loop3925* - use less common registers, and avoid pushing these reg into stack3926*/3927XXH_COMPILER_GUARD(dest);3928# endif3929XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */3930XXH_ASSERT(((size_t)dest & 31) == 0);39313932/* GCC -O2 need unroll loop manually */3933dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);3934dest[1] = 
_mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);3935dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);3936dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);3937dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);3938dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);3939}3940}39413942#endif39433944/* x86dispatch always generates SSE2 */3945#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)39463947#ifndef XXH_TARGET_SSE23948# define XXH_TARGET_SSE2 /* disable attribute target */3949#endif39503951XXH_FORCE_INLINE XXH_TARGET_SSE2 void3952XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,3953const void* XXH_RESTRICT input,3954const void* XXH_RESTRICT secret)3955{3956/* SSE2 is just a half-scale version of the AVX2 version. */3957XXH_ASSERT((((size_t)acc) & 15) == 0);3958{ __m128i* const xacc = (__m128i *) acc;3959/* Unaligned. This is mainly for pointer arithmetic, and because3960* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */3961const __m128i* const xinput = (const __m128i *) input;3962/* Unaligned. This is mainly for pointer arithmetic, and because3963* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */3964const __m128i* const xsecret = (const __m128i *) secret;39653966size_t i;3967for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {3968/* data_vec = xinput[i]; */3969__m128i const data_vec = _mm_loadu_si128 (xinput+i);3970/* key_vec = xsecret[i]; */3971__m128i const key_vec = _mm_loadu_si128 (xsecret+i);3972/* data_key = data_vec ^ key_vec; */3973__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);3974/* data_key_lo = data_key >> 32; */3975__m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));3976/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */3977__m128i const product = _mm_mul_epu32 (data_key, data_key_lo);3978/* xacc[i] += swap(data_vec); */3979__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));3980__m128i const sum = _mm_add_epi64(xacc[i], data_swap);3981/* xacc[i] += product; */3982xacc[i] = _mm_add_epi64(product, sum);3983} }3984}39853986XXH_FORCE_INLINE XXH_TARGET_SSE2 void3987XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)3988{3989XXH_ASSERT((((size_t)acc) & 15) == 0);3990{ __m128i* const xacc = (__m128i*) acc;3991/* Unaligned. This is mainly for pointer arithmetic, and because3992* _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/3993const __m128i* const xsecret = (const __m128i *) secret;3994const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);39953996size_t i;3997for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {3998/* xacc[i] ^= (xacc[i] >> 47) */3999__m128i const acc_vec = xacc[i];4000__m128i const shifted = _mm_srli_epi64 (acc_vec, 47);4001__m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);4002/* xacc[i] ^= xsecret[i]; */4003__m128i const key_vec = _mm_loadu_si128 (xsecret+i);4004__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);40054006/* xacc[i] *= XXH_PRIME32_1; */4007__m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));4008__m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);4009__m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);4010xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));4011}4012}4013}40144015XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)4016{4017XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);4018(void)(&XXH_writeLE64);4019{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);40204021# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 19004022/* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */4023XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };4024__m128i const seed = _mm_load_si128((__m128i const*)seed64x2);4025# else4026__m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);4027# endif4028int i;40294030const void* const src16 = XXH3_kSecret;4031__m128i* dst16 = (__m128i*) customSecret;4032# if defined(__GNUC__) || defined(__clang__)4033/*4034* On GCC & Clang, marking 'dest' as modified will cause the compiler:4035* - do not extract the secret from sse registers in the internal loop4036* - use less common registers, and avoid pushing these reg into stack4037*/4038XXH_COMPILER_GUARD(dst16);4039# endif4040XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */4041XXH_ASSERT(((size_t)dst16 & 15) == 0);40424043for (i=0; i < nbRounds; ++i) {4044dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);4045} }4046}40474048#endif40494050#if (XXH_VECTOR == XXH_NEON)40514052/* forward declarations for the scalar routines */4053XXH_FORCE_INLINE void4054XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,4055void const* XXH_RESTRICT secret, size_t lane);40564057XXH_FORCE_INLINE void4058XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,4059void const* XXH_RESTRICT secret, size_t lane);40604061/*!4062* @internal4063* @brief The bulk processing loop for NEON.4064*4065* The NEON code path is actually partially scalar when running on AArch64. This4066* is to optimize the pipelining and can have up to 15% speedup depending on the4067* CPU, and it also mitigates some GCC codegen issues.4068*4069* @see XXH3_NEON_LANES for configuring this and details about this optimization.4070*/4071XXH_FORCE_INLINE void4072XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,4073const void* XXH_RESTRICT input,4074const void* XXH_RESTRICT secret)4075{4076XXH_ASSERT((((size_t)acc) & 15) == 0);4077XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);4078{4079uint64x2_t* const xacc = (uint64x2_t *) acc;4080/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
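 * The loads below therefore go through plain byte pointers, e.g.
 *   uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
 * which makes no alignment promise to the compiler.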
*/4081uint8_t const* const xinput = (const uint8_t *) input;4082uint8_t const* const xsecret = (const uint8_t *) secret;40834084size_t i;4085/* NEON for the first few lanes (these loops are normally interleaved) */4086for (i=0; i < XXH3_NEON_LANES / 2; i++) {4087/* data_vec = xinput[i]; */4088uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));4089/* key_vec = xsecret[i]; */4090uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));4091uint64x2_t data_key;4092uint32x2_t data_key_lo, data_key_hi;4093/* xacc[i] += swap(data_vec); */4094uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);4095uint64x2_t const swapped = vextq_u64(data64, data64, 1);4096xacc[i] = vaddq_u64 (xacc[i], swapped);4097/* data_key = data_vec ^ key_vec; */4098data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));4099/* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);4100* data_key_hi = (uint32x2_t) (data_key >> 32);4101* data_key = UNDEFINED; */4102XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);4103/* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */4104xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);41054106}4107/* Scalar for the remainder. This may be a zero iteration loop. */4108for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {4109XXH3_scalarRound(acc, input, secret, i);4110}4111}4112}41134114XXH_FORCE_INLINE void4115XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4116{4117XXH_ASSERT((((size_t)acc) & 15) == 0);41184119{ uint64x2_t* xacc = (uint64x2_t*) acc;4120uint8_t const* xsecret = (uint8_t const*) secret;4121uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);41224123size_t i;4124/* NEON for the first few lanes (these loops are normally interleaved) */4125for (i=0; i < XXH3_NEON_LANES / 2; i++) {4126/* xacc[i] ^= (xacc[i] >> 47); */4127uint64x2_t acc_vec = xacc[i];4128uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);4129uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);41304131/* xacc[i] ^= xsecret[i]; */4132uint8x16_t key_vec = vld1q_u8 (xsecret + (i * 16));4133uint64x2_t data_key = veorq_u64 (data_vec, vreinterpretq_u64_u8(key_vec));41344135/* xacc[i] *= XXH_PRIME32_1 */4136uint32x2_t data_key_lo, data_key_hi;4137/* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);4138* data_key_hi = (uint32x2_t) (xacc[i] >> 32);4139* xacc[i] = UNDEFINED; */4140XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);4141{ /*4142* prod_hi = (data_key >> 32) * XXH_PRIME32_1;4143*4144* Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will4145* incorrectly "optimize" this:4146* tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));4147* shifted = vshll_n_u32(tmp, 32);4148* to this:4149* tmp = "vmulq_u64"(a, b); // no such thing!4150* shifted = vshlq_n_u64(tmp, 32);4151*4152* However, unlike SSE, Clang lacks a 64-bit multiply routine4153* for NEON, and it scalarizes two 64-bit multiplies instead.4154*4155* vmull_u32 has the same timing as vmul_u32, and it avoids4156* this bug completely.4157* See https://bugs.llvm.org/show_bug.cgi?id=399674158*/4159uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);4160/* xacc[i] = prod_hi << 32; */4161xacc[i] = vshlq_n_u64(prod_hi, 32);4162/* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */4163xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);4164}4165}4166/* Scalar for the remainder. This may be a zero iteration loop. 
*/4167for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {4168XXH3_scalarScrambleRound(acc, secret, i);4169}4170}4171}41724173#endif41744175#if (XXH_VECTOR == XXH_VSX)41764177XXH_FORCE_INLINE void4178XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,4179const void* XXH_RESTRICT input,4180const void* XXH_RESTRICT secret)4181{4182/* presumed aligned */4183unsigned int* const xacc = (unsigned int*) acc;4184xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */4185xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */4186xxh_u64x2 const v32 = { 32, 32 };4187size_t i;4188for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {4189/* data_vec = xinput[i]; */4190xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);4191/* key_vec = xsecret[i]; */4192xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);4193xxh_u64x2 const data_key = data_vec ^ key_vec;4194/* shuffled = (data_key << 32) | (data_key >> 32); */4195xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);4196/* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */4197xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);4198/* acc_vec = xacc[i]; */4199xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);4200acc_vec += product;42014202/* swap high and low halves */4203#ifdef __s390x__4204acc_vec += vec_permi(data_vec, data_vec, 2);4205#else4206acc_vec += vec_xxpermdi(data_vec, data_vec, 2);4207#endif4208/* xacc[i] = acc_vec; */4209vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);4210}4211}42124213XXH_FORCE_INLINE void4214XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4215{4216XXH_ASSERT((((size_t)acc) & 15) == 0);42174218{ xxh_u64x2* const xacc = (xxh_u64x2*) acc;4219const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;4220/* constants */4221xxh_u64x2 const v32 = { 32, 32 };4222xxh_u64x2 const v47 = { 47, 47 };4223xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };4224size_t i;4225for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {4226/* xacc[i] ^= (xacc[i] >> 47); */4227xxh_u64x2 const acc_vec = xacc[i];4228xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);42294230/* xacc[i] ^= xsecret[i]; */4231xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);4232xxh_u64x2 const data_key = data_vec ^ key_vec;42334234/* xacc[i] *= XXH_PRIME32_1 */4235/* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */4236xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);4237/* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */4238xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);4239xacc[i] = prod_odd + (prod_even << v32);4240} }4241}42424243#endif42444245/* scalar variants - universal */42464247/*!4248* @internal4249* @brief Scalar round for @ref XXH3_accumulate_512_scalar().4250*4251* This is extracted to its own function because the NEON path uses a combination4252* of NEON and scalar.4253*/4254XXH_FORCE_INLINE void4255XXH3_scalarRound(void* XXH_RESTRICT acc,4256void const* XXH_RESTRICT input,4257void const* XXH_RESTRICT secret,4258size_t lane)4259{4260xxh_u64* xacc = (xxh_u64*) acc;4261xxh_u8 const* xinput = (xxh_u8 const*) input;4262xxh_u8 const* xsecret = (xxh_u8 const*) secret;4263XXH_ASSERT(lane < XXH_ACC_NB);4264XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);4265{4266xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);4267xxh_u64 const data_key = data_val ^ 
XXH_readLE64(xsecret + lane * 8);4268xacc[lane ^ 1] += data_val; /* swap adjacent lanes */4269xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);4270}4271}42724273/*!4274* @internal4275* @brief Processes a 64 byte block of data using the scalar path.4276*/4277XXH_FORCE_INLINE void4278XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,4279const void* XXH_RESTRICT input,4280const void* XXH_RESTRICT secret)4281{4282size_t i;4283for (i=0; i < XXH_ACC_NB; i++) {4284XXH3_scalarRound(acc, input, secret, i);4285}4286}42874288/*!4289* @internal4290* @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().4291*4292* This is extracted to its own function because the NEON path uses a combination4293* of NEON and scalar.4294*/4295XXH_FORCE_INLINE void4296XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,4297void const* XXH_RESTRICT secret,4298size_t lane)4299{4300xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */4301const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */4302XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);4303XXH_ASSERT(lane < XXH_ACC_NB);4304{4305xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);4306xxh_u64 acc64 = xacc[lane];4307acc64 = XXH_xorshift64(acc64, 47);4308acc64 ^= key64;4309acc64 *= XXH_PRIME32_1;4310xacc[lane] = acc64;4311}4312}43134314/*!4315* @internal4316* @brief Scrambles the accumulators after a large chunk has been read4317*/4318XXH_FORCE_INLINE void4319XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4320{4321size_t i;4322for (i=0; i < XXH_ACC_NB; i++) {4323XXH3_scalarScrambleRound(acc, secret, i);4324}4325}43264327XXH_FORCE_INLINE void4328XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)4329{4330/*4331* We need a separate pointer for the hack below,4332* which requires a non-const pointer.4333* Any decent compiler will optimize this out otherwise.4334*/4335const xxh_u8* kSecretPtr = XXH3_kSecret;4336XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);43374338#if defined(__clang__) && defined(__aarch64__)4339/*4340* UGLY HACK:4341* Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are4342* placed sequentially, in order, at the top of the unrolled loop.4343*4344* While MOVK is great for generating constants (2 cycles for a 64-bit4345* constant compared to 4 cycles for LDR), it fights for bandwidth with4346* the arithmetic instructions.4347*4348* I L S4349* MOVK4350* MOVK4351* MOVK4352* MOVK4353* ADD4354* SUB STR4355* STR4356* By forcing loads from memory (as the asm line causes Clang to assume4357* that XXH3_kSecretPtr has been changed), the pipelines are used more4358* efficiently:4359* I L S4360* LDR4361* ADD LDR4362* SUB STR4363* STR4364*4365* See XXH3_NEON_LANES for details on the pipsline.4366*4367* XXH3_64bits_withSeed, len == 256, Snapdragon 8354368* without hack: 2654.4 MB/s4369* with hack: 3202.9 MB/s4370*/4371XXH_COMPILER_GUARD(kSecretPtr);4372#endif4373/*4374* Note: in debug mode, this overrides the asm optimization4375* and Clang will emit MOVK chains again.4376*/4377XXH_ASSERT(kSecretPtr == XXH3_kSecret);43784379{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;4380int i;4381for (i=0; i < nbRounds; i++) {4382/*4383* The asm hack causes Clang to assume that kSecretPtr aliases with4384* customSecret, and on aarch64, this prevented LDP from merging two4385* loads together for free. 
Putting the loads together before the stores4386* properly generates LDP.4387*/4388xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;4389xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;4390XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo);4391XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);4392} }4393}439443954396typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);4397typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);4398typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);439944004401#if (XXH_VECTOR == XXH_AVX512)44024403#define XXH3_accumulate_512 XXH3_accumulate_512_avx5124404#define XXH3_scrambleAcc XXH3_scrambleAcc_avx5124405#define XXH3_initCustomSecret XXH3_initCustomSecret_avx51244064407#elif (XXH_VECTOR == XXH_AVX2)44084409#define XXH3_accumulate_512 XXH3_accumulate_512_avx24410#define XXH3_scrambleAcc XXH3_scrambleAcc_avx24411#define XXH3_initCustomSecret XXH3_initCustomSecret_avx244124413#elif (XXH_VECTOR == XXH_SSE2)44144415#define XXH3_accumulate_512 XXH3_accumulate_512_sse24416#define XXH3_scrambleAcc XXH3_scrambleAcc_sse24417#define XXH3_initCustomSecret XXH3_initCustomSecret_sse244184419#elif (XXH_VECTOR == XXH_NEON)44204421#define XXH3_accumulate_512 XXH3_accumulate_512_neon4422#define XXH3_scrambleAcc XXH3_scrambleAcc_neon4423#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar44244425#elif (XXH_VECTOR == XXH_VSX)44264427#define XXH3_accumulate_512 XXH3_accumulate_512_vsx4428#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx4429#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar44304431#else /* scalar */44324433#define XXH3_accumulate_512 XXH3_accumulate_512_scalar4434#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar4435#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar44364437#endif4438443944404441#ifndef XXH_PREFETCH_DIST4442# ifdef __clang__4443# define XXH_PREFETCH_DIST 3204444# else4445# if (XXH_VECTOR == XXH_AVX512)4446# define XXH_PREFETCH_DIST 5124447# else4448# define XXH_PREFETCH_DIST 3844449# endif4450# endif /* __clang__ */4451#endif /* XXH_PREFETCH_DIST */44524453/*4454* XXH3_accumulate()4455* Loops over XXH3_accumulate_512().4456* Assumption: nbStripes will not overflow the secret size4457*/4458XXH_FORCE_INLINE void4459XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,4460const xxh_u8* XXH_RESTRICT input,4461const xxh_u8* XXH_RESTRICT secret,4462size_t nbStripes,4463XXH3_f_accumulate_512 f_acc512)4464{4465size_t n;4466for (n = 0; n < nbStripes; n++ ) {4467const xxh_u8* const in = input + n*XXH_STRIPE_LEN;4468XXH_PREFETCH(in + XXH_PREFETCH_DIST);4469f_acc512(acc,4470in,4471secret + n*XXH_SECRET_CONSUME_RATE);4472}4473}44744475XXH_FORCE_INLINE void4476XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,4477const xxh_u8* XXH_RESTRICT input, size_t len,4478const xxh_u8* XXH_RESTRICT secret, size_t secretSize,4479XXH3_f_accumulate_512 f_acc512,4480XXH3_f_scrambleAcc f_scramble)4481{4482size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;4483size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;4484size_t const nb_blocks = (len - 1) / block_len;44854486size_t n;44874488XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);44894490for (n = 0; n < nb_blocks; n++) {4491XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);4492f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);4493}44944495/* last partial block */4496XXH_ASSERT(len > XXH_STRIPE_LEN);4497{ size_t const nbStripes = ((len - 1) - (block_len * 
nb_blocks)) / XXH_STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);

        /* last stripe */
        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
            f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
    }   }
}

XXH_FORCE_INLINE xxh_u64
XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
{
    return XXH3_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

static XXH64_hash_t
XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;
    size_t i = 0;

    for (i = 0; i < 4; i++) {
        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
         * XXH3_64bits, len == 256, Snapdragon 835:
         *   without hack: 2063.7 MB/s
         *   with hack:    2560.7 MB/s
         */
        XXH_COMPILER_GUARD(result64);
#endif
    }

    return XXH3_avalanche(result64);
}

#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }

XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
                           const void* XXH_RESTRICT secret, size_t secretSize,
                           XXH3_f_accumulate_512 f_acc512,
                           XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    /* do not align on 8, so that the secret is different from the accumulator */
#define XXH_SECRET_MERGEACCS_START 11
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
}

/*
 * It's important for performance to transmit the secret's size (when it's static),
 * so that the compiler can properly optimize the vectorized loop.
 * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
 */
XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*
 * It's preferable for performance that XXH3_hashLong is not inlined,
 * as it results in a smaller function for small data, which is easier on the instruction cache.
 * Note that inside this no_inline function, we do inline the internal loop,
 * and provide a statically defined secret size to allow optimization of the vector loop.
 */
XXH_NO_INLINE XXH64_hash_t
XXH3_hashLong_64b_default(const void* 
XXH_RESTRICT input, size_t len,4585XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)4586{4587(void)seed64; (void)secret; (void)secretLen;4588return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);4589}45904591/*4592* XXH3_hashLong_64b_withSeed():4593* Generate a custom key based on alteration of default XXH3_kSecret with the seed,4594* and then use this key for long mode hashing.4595*4596* This operation is decently fast but nonetheless costs a little bit of time.4597* Try to avoid it whenever possible (typically when seed==0).4598*4599* It's important for performance that XXH3_hashLong is not inlined. Not sure4600* why (uop cache maybe?), but the difference is large and easily measurable.4601*/4602XXH_FORCE_INLINE XXH64_hash_t4603XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,4604XXH64_hash_t seed,4605XXH3_f_accumulate_512 f_acc512,4606XXH3_f_scrambleAcc f_scramble,4607XXH3_f_initCustomSecret f_initSec)4608{4609if (seed == 0)4610return XXH3_hashLong_64b_internal(input, len,4611XXH3_kSecret, sizeof(XXH3_kSecret),4612f_acc512, f_scramble);4613{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];4614f_initSec(secret, seed);4615return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),4616f_acc512, f_scramble);4617}4618}46194620/*4621* It's important for performance that XXH3_hashLong is not inlined.4622*/4623XXH_NO_INLINE XXH64_hash_t4624XXH3_hashLong_64b_withSeed(const void* input, size_t len,4625XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)4626{4627(void)secret; (void)secretLen;4628return XXH3_hashLong_64b_withSeed_internal(input, len, seed,4629XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);4630}463146324633typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,4634XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);46354636XXH_FORCE_INLINE XXH64_hash_t4637XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,4638XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,4639XXH3_hashLong64_f f_hashLong)4640{4641XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);4642/*4643* If an action is to be taken if `secretLen` condition is not respected,4644* it should be done here.4645* For now, it's a contract pre-condition.4646* Adding a check and a branch here would cost performance at every hash.4647* Also, note that function signature doesn't offer room to return an error.4648*/4649if (len <= 16)4650return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);4651if (len <= 128)4652return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);4653if (len <= XXH3_MIDSIZE_MAX)4654return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);4655return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);4656}465746584659/* === Public entry point === */46604661/*! @ingroup xxh3_family */4662XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)4663{4664return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);4665}46664667/*! @ingroup xxh3_family */4668XXH_PUBLIC_API XXH64_hash_t4669XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)4670{4671return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);4672}46734674/*! 
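 * A minimal usage sketch (the names `buf`, `bufSize` and `mySeed` below are
 * illustrative placeholders, not part of this header):
 *
 *     XXH64_hash_t const h = XXH3_64bits_withSeed(buf, bufSize, mySeed);
 *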
@ingroup xxh3_family */4675XXH_PUBLIC_API XXH64_hash_t4676XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)4677{4678return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);4679}46804681XXH_PUBLIC_API XXH64_hash_t4682XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)4683{4684if (len <= XXH3_MIDSIZE_MAX)4685return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);4686return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);4687}468846894690/* === XXH3 streaming === */46914692/*4693* Malloc's a pointer that is always aligned to align.4694*4695* This must be freed with `XXH_alignedFree()`.4696*4697* malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte4698* alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX24699* or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.4700*4701* This underalignment previously caused a rather obvious crash which went4702* completely unnoticed due to XXH3_createState() not actually being tested.4703* Credit to RedSpah for noticing this bug.4704*4705* The alignment is done manually: Functions like posix_memalign or _mm_malloc4706* are avoided: To maintain portability, we would have to write a fallback4707* like this anyways, and besides, testing for the existence of library4708* functions without relying on external build tools is impossible.4709*4710* The method is simple: Overallocate, manually align, and store the offset4711* to the original behind the returned pointer.4712*4713* Align must be a power of 2 and 8 <= align <= 128.4714*/4715static void* XXH_alignedMalloc(size_t s, size_t align)4716{4717XXH_ASSERT(align <= 128 && align >= 8); /* range check */4718XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */4719XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */4720{ /* Overallocate to make room for manual realignment and an offset byte */4721xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);4722if (base != NULL) {4723/*4724* Get the offset needed to align this pointer.4725*4726* Even if the returned pointer is aligned, there will always be4727* at least one byte to store the offset to the original pointer.4728*/4729size_t offset = align - ((size_t)base & (align - 1)); /* base % align */4730/* Add the offset for the now-aligned pointer */4731xxh_u8* ptr = base + offset;47324733XXH_ASSERT((size_t)ptr % align == 0);47344735/* Store the offset immediately before the returned pointer. */4736ptr[-1] = (xxh_u8)offset;4737return ptr;4738}4739return NULL;4740}4741}4742/*4743* Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass4744* normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.4745*/4746static void XXH_alignedFree(void* p)4747{4748if (p != NULL) {4749xxh_u8* ptr = (xxh_u8*)p;4750/* Get the offset byte we added in XXH_malloc. */4751xxh_u8 offset = ptr[-1];4752/* Free the original malloc'd pointer */4753xxh_u8* base = ptr - offset;4754XXH_free(base);4755}4756}4757/*! @ingroup xxh3_family */4758XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)4759{4760XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);4761if (state==NULL) return NULL;4762XXH3_INITSTATE(state);4763return state;4764}47654766/*! 
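 * Streaming lifecycle sketch (illustrative only: `data` and `size` are
 * placeholder names, and XXH3_64bits_digest() is the usual companion call,
 * declared elsewhere in this header):
 *
 *     XXH3_state_t* const s = XXH3_createState();
 *     if (s != NULL) {
 *         XXH3_64bits_reset(s);
 *         XXH3_64bits_update(s, data, size);   (repeat as more data arrives)
 *         {   XXH64_hash_t const h = XXH3_64bits_digest(s); (void)h; }
 *         XXH3_freeState(s);
 *     }
 *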
@ingroup xxh3_family */4767XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)4768{4769XXH_alignedFree(statePtr);4770return XXH_OK;4771}47724773/*! @ingroup xxh3_family */4774XXH_PUBLIC_API void4775XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)4776{4777XXH_memcpy(dst_state, src_state, sizeof(*dst_state));4778}47794780static void4781XXH3_reset_internal(XXH3_state_t* statePtr,4782XXH64_hash_t seed,4783const void* secret, size_t secretSize)4784{4785size_t const initStart = offsetof(XXH3_state_t, bufferedSize);4786size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;4787XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);4788XXH_ASSERT(statePtr != NULL);4789/* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */4790memset((char*)statePtr + initStart, 0, initLength);4791statePtr->acc[0] = XXH_PRIME32_3;4792statePtr->acc[1] = XXH_PRIME64_1;4793statePtr->acc[2] = XXH_PRIME64_2;4794statePtr->acc[3] = XXH_PRIME64_3;4795statePtr->acc[4] = XXH_PRIME64_4;4796statePtr->acc[5] = XXH_PRIME32_2;4797statePtr->acc[6] = XXH_PRIME64_5;4798statePtr->acc[7] = XXH_PRIME32_1;4799statePtr->seed = seed;4800statePtr->useSeed = (seed != 0);4801statePtr->extSecret = (const unsigned char*)secret;4802XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);4803statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;4804statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;4805}48064807/*! @ingroup xxh3_family */4808XXH_PUBLIC_API XXH_errorcode4809XXH3_64bits_reset(XXH3_state_t* statePtr)4810{4811if (statePtr == NULL) return XXH_ERROR;4812XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);4813return XXH_OK;4814}48154816/*! @ingroup xxh3_family */4817XXH_PUBLIC_API XXH_errorcode4818XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)4819{4820if (statePtr == NULL) return XXH_ERROR;4821XXH3_reset_internal(statePtr, 0, secret, secretSize);4822if (secret == NULL) return XXH_ERROR;4823if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;4824return XXH_OK;4825}48264827/*! @ingroup xxh3_family */4828XXH_PUBLIC_API XXH_errorcode4829XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)4830{4831if (statePtr == NULL) return XXH_ERROR;4832if (seed==0) return XXH3_64bits_reset(statePtr);4833if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))4834XXH3_initCustomSecret(statePtr->customSecret, seed);4835XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);4836return XXH_OK;4837}48384839/*! 
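 * Calling-pattern sketch (the names `state`, `mySecret`, `mySecretSize` and
 * `mySeed` are illustrative placeholders):
 *
 *     if (XXH3_64bits_reset_withSecretandSeed(state, mySecret, mySecretSize, mySeed) != XXH_OK) {
 *         (failure: statePtr or secret was NULL, or secretSize < XXH3_SECRET_SIZE_MIN)
 *     }
 *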
/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
{
    if (statePtr == NULL) return XXH_ERROR;
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
    statePtr->useSeed = 1; /* always, even if seed64==0 */
    return XXH_OK;
}

/* Note : when XXH3_consumeStripes() is invoked,
 * the caller must guarantee that at least one more byte remains to be consumed from input,
 * so that the function can blindly consume all stripes using the "normal" secret segment */
XXH_FORCE_INLINE void
XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
                    XXH3_f_accumulate_512 f_acc512,
                    XXH3_f_scrambleAcc f_scramble)
{
    XXH_ASSERT(nbStripes <= nbStripesPerBlock);  /* can handle max 1 scramble per invocation */
    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
        /* need a scrambling operation */
        size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
        size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
        f_scramble(acc, secret + secretLimit);
        XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
        *nbStripesSoFarPtr = nbStripesAfterBlock;
    } else {
        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
        *nbStripesSoFarPtr += nbStripes;
    }
}

#ifndef XXH3_STREAM_USE_STACK
# ifndef __clang__ /* clang doesn't need additional stack space */
#  define XXH3_STREAM_USE_STACK 1
# endif
#endif
/*
 * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
 */
XXH_FORCE_INLINE XXH_errorcode
XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
            const xxh_u8* XXH_RESTRICT input, size_t len,
            XXH3_f_accumulate_512 f_acc512,
            XXH3_f_scrambleAcc f_scramble)
{
    if (input==NULL) {
        XXH_ASSERT(len == 0);
        return XXH_OK;
    }

    XXH_ASSERT(state != NULL);
    {   const xxh_u8* const bEnd = input + len;
        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
        /* For some reason, gcc and MSVC seem to suffer greatly
         * when operating accumulators directly into state.
         * Operating into stack space seems to enable proper optimization.
         * clang, on the other hand, doesn't seem to need this trick */
        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
#else
        xxh_u64* XXH_RESTRICT const acc = state->acc;
#endif
        state->totalLen += len;
        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);

        /* small input : just fill in tmp buffer */
        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
            state->bufferedSize += (XXH32_hash_t)len;
            return XXH_OK;
        }

        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */

        /*
         * Internal buffer is partially filled (always, except at beginning)
         * Complete it, then consume it.
         */
        if (state->bufferedSize) {
            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
            input += loadSize;
            XXH3_consumeStripes(acc,
                               &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                                secret, state->secretLimit,
                                f_acc512, f_scramble);
            state->bufferedSize = 0;
        }
        XXH_ASSERT(input < bEnd);

        /* large input to consume : ingest per full block */
        if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
            XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
            /* join to current block's end */
            {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
                XXH_ASSERT(nbStripesToEnd <= nbStripes);
                XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
                f_scramble(acc, secret + state->secretLimit);
                state->nbStripesSoFar = 0;
                input += nbStripesToEnd * XXH_STRIPE_LEN;
                nbStripes -= nbStripesToEnd;
            }
            /* consume per entire blocks */
            while(nbStripes >= state->nbStripesPerBlock) {
                XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
                f_scramble(acc, secret + state->secretLimit);
                input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
                nbStripes -= state->nbStripesPerBlock;
            }
            /* consume last partial block */
            XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
            input += nbStripes * XXH_STRIPE_LEN;
            XXH_ASSERT(input < bEnd);  /* at least some bytes left */
            state->nbStripesSoFar = nbStripes;
            /* buffer predecessor of last partial stripe */
            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
            XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
        } else {
            /* content to consume <= block size */
            /* Consume input by a multiple of internal buffer size */
            if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
                const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
                do {
                    XXH3_consumeStripes(acc,
                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
                                        input, XXH3_INTERNALBUFFER_STRIPES,
                                        secret, state->secretLimit,
                                        f_acc512, f_scramble);
                    input += XXH3_INTERNALBUFFER_SIZE;
                } while (input<limit);
                /* buffer predecessor of last partial stripe */
                XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
            }
        }

        /* Some remaining input (always) : buffer it */
        XXH_ASSERT(input < bEnd);
        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
        XXH_ASSERT(state->bufferedSize == 0);
        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
        /* save stack accumulators into state */
        memcpy(state->acc, acc, sizeof(acc));
#endif
    }

    return XXH_OK;
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate_512, XXH3_scrambleAcc);
}


XXH_FORCE_INLINE void
XXH3_digest_long (XXH64_hash_t* acc,
                  const XXH3_state_t* state,
                  const unsigned char* secret)
{
    /*
     * Digest on a local copy. This way, the state remains unaltered, and it can
     * continue ingesting more input afterwards.
     */
    XXH_memcpy(acc, state->acc, sizeof(state->acc));
    if (state->bufferedSize >= XXH_STRIPE_LEN) {
        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
        size_t nbStripesSoFar = state->nbStripesSoFar;
        XXH3_consumeStripes(acc,
                           &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, nbStripes,
                            secret, state->secretLimit,
                            XXH3_accumulate_512, XXH3_scrambleAcc);
        /* last stripe */
        XXH3_accumulate_512(acc,
                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
    } else {  /* bufferedSize < XXH_STRIPE_LEN */
        xxh_u8 lastStripe[XXH_STRIPE_LEN];
        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
        XXH3_accumulate_512(acc,
                            lastStripe,
                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
    }
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        return XXH3_mergeAccs(acc,
                              secret + XXH_SECRET_MERGEACCS_START,
                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
    }
    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
    if (state->useSeed)
        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                  secret, state->secretLimit + XXH_STRIPE_LEN);
}
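/*
 * Illustrative sketch (not compiled, not part of the library): the typical
 * 64-bit streaming pattern built from the functions above. The function name
 * and the two-chunk split are made up for the example; chunks can have any size.
 */
#if 0
static XXH64_hash_t example_xxh3_64_streaming(const void* data, size_t size)
{
    size_t const half = size / 2;   /* arbitrary split, just to show incremental updates */
    XXH64_hash_t h64 = 0;
    XXH3_state_t* const st = XXH3_createState();
    if (st == NULL) return 0;
    (void)XXH3_64bits_reset(st);    /* or XXH3_64bits_reset_withSeed()/_withSecret() */
    (void)XXH3_64bits_update(st, data, half);
    (void)XXH3_64bits_update(st, (const char*)data + half, size - half);
    h64 = XXH3_64bits_digest(st);   /* digest works on a copy: the state stays usable */
    (void)XXH3_freeState(st);
    return h64;                     /* matches the one-shot XXH3_64bits(data, size) */
}
#endif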


/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
 * even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that hashing longer inputs is about as fast as with the 64-bit
 * version, as the long-input path uses only a slight modification of the
 * 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* A doubled version of 1to3_64b with different constants. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    /*
     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
     */
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
        XXH128_hash_t h128;
        h128.low64  = XXH64_avalanche(keyed_lo);
        h128.high64 = XXH64_avalanche(keyed_hi);
        return h128;
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
        xxh_u64 const keyed = input_64 ^ bitflip;

        /* Shift len to the left to ensure it is even; this avoids even multiplies. */
        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));

        m128.high64 += (m128.low64 << 1);
        m128.low64  ^= (m128.high64 >> 3);

        m128.low64   = XXH_xorshift64(m128.low64, 35);
        m128.low64  *= 0x9FB21C651E98DF25ULL;
        m128.low64   = XXH_xorshift64(m128.low64, 28);
        m128.high64  = XXH3_avalanche(m128.high64);
        return m128;
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
        xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
        /*
         * Put len in the middle of m128 to ensure that the length gets mixed to
         * both the low and high bits in the 128x64 multiply below.
         */
        m128.low64 += (xxh_u64)(len - 1) << 54;
        input_hi   ^= bitfliph;
        /*
         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
         * the high 64 bits of m128.
         *
         * The best approach to this operation is different on 32-bit and 64-bit.
         */
        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
            /*
             * 32-bit optimized version, which is more readable.
             *
             * On 32-bit, it removes an ADC and delays a dependency between the two
             * halves of m128.high64, but it generates an extra mask on 64-bit.
             */
            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
        } else {
            /*
             * 64-bit optimized (albeit more confusing) version.
             *
             * Uses some properties of addition and multiplication to remove the mask:
             *
             * Let:
             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
             *    c = XXH_PRIME32_2
             *
             *    a + (b * c)
             * Inverse Property: x + y - x == y
             *    a + (b * (1 + c - 1))
             * Distributive Property: x * (y + z) == (x * y) + (x * z)
             *    a + (b * 1) + (b * (c - 1))
             * Identity Property: x * 1 == x
             *    a + b + (b * (c - 1))
             *
             * Substitute a, b, and c:
             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             *
             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             */
            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
        }
        /* m128 ^= XXH_swap64(m128 >> 64); */
        m128.low64 ^= XXH_swap64(m128.high64);

        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
            h128.high64 += m128.high64 * XXH_PRIME64_2;

            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}

/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t h128;
            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
            return h128;
    }   }
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
              const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_NO_INLINE XXH128_hash_t
XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc,
                                input  + (32 * i),
                                input  + (32 * i) + 16,
                                secret + (32 * i),
                                seed);
        }
        acc.low64 = XXH3_avalanche(acc.low64);
        acc.high64 = XXH3_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc,
                                input + (32 * i),
                                input + (32 * i) + 16,
                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
                                seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc,
                            input + len - 16,
                            input + len - 32,
                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                            0ULL - seed);

        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate_512 f_acc512,
                            XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   XXH128_hash_t h128;
        h128.low64  = XXH3_mergeAccs(acc,
                                     secret + XXH_SECRET_MERGEACCS_START,
                                     (xxh_u64)len * XXH_PRIME64_1);
        h128.high64 = XXH3_mergeAccs(acc,
                                     secret + secretSize
                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                     ~((xxh_u64)len * XXH_PRIME64_2));
        return h128;
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64,
                           const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*
 * It's important for performance to pass @secretLen (when it's static)
 * to the compiler, so that it can properly optimize the vectorized loop.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64,
                              const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
                                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                     XXH64_hash_t seed64,
                                     XXH3_f_accumulate_512 f_acc512,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
{
    if (seed64 == 0)
        return XXH3_hashLong_128b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
                                           f_acc512, f_scramble);
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed64);
        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
                                           f_acc512, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void* input, size_t len,
                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
                                                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void* input, size_t len,
                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     */
    if (len <= 16)
        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hl128(input, len, seed64, secret, secretLen);
}


/* === Public XXH128 API === */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
{
    return XXH3_128bits_internal(input, len, 0,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_default);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    return XXH3_128bits_internal(input, len, 0,
                                 (const xxh_u8*)secret, secretSize,
                                 XXH3_hashLong_128b_withSecret);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_internal(input, len, seed,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_withSeed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(input, len, seed);
}
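/*
 * Illustrative sketch (not compiled, not part of the library): one-shot use of
 * the public XXH128 API defined above. The message and seed values are
 * arbitrary examples.
 */
#if 0
static void example_xxh128_oneshot(void)
{
    const char msg[] = "xxhash";
    XXH128_hash_t const h  = XXH128(msg, sizeof(msg)-1, 0);     /* seeded entry point, seed==0 */
    XXH128_hash_t const h2 = XXH3_128bits(msg, sizeof(msg)-1);  /* same result as seed==0 */
    /* a 128-bit hash is returned as two 64-bit halves */
    (void)h.low64; (void)h.high64; (void)h2;
}
#endif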
/* === XXH3 128-bit streaming === */

/*
 * All initialization and update functions are identical to the 64-bit streaming variant.
 * The only difference is the finalization routine.
 */

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH3_state_t* statePtr)
{
    return XXH3_64bits_reset(statePtr);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
{
    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSeed(statePtr, seed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate_512, XXH3_scrambleAcc);
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   XXH128_hash_t h128;
            h128.low64  = XXH3_mergeAccs(acc,
                                         secret + XXH_SECRET_MERGEACCS_START,
                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
            h128.high64 = XXH3_mergeAccs(acc,
                                         secret + state->secretLimit + XXH_STRIPE_LEN
                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                   secret, state->secretLimit + XXH_STRIPE_LEN);
}

/* 128-bit utility functions */

#include <string.h>   /* memcmp, memcpy */

/* return : 1 if equal, 0 if different */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2 */
/*! @ingroup xxh3_family */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : this bets that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}


/*======   Canonical representation   ======*/
/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
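/*
 * Illustrative sketch (not compiled, not part of the library): sorting hashes
 * with XXH128_cmp() and serializing one through the canonical big-endian
 * representation. The array is a placeholder and is assumed to hold at least
 * one element.
 */
#if 0
#include <stdlib.h>   /* qsort */
static void example_xxh128_utils(XXH128_hash_t* hashes, size_t nbHashes)
{
    XXH128_canonical_t canon;
    XXH128_hash_t roundTrip;
    /* XXH128_cmp() already has the qsort-compatible prototype */
    qsort(hashes, nbHashes, sizeof(hashes[0]), XXH128_cmp);
    /* the canonical form is a fixed 16-byte big-endian encoding, suitable for storage */
    XXH128_canonicalFromHash(&canon, hashes[0]);
    roundTrip = XXH128_hashFromCanonical(&canon);
    /* roundTrip compares equal to hashes[0] */
    (void)XXH128_isEqual(roundTrip, hashes[0]);
}
#endif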
/* ==========================================
 * Secret generators
 * ==========================================
 */
#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))

XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
{
    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
{
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(secretBuffer != NULL);
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
#else
    /* production mode, assert() is disabled */
    if (secretBuffer == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
#endif

    if (customSeedSize == 0) {
        customSeed = XXH3_kSecret;
        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
    }
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(customSeed != NULL);
#else
    if (customSeed == NULL) return XXH_ERROR;
#endif

    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
    {   size_t pos = 0;
        while (pos < secretSize) {
            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
            pos += toCopy;
    }   }

    {   size_t const nbSeg16 = secretSize / 16;
        size_t n;
        XXH128_canonical_t scrambler;
        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
        for (n=0; n<nbSeg16; n++) {
            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
            XXH3_combine16((char*)secretBuffer + n*16, h128);
        }
        /* last segment */
        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
    }
    return XXH_OK;
}

/*! @ingroup xxh3_family */
XXH_PUBLIC_API void
XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
{
    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    XXH3_initCustomSecret(secret, seed);
    XXH_ASSERT(secretBuffer != NULL);
    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
}
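/*
 * Illustrative sketch (not compiled, not part of the library): deriving a
 * custom secret from arbitrary seed material and hashing with it. The buffer
 * size here reuses XXH_SECRET_DEFAULT_SIZE; any size >= XXH3_SECRET_SIZE_MIN
 * works, and the same secret must be supplied again to verify hashes later.
 */
#if 0
static XXH64_hash_t example_custom_secret(const void* data, size_t size,
                                          const void* seedMaterial, size_t seedSize)
{
    unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
    if (XXH3_generateSecret(secret, sizeof(secret), seedMaterial, seedSize) != XXH_OK)
        return 0;   /* invalid arguments */
    return XXH3_64bits_withSecret(data, size, secret, sizeof(secret));
}
#endif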


/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
#  pragma GCC pop_options
#endif

#endif  /* XXH_NO_LONG_LONG */

#endif  /* XXH_NO_XXH3 */

/*!
 * @}
 */
#endif  /* XXH_IMPLEMENTATION */


#if defined (__cplusplus)
}
#endif