/*
 * xxHash - Extremely Fast Hash algorithm
 * Header File
 * Copyright (c) Yann Collet - Meta Platforms, Inc
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* Local adaptations for Zstandard */

#ifndef XXH_NO_XXH3
# define XXH_NO_XXH3
#endif

#ifndef XXH_NAMESPACE
# define XXH_NAMESPACE ZSTD_
#endif

/*!
 * @mainpage xxHash
 *
 * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
 * limits.
 *
 * It is proposed in four flavors, in three families:
 * 1. @ref XXH32_family
 *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
 *     32-bit and 64-bit systems.
 * 2. @ref XXH64_family
 *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
 *     64-bit systems (but _not_ 32-bit systems).
 * 3. @ref XXH3_family
 *   - Modern 64-bit and 128-bit hash function family which features improved
 *     strength and performance across the board, especially on smaller data.
 *     It benefits greatly from SIMD and 64-bit without requiring it.
 *
 * Benchmarks
 * ---
 * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
 * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
 *
 * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
 * | -------------------- | ------- | ----: | ---------------: | ------------------: |
 * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
 * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
 * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
 * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
 * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
 * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
 * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
 * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
 * | City64               |         |    64 |        22.0 GB/s |                76.6 |
 * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
 * | City128              |         |   128 |        21.7 GB/s |                57.7 |
 * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
 * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
 * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
 * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
 * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
 * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
 * | City32               |         |    32 |         9.1 GB/s |                66.0 |
 * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
 * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
 * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
 * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
 * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
 * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
 * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
 * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
 * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
 * @note
 *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
 *     even though it is mandatory on x64.
 *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
 *     by modern standards.
 *   - Small data velocity is a rough average of algorithm's efficiency for small
 *     data. For more accurate information, see the wiki.
 *   - More benchmarks and strength tests are found on the wiki:
 *     https://github.com/Cyan4973/xxHash/wiki
 *
 * Usage
 * ------
 * All xxHash variants use a similar API. Changing the algorithm is a trivial
 * substitution.
 *
 * @pre
 *    For functions which take an input and length parameter, the following
 *    requirements are assumed:
 *    - The range from [`input`, `input + length`) is valid, readable memory.
 *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
 *    - For C++, the objects must have the *TriviallyCopyable* property, as the
 *      functions access bytes directly as if it was an array of `unsigned char`.
 *
 * @anchor single_shot_example
 * **Single Shot**
 *
 * These functions are stateless functions which hash a contiguous block of memory,
 * immediately returning the result. They are the easiest and usually the fastest
 * option.
 *
 * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
 *
 * @code{.c}
 *   #include <string.h>
 *   #include "xxhash.h"
 *
 *   // Example for a function which hashes a null terminated string with XXH32().
 *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
 *   {
 *       // NULL pointers are only valid if the length is zero
 *       size_t length = (string == NULL) ? 0 : strlen(string);
 *       return XXH32(string, length, seed);
 *   }
 * @endcode
 *
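 * As a companion sketch (not one of the library's own examples), the same
 * pattern applies to the other single-shot variants; the hypothetical helper
 * below hashes a binary buffer with XXH64() and a caller-chosen seed:
 *
 * @code{.c}
 *   #include <stddef.h>
 *   #include "xxhash.h"
 *
 *   // Hash an arbitrary binary buffer. A NULL buffer is only valid when size == 0.
 *   XXH64_hash_t hash_buffer(const void* data, size_t size, XXH64_hash_t seed)
 *   {
 *       return XXH64(data, size, seed);
 *   }
 * @endcode
 *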
 * @anchor streaming_example
 * **Streaming**
 *
 * These groups of functions allow incremental hashing of data of unknown size,
 * even more than what would fit in a size_t.
 *
 * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
 *
 * @code{.c}
 *   #include <stdio.h>
 *   #include <assert.h>
 *   #include "xxhash.h"
 *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
 *   XXH64_hash_t hashFile(FILE* f)
 *   {
 *       // Allocate a state struct. Do not just use malloc() or new.
 *       XXH3_state_t* state = XXH3_createState();
 *       assert(state != NULL && "Out of memory!");
 *       // Reset the state to start a new hashing session.
 *       XXH3_64bits_reset(state);
 *       char buffer[4096];
 *       size_t count;
 *       // Read the file in chunks
 *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
 *           // Run update() as many times as necessary to process the data
 *           XXH3_64bits_update(state, buffer, count);
 *       }
 *       // Retrieve the finalized hash. This will not change the state.
 *       XXH64_hash_t result = XXH3_64bits_digest(state);
 *       // Free the state. Do not use free().
 *       XXH3_freeState(state);
 *       return result;
 *   }
 * @endcode
 *
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * The function returns an error code, with 0 meaning OK, and any other value
 * meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bit hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 *
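 * As a small illustrative sketch (not one of the library's official examples),
 * the behavior described above can be exercised as follows; the helper name
 * `two_digests_example` is hypothetical:
 *
 * @code{.c}
 *   #include <stddef.h>
 *   #include "xxhash.h"
 *
 *   // Take an intermediate digest, then keep updating the same state.
 *   void two_digests_example(const char* part1, size_t len1,
 *                            const char* part2, size_t len2)
 *   {
 *       XXH3_state_t* state = XXH3_createState();
 *       if (state == NULL) return;
 *       XXH3_64bits_reset(state);
 *       XXH3_64bits_update(state, part1, len1);
 *       // Digest of part1 only; digest() does not modify the state.
 *       XXH64_hash_t h1 = XXH3_64bits_digest(state);
 *       XXH3_64bits_update(state, part2, len2);
 *       // Digest of part1 followed by part2.
 *       XXH64_hash_t h2 = XXH3_64bits_digest(state);
 *       (void)h1; (void)h2;
 *       XXH3_freeState(state);
 *   }
 * @endcode
 *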
Do not use free().149* XXH3_freeState(state);150* return result;151* }152* @endcode153*154* Streaming functions generate the xxHash value from an incremental input.155* This method is slower than single-call functions, due to state management.156* For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.157*158* An XXH state must first be allocated using `XXH*_createState()`.159*160* Start a new hash by initializing the state with a seed using `XXH*_reset()`.161*162* Then, feed the hash state by calling `XXH*_update()` as many times as necessary.163*164* The function returns an error code, with 0 meaning OK, and any other value165* meaning there is an error.166*167* Finally, a hash value can be produced anytime, by using `XXH*_digest()`.168* This function returns the nn-bits hash as an int or long long.169*170* It's still possible to continue inserting input into the hash state after a171* digest, and generate new hash values later on by invoking `XXH*_digest()`.172*173* When done, release the state using `XXH*_freeState()`.174*175*176* @anchor canonical_representation_example177* **Canonical Representation**178*179* The default return values from XXH functions are unsigned 32, 64 and 128 bit180* integers.181* This the simplest and fastest format for further post-processing.182*183* However, this leaves open the question of what is the order on the byte level,184* since little and big endian conventions will store the same number differently.185*186* The canonical representation settles this issue by mandating big-endian187* convention, the same convention as human-readable numbers (large digits first).188*189* When writing hash values to storage, sending them over a network, or printing190* them, it's highly recommended to use the canonical representation to ensure191* portability across a wider range of systems, present and future.192*193* The following functions allow transformation of hash values to and from194* canonical format.195*196* XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),197* XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),198* XXH128_canonicalFromHash(), XXH128_hashFromCanonical(),199*200* @code{.c}201* #include <stdio.h>202* #include "xxhash.h"203*204* // Example for a function which prints XXH32_hash_t in human readable format205* void printXxh32(XXH32_hash_t hash)206* {207* XXH32_canonical_t cano;208* XXH32_canonicalFromHash(&cano, hash);209* size_t i;210* for(i = 0; i < sizeof(cano.digest); ++i) {211* printf("%02x", cano.digest[i]);212* }213* printf("\n");214* }215*216* // Example for a function which converts XXH32_canonical_t to XXH32_hash_t217* XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)218* {219* XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);220* return hash;221* }222* @endcode223*224*225* @file xxhash.h226* xxHash prototypes and implementation227*/228229/* ****************************230* INLINE mode231******************************/232/*!233* @defgroup public Public API234* Contains details on the public xxHash functions.235* @{236*/237#ifdef XXH_DOXYGEN238/*!239* @brief Gives access to internal state declaration, required for static allocation.240*241* Incompatible with dynamic linking, due to risks of ABI changes.242*243* Usage:244* @code{.c}245* #define XXH_STATIC_LINKING_ONLY246* #include "xxhash.h"247* @endcode248*/249# define XXH_STATIC_LINKING_ONLY250/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */251252/*!253* @brief Gives access to internal definitions.254*255* Usage:256* @code{.c}257* #define 
XXH_STATIC_LINKING_ONLY258* #define XXH_IMPLEMENTATION259* #include "xxhash.h"260* @endcode261*/262# define XXH_IMPLEMENTATION263/* Do not undef XXH_IMPLEMENTATION for Doxygen */264265/*!266* @brief Exposes the implementation and marks all functions as `inline`.267*268* Use these build macros to inline xxhash into the target unit.269* Inlining improves performance on small inputs, especially when the length is270* expressed as a compile-time constant:271*272* https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html273*274* It also keeps xxHash symbols private to the unit, so they are not exported.275*276* Usage:277* @code{.c}278* #define XXH_INLINE_ALL279* #include "xxhash.h"280* @endcode281* Do not compile and link xxhash.o as a separate object, as it is not useful.282*/283# define XXH_INLINE_ALL284# undef XXH_INLINE_ALL285/*!286* @brief Exposes the implementation without marking functions as inline.287*/288# define XXH_PRIVATE_API289# undef XXH_PRIVATE_API290/*!291* @brief Emulate a namespace by transparently prefixing all symbols.292*293* If you want to include _and expose_ xxHash functions from within your own294* library, but also want to avoid symbol collisions with other libraries which295* may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix296* any public symbol from xxhash library with the value of @ref XXH_NAMESPACE297* (therefore, avoid empty or numeric values).298*299* Note that no change is required within the calling program as long as it300* includes `xxhash.h`: Regular symbol names will be automatically translated301* by this header.302*/303# define XXH_NAMESPACE /* YOUR NAME HERE */304# undef XXH_NAMESPACE305#endif306307#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \308&& !defined(XXH_INLINE_ALL_31684351384)309/* this section should be traversed only once */310# define XXH_INLINE_ALL_31684351384311/* give access to the advanced API, required to compile implementations */312# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */313# define XXH_STATIC_LINKING_ONLY314/* make all functions private */315# undef XXH_PUBLIC_API316# if defined(__GNUC__)317# define XXH_PUBLIC_API static __inline __attribute__((unused))318# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)319# define XXH_PUBLIC_API static inline320# elif defined(_MSC_VER)321# define XXH_PUBLIC_API static __inline322# else323/* note: this version may generate warnings for unused static functions */324# define XXH_PUBLIC_API static325# endif326327/*328* This part deals with the special case where a unit wants to inline xxHash,329* but "xxhash.h" has previously been included without XXH_INLINE_ALL,330* such as part of some previously included *.h header file.331* Without further action, the new include would just be ignored,332* and functions would effectively _not_ be inlined (silent failure).333* The following macros solve this situation by prefixing all inlined names,334* avoiding naming collision with previous inclusions.335*/336/* Before that, we unconditionally #undef all symbols,337* in case they were already defined with XXH_NAMESPACE.338* They will then be redefined for XXH_INLINE_ALL339*/340# undef XXH_versionNumber341/* XXH32 */342# undef XXH32343# undef XXH32_createState344# undef XXH32_freeState345# undef XXH32_reset346# undef XXH32_update347# undef XXH32_digest348# undef XXH32_copyState349# undef XXH32_canonicalFromHash350# undef XXH32_hashFromCanonical351/* XXH64 */352# undef 
XXH64353# undef XXH64_createState354# undef XXH64_freeState355# undef XXH64_reset356# undef XXH64_update357# undef XXH64_digest358# undef XXH64_copyState359# undef XXH64_canonicalFromHash360# undef XXH64_hashFromCanonical361/* XXH3_64bits */362# undef XXH3_64bits363# undef XXH3_64bits_withSecret364# undef XXH3_64bits_withSeed365# undef XXH3_64bits_withSecretandSeed366# undef XXH3_createState367# undef XXH3_freeState368# undef XXH3_copyState369# undef XXH3_64bits_reset370# undef XXH3_64bits_reset_withSeed371# undef XXH3_64bits_reset_withSecret372# undef XXH3_64bits_update373# undef XXH3_64bits_digest374# undef XXH3_generateSecret375/* XXH3_128bits */376# undef XXH128377# undef XXH3_128bits378# undef XXH3_128bits_withSeed379# undef XXH3_128bits_withSecret380# undef XXH3_128bits_reset381# undef XXH3_128bits_reset_withSeed382# undef XXH3_128bits_reset_withSecret383# undef XXH3_128bits_reset_withSecretandSeed384# undef XXH3_128bits_update385# undef XXH3_128bits_digest386# undef XXH128_isEqual387# undef XXH128_cmp388# undef XXH128_canonicalFromHash389# undef XXH128_hashFromCanonical390/* Finally, free the namespace itself */391# undef XXH_NAMESPACE392393/* employ the namespace for XXH_INLINE_ALL */394# define XXH_NAMESPACE XXH_INLINE_395/*396* Some identifiers (enums, type names) are not symbols,397* but they must nonetheless be renamed to avoid redeclaration.398* Alternative solution: do not redeclare them.399* However, this requires some #ifdefs, and has a more dispersed impact.400* Meanwhile, renaming can be achieved in a single place.401*/402# define XXH_IPREF(Id) XXH_NAMESPACE ## Id403# define XXH_OK XXH_IPREF(XXH_OK)404# define XXH_ERROR XXH_IPREF(XXH_ERROR)405# define XXH_errorcode XXH_IPREF(XXH_errorcode)406# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)407# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)408# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)409# define XXH32_state_s XXH_IPREF(XXH32_state_s)410# define XXH32_state_t XXH_IPREF(XXH32_state_t)411# define XXH64_state_s XXH_IPREF(XXH64_state_s)412# define XXH64_state_t XXH_IPREF(XXH64_state_t)413# define XXH3_state_s XXH_IPREF(XXH3_state_s)414# define XXH3_state_t XXH_IPREF(XXH3_state_t)415# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)416/* Ensure the header is parsed again, even if it was previously included */417# undef XXHASH_H_5627135585666179418# undef XXHASH_H_STATIC_13879238742419#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */420421/* ****************************************************************422* Stable API423*****************************************************************/424#ifndef XXHASH_H_5627135585666179425#define XXHASH_H_5627135585666179 1426427/*! @brief Marks a global symbol. 
*/428#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)429# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))430# ifdef XXH_EXPORT431# define XXH_PUBLIC_API __declspec(dllexport)432# elif XXH_IMPORT433# define XXH_PUBLIC_API __declspec(dllimport)434# endif435# else436# define XXH_PUBLIC_API /* do nothing */437# endif438#endif439440#ifdef XXH_NAMESPACE441# define XXH_CAT(A,B) A##B442# define XXH_NAME2(A,B) XXH_CAT(A,B)443# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)444/* XXH32 */445# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)446# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)447# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)448# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)449# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)450# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)451# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)452# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)453# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)454/* XXH64 */455# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)456# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)457# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)458# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)459# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)460# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)461# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)462# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)463# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)464/* XXH3_64bits */465# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)466# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)467# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)468# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)469# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)470# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)471# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)472# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)473# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)474# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)475# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)476# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)477# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)478# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)479# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)480/* XXH3_128bits */481# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)482# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)483# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)484# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)485# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)486# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)487# define XXH3_128bits_reset_withSeed 
XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)488# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)489# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)490# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)491# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)492# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)493# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)494# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)495# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)496#endif497498499/* *************************************500* Compiler specifics501***************************************/502503/* specific declaration modes for Windows */504#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)505# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))506# ifdef XXH_EXPORT507# define XXH_PUBLIC_API __declspec(dllexport)508# elif XXH_IMPORT509# define XXH_PUBLIC_API __declspec(dllimport)510# endif511# else512# define XXH_PUBLIC_API /* do nothing */513# endif514#endif515516#if defined (__GNUC__)517# define XXH_CONSTF __attribute__((const))518# define XXH_PUREF __attribute__((pure))519# define XXH_MALLOCF __attribute__((malloc))520#else521# define XXH_CONSTF /* disable */522# define XXH_PUREF523# define XXH_MALLOCF524#endif525526/* *************************************527* Version528***************************************/529#define XXH_VERSION_MAJOR 0530#define XXH_VERSION_MINOR 8531#define XXH_VERSION_RELEASE 2532/*! @brief Version number, encoded as two digits each */533#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)534535#if defined (__cplusplus)536extern "C" {537#endif538/*!539* @brief Obtains the xxHash version.540*541* This is mostly useful when xxHash is compiled as a shared library,542* since the returned value comes from the library, as opposed to header file.543*544* @return @ref XXH_VERSION_NUMBER of the invoked library.545*/546XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);547548#if defined (__cplusplus)549}550#endif551552/* ****************************553* Common basic types554******************************/555#include <stddef.h> /* size_t */556/*!557* @brief Exit code for the streaming API.558*/559typedef enum {560XXH_OK = 0, /*!< OK */561XXH_ERROR /*!< Error */562} XXH_errorcode;563564565/*-**********************************************************************566* 32-bit hash567************************************************************************/568#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */569/*!570* @brief An unsigned 32-bit integer.571*572* Not necessarily defined to `uint32_t` but functionally equivalent.573*/574typedef uint32_t XXH32_hash_t;575576#elif !defined (__VMS) \577&& (defined (__cplusplus) \578|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )579# ifdef _AIX580# include <inttypes.h>581# else582# include <stdint.h>583# endif584typedef uint32_t XXH32_hash_t;585586#else587# include <limits.h>588# if UINT_MAX == 0xFFFFFFFFUL589typedef unsigned int XXH32_hash_t;590# elif ULONG_MAX == 0xFFFFFFFFUL591typedef unsigned long XXH32_hash_t;592# else593# error "unsupported platform: need a 32-bit type"594# endif595#endif596597#if defined (__cplusplus)598extern "C" {599#endif600601/*!602* 
@}603*604* @defgroup XXH32_family XXH32 family605* @ingroup public606* Contains functions used in the classic 32-bit xxHash algorithm.607*608* @note609* XXH32 is useful for older platforms, with no or poor 64-bit performance.610* Note that the @ref XXH3_family provides competitive speed for both 32-bit611* and 64-bit systems, and offers true 64/128 bit hash results.612*613* @see @ref XXH64_family, @ref XXH3_family : Other xxHash families614* @see @ref XXH32_impl for implementation details615* @{616*/617618/*!619* @brief Calculates the 32-bit hash of @p input using xxHash32.620*621* @param input The block of data to be hashed, at least @p length bytes in size.622* @param length The length of @p input, in bytes.623* @param seed The 32-bit seed to alter the hash's output predictably.624*625* @pre626* The memory between @p input and @p input + @p length must be valid,627* readable, contiguous memory. However, if @p length is `0`, @p input may be628* `NULL`. In C++, this also must be *TriviallyCopyable*.629*630* @return The calculated 32-bit xxHash32 value.631*632* @see @ref single_shot_example "Single Shot Example" for an example.633*/634XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);635636#ifndef XXH_NO_STREAM637/*!638* @typedef struct XXH32_state_s XXH32_state_t639* @brief The opaque state struct for the XXH32 streaming API.640*641* @see XXH32_state_s for details.642*/643typedef struct XXH32_state_s XXH32_state_t;644645/*!646* @brief Allocates an @ref XXH32_state_t.647*648* @return An allocated pointer of @ref XXH32_state_t on success.649* @return `NULL` on failure.650*651* @note Must be freed with XXH32_freeState().652*/653XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);654/*!655* @brief Frees an @ref XXH32_state_t.656*657* @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().658*659* @return @ref XXH_OK.660*661* @note @p statePtr must be allocated with XXH32_createState().662*663*/664XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);665/*!666* @brief Copies one @ref XXH32_state_t to another.667*668* @param dst_state The state to copy to.669* @param src_state The state to copy from.670* @pre671* @p dst_state and @p src_state must not be `NULL` and must not overlap.672*/673XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);674675/*!676* @brief Resets an @ref XXH32_state_t to begin a new hash.677*678* @param statePtr The state struct to reset.679* @param seed The 32-bit seed to alter the hash result predictably.680*681* @pre682* @p statePtr must not be `NULL`.683*684* @return @ref XXH_OK on success.685* @return @ref XXH_ERROR on failure.686*687* @note This function resets and seeds a state. Call it before @ref XXH32_update().688*/689XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);690691/*!692* @brief Consumes a block of @p input to an @ref XXH32_state_t.693*694* @param statePtr The state struct to update.695* @param input The block of data to be hashed, at least @p length bytes in size.696* @param length The length of @p input, in bytes.697*698* @pre699* @p statePtr must not be `NULL`.700* @pre701* The memory between @p input and @p input + @p length must be valid,702* readable, contiguous memory. However, if @p length is `0`, @p input may be703* `NULL`. 
In C++, this also must be *TriviallyCopyable*.704*705* @return @ref XXH_OK on success.706* @return @ref XXH_ERROR on failure.707*708* @note Call this to incrementally consume blocks of data.709*/710XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);711712/*!713* @brief Returns the calculated hash value from an @ref XXH32_state_t.714*715* @param statePtr The state struct to calculate the hash from.716*717* @pre718* @p statePtr must not be `NULL`.719*720* @return The calculated 32-bit xxHash32 value from that state.721*722* @note723* Calling XXH32_digest() will not affect @p statePtr, so you can update,724* digest, and update again.725*/726XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);727#endif /* !XXH_NO_STREAM */728729/******* Canonical representation *******/730731/*!732* @brief Canonical (big endian) representation of @ref XXH32_hash_t.733*/734typedef struct {735unsigned char digest[4]; /*!< Hash bytes, big endian */736} XXH32_canonical_t;737738/*!739* @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.740*741* @param dst The @ref XXH32_canonical_t pointer to be stored to.742* @param hash The @ref XXH32_hash_t to be converted.743*744* @pre745* @p dst must not be `NULL`.746*747* @see @ref canonical_representation_example "Canonical Representation Example"748*/749XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);750751/*!752* @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.753*754* @param src The @ref XXH32_canonical_t to convert.755*756* @pre757* @p src must not be `NULL`.758*759* @return The converted hash.760*761* @see @ref canonical_representation_example "Canonical Representation Example"762*/763XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);764765766/*! @cond Doxygen ignores this part */767#ifdef __has_attribute768# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)769#else770# define XXH_HAS_ATTRIBUTE(x) 0771#endif772/*! @endcond */773774/*! @cond Doxygen ignores this part */775/*776* C23 __STDC_VERSION__ number hasn't been specified yet. For now777* leave as `201711L` (C17 + 1).778* TODO: Update to correct value when its been specified.779*/780#define XXH_C23_VN 201711L781/*! @endcond */782783/*! @cond Doxygen ignores this part */784/* C-language Attributes are added in C23. */785#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)786# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)787#else788# define XXH_HAS_C_ATTRIBUTE(x) 0789#endif790/*! @endcond */791792/*! @cond Doxygen ignores this part */793#if defined(__cplusplus) && defined(__has_cpp_attribute)794# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)795#else796# define XXH_HAS_CPP_ATTRIBUTE(x) 0797#endif798/*! @endcond */799800/*! @cond Doxygen ignores this part */801/*802* Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute803* introduced in CPP17 and C23.804* CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough805* C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough806*/807#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)808# define XXH_FALLTHROUGH [[fallthrough]]809#elif XXH_HAS_ATTRIBUTE(__fallthrough__)810# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))811#else812# define XXH_FALLTHROUGH /* fallthrough */813#endif814/*! @endcond */815816/*! 
@cond Doxygen ignores this part */817/*818* Define XXH_NOESCAPE for annotated pointers in public API.819* https://clang.llvm.org/docs/AttributeReference.html#noescape820* As of writing this, only supported by clang.821*/822#if XXH_HAS_ATTRIBUTE(noescape)823# define XXH_NOESCAPE __attribute__((noescape))824#else825# define XXH_NOESCAPE826#endif827/*! @endcond */828829#if defined (__cplusplus)830} /* end of extern "C" */831#endif832833/*!834* @}835* @ingroup public836* @{837*/838839#ifndef XXH_NO_LONG_LONG840/*-**********************************************************************841* 64-bit hash842************************************************************************/843#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */844/*!845* @brief An unsigned 64-bit integer.846*847* Not necessarily defined to `uint64_t` but functionally equivalent.848*/849typedef uint64_t XXH64_hash_t;850#elif !defined (__VMS) \851&& (defined (__cplusplus) \852|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )853# ifdef _AIX854# include <inttypes.h>855# else856# include <stdint.h>857# endif858typedef uint64_t XXH64_hash_t;859#else860# include <limits.h>861# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL862/* LP64 ABI says uint64_t is unsigned long */863typedef unsigned long XXH64_hash_t;864# else865/* the following type must have a width of 64-bit */866typedef unsigned long long XXH64_hash_t;867# endif868#endif869870#if defined (__cplusplus)871extern "C" {872#endif873/*!874* @}875*876* @defgroup XXH64_family XXH64 family877* @ingroup public878* @{879* Contains functions used in the classic 64-bit xxHash algorithm.880*881* @note882* XXH3 provides competitive speed for both 32-bit and 64-bit systems,883* and offers true 64/128 bit hash results.884* It provides better speed for systems with vector processing capabilities.885*/886887/*!888* @brief Calculates the 64-bit hash of @p input using xxHash64.889*890* @param input The block of data to be hashed, at least @p length bytes in size.891* @param length The length of @p input, in bytes.892* @param seed The 64-bit seed to alter the hash's output predictably.893*894* @pre895* The memory between @p input and @p input + @p length must be valid,896* readable, contiguous memory. However, if @p length is `0`, @p input may be897* `NULL`. 
In C++, this also must be *TriviallyCopyable*.898*899* @return The calculated 64-bit xxHash64 value.900*901* @see @ref single_shot_example "Single Shot Example" for an example.902*/903XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);904905/******* Streaming *******/906#ifndef XXH_NO_STREAM907/*!908* @brief The opaque state struct for the XXH64 streaming API.909*910* @see XXH64_state_s for details.911*/912typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */913914/*!915* @brief Allocates an @ref XXH64_state_t.916*917* @return An allocated pointer of @ref XXH64_state_t on success.918* @return `NULL` on failure.919*920* @note Must be freed with XXH64_freeState().921*/922XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);923924/*!925* @brief Frees an @ref XXH64_state_t.926*927* @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().928*929* @return @ref XXH_OK.930*931* @note @p statePtr must be allocated with XXH64_createState().932*/933XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);934935/*!936* @brief Copies one @ref XXH64_state_t to another.937*938* @param dst_state The state to copy to.939* @param src_state The state to copy from.940* @pre941* @p dst_state and @p src_state must not be `NULL` and must not overlap.942*/943XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);944945/*!946* @brief Resets an @ref XXH64_state_t to begin a new hash.947*948* @param statePtr The state struct to reset.949* @param seed The 64-bit seed to alter the hash result predictably.950*951* @pre952* @p statePtr must not be `NULL`.953*954* @return @ref XXH_OK on success.955* @return @ref XXH_ERROR on failure.956*957* @note This function resets and seeds a state. Call it before @ref XXH64_update().958*/959XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);960961/*!962* @brief Consumes a block of @p input to an @ref XXH64_state_t.963*964* @param statePtr The state struct to update.965* @param input The block of data to be hashed, at least @p length bytes in size.966* @param length The length of @p input, in bytes.967*968* @pre969* @p statePtr must not be `NULL`.970* @pre971* The memory between @p input and @p input + @p length must be valid,972* readable, contiguous memory. However, if @p length is `0`, @p input may be973* `NULL`. 
In C++, this also must be *TriviallyCopyable*.974*975* @return @ref XXH_OK on success.976* @return @ref XXH_ERROR on failure.977*978* @note Call this to incrementally consume blocks of data.979*/980XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);981982/*!983* @brief Returns the calculated hash value from an @ref XXH64_state_t.984*985* @param statePtr The state struct to calculate the hash from.986*987* @pre988* @p statePtr must not be `NULL`.989*990* @return The calculated 64-bit xxHash64 value from that state.991*992* @note993* Calling XXH64_digest() will not affect @p statePtr, so you can update,994* digest, and update again.995*/996XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);997#endif /* !XXH_NO_STREAM */998/******* Canonical representation *******/9991000/*!1001* @brief Canonical (big endian) representation of @ref XXH64_hash_t.1002*/1003typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;10041005/*!1006* @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.1007*1008* @param dst The @ref XXH64_canonical_t pointer to be stored to.1009* @param hash The @ref XXH64_hash_t to be converted.1010*1011* @pre1012* @p dst must not be `NULL`.1013*1014* @see @ref canonical_representation_example "Canonical Representation Example"1015*/1016XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);10171018/*!1019* @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.1020*1021* @param src The @ref XXH64_canonical_t to convert.1022*1023* @pre1024* @p src must not be `NULL`.1025*1026* @return The converted hash.1027*1028* @see @ref canonical_representation_example "Canonical Representation Example"1029*/1030XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);10311032#ifndef XXH_NO_XXH310331034/*!1035* @}1036* ************************************************************************1037* @defgroup XXH3_family XXH3 family1038* @ingroup public1039* @{1040*1041* XXH3 is a more recent hash algorithm featuring:1042* - Improved speed for both small and large inputs1043* - True 64-bit and 128-bit outputs1044* - SIMD acceleration1045* - Improved 32-bit viability1046*1047* Speed analysis methodology is explained here:1048*1049* https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html1050*1051* Compared to XXH64, expect XXH3 to run approximately1052* ~2x faster on large inputs and >3x faster on small ones,1053* exact differences vary depending on platform.1054*1055* XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,1056* but does not require it.1057* Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH31058* at competitive speeds, even without vector support. Further details are1059* explained in the implementation.1060*1061* XXH3 has a fast scalar implementation, but it also includes accelerated SIMD1062* implementations for many common platforms:1063* - AVX5121064* - AVX21065* - SSE21066* - ARM NEON1067* - WebAssembly SIMD1281068* - POWER8 VSX1069* - s390x ZVector1070* This can be controlled via the @ref XXH_VECTOR macro, but it automatically1071* selects the best version according to predefined macros. 
For the x86 family, an1072* automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.1073*1074* XXH3 implementation is portable:1075* it has a generic C90 formulation that can be compiled on any platform,1076* all implementations generate exactly the same hash value on all platforms.1077* Starting from v0.8.0, it's also labelled "stable", meaning that1078* any future version will also generate the same hash value.1079*1080* XXH3 offers 2 variants, _64bits and _128bits.1081*1082* When only 64 bits are needed, prefer invoking the _64bits variant, as it1083* reduces the amount of mixing, resulting in faster speed on small inputs.1084* It's also generally simpler to manipulate a scalar return type than a struct.1085*1086* The API supports one-shot hashing, streaming mode, and custom secrets.1087*/1088/*-**********************************************************************1089* XXH3 64-bit variant1090************************************************************************/10911092/*!1093* @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.1094*1095* @param input The block of data to be hashed, at least @p length bytes in size.1096* @param length The length of @p input, in bytes.1097*1098* @pre1099* The memory between @p input and @p input + @p length must be valid,1100* readable, contiguous memory. However, if @p length is `0`, @p input may be1101* `NULL`. In C++, this also must be *TriviallyCopyable*.1102*1103* @return The calculated 64-bit XXH3 hash value.1104*1105* @note1106* This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however1107* it may have slightly better performance due to constant propagation of the1108* defaults.1109*1110* @see1111* XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants1112* @see @ref single_shot_example "Single Shot Example" for an example.1113*/1114XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);11151116/*!1117* @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.1118*1119* @param input The block of data to be hashed, at least @p length bytes in size.1120* @param length The length of @p input, in bytes.1121* @param seed The 64-bit seed to alter the hash result predictably.1122*1123* @pre1124* The memory between @p input and @p input + @p length must be valid,1125* readable, contiguous memory. However, if @p length is `0`, @p input may be1126* `NULL`. 
In C++, this also must be *TriviallyCopyable*.1127*1128* @return The calculated 64-bit XXH3 hash value.1129*1130* @note1131* seed == 0 produces the same results as @ref XXH3_64bits().1132*1133* This variant generates a custom secret on the fly based on default secret1134* altered using the @p seed value.1135*1136* While this operation is decently fast, note that it's not completely free.1137*1138* @see @ref single_shot_example "Single Shot Example" for an example.1139*/1140XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);11411142/*!1143* The bare minimum size for a custom secret.1144*1145* @see1146* XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),1147* XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().1148*/1149#define XXH3_SECRET_SIZE_MIN 13611501151/*!1152* @brief Calculates 64-bit variant of XXH3 with a custom "secret".1153*1154* @param data The block of data to be hashed, at least @p len bytes in size.1155* @param len The length of @p data, in bytes.1156* @param secret The secret data.1157* @param secretSize The length of @p secret, in bytes.1158*1159* @return The calculated 64-bit XXH3 hash value.1160*1161* @pre1162* The memory between @p data and @p data + @p len must be valid,1163* readable, contiguous memory. However, if @p length is `0`, @p data may be1164* `NULL`. In C++, this also must be *TriviallyCopyable*.1165*1166* It's possible to provide any blob of bytes as a "secret" to generate the hash.1167* This makes it more difficult for an external actor to prepare an intentional collision.1168* The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).1169* However, the quality of the secret impacts the dispersion of the hash algorithm.1170* Therefore, the secret _must_ look like a bunch of random bytes.1171* Avoid "trivial" or structured data such as repeated sequences or a text document.1172* Whenever in doubt about the "randomness" of the blob of bytes,1173* consider employing @ref XXH3_generateSecret() instead (see below).1174* It will generate a proper high entropy secret derived from the blob of bytes.1175* Another advantage of using XXH3_generateSecret() is that1176* it guarantees that all bits within the initial blob of bytes1177* will impact every bit of the output.1178* This is not necessarily the case when using the blob of bytes directly1179* because, when hashing _small_ inputs, only a portion of the secret is employed.1180*1181* @see @ref single_shot_example "Single Shot Example" for an example.1182*/1183XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);118411851186/******* Streaming *******/1187#ifndef XXH_NO_STREAM1188/*1189* Streaming requires state maintenance.1190* This operation costs memory and CPU.1191* As a consequence, streaming is slower than one-shot hashing.1192* For better performance, prefer one-shot functions whenever applicable.1193*/11941195/*!1196* @brief The opaque state struct for the XXH3 streaming API.1197*1198* @see XXH3_state_s for details.1199*/1200typedef struct XXH3_state_s XXH3_state_t;1201XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);1202XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);12031204/*!1205* @brief Copies one @ref XXH3_state_t to another.1206*1207* @param dst_state The state to copy to.1208* @param src_state The state to copy from.1209* @pre1210* @p 
dst_state and @p src_state must not be `NULL` and must not overlap.1211*/1212XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);12131214/*!1215* @brief Resets an @ref XXH3_state_t to begin a new hash.1216*1217* @param statePtr The state struct to reset.1218*1219* @pre1220* @p statePtr must not be `NULL`.1221*1222* @return @ref XXH_OK on success.1223* @return @ref XXH_ERROR on failure.1224*1225* @note1226* - This function resets `statePtr` and generate a secret with default parameters.1227* - Call this function before @ref XXH3_64bits_update().1228* - Digest will be equivalent to `XXH3_64bits()`.1229*1230*/1231XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);12321233/*!1234* @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.1235*1236* @param statePtr The state struct to reset.1237* @param seed The 64-bit seed to alter the hash result predictably.1238*1239* @pre1240* @p statePtr must not be `NULL`.1241*1242* @return @ref XXH_OK on success.1243* @return @ref XXH_ERROR on failure.1244*1245* @note1246* - This function resets `statePtr` and generate a secret from `seed`.1247* - Call this function before @ref XXH3_64bits_update().1248* - Digest will be equivalent to `XXH3_64bits_withSeed()`.1249*1250*/1251XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);12521253/*!1254* @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.1255*1256* @param statePtr The state struct to reset.1257* @param secret The secret data.1258* @param secretSize The length of @p secret, in bytes.1259*1260* @pre1261* @p statePtr must not be `NULL`.1262*1263* @return @ref XXH_OK on success.1264* @return @ref XXH_ERROR on failure.1265*1266* @note1267* `secret` is referenced, it _must outlive_ the hash streaming session.1268*1269* Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,1270* and the quality of produced hash values depends on secret's entropy1271* (secret's content should look like a bunch of random bytes).1272* When in doubt about the randomness of a candidate `secret`,1273* consider employing `XXH3_generateSecret()` instead (see below).1274*/1275XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);12761277/*!1278* @brief Consumes a block of @p input to an @ref XXH3_state_t.1279*1280* @param statePtr The state struct to update.1281* @param input The block of data to be hashed, at least @p length bytes in size.1282* @param length The length of @p input, in bytes.1283*1284* @pre1285* @p statePtr must not be `NULL`.1286* @pre1287* The memory between @p input and @p input + @p length must be valid,1288* readable, contiguous memory. However, if @p length is `0`, @p input may be1289* `NULL`. 
In C++, this also must be *TriviallyCopyable*.1290*1291* @return @ref XXH_OK on success.1292* @return @ref XXH_ERROR on failure.1293*1294* @note Call this to incrementally consume blocks of data.1295*/1296XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);12971298/*!1299* @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.1300*1301* @param statePtr The state struct to calculate the hash from.1302*1303* @pre1304* @p statePtr must not be `NULL`.1305*1306* @return The calculated XXH3 64-bit hash value from that state.1307*1308* @note1309* Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,1310* digest, and update again.1311*/1312XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);1313#endif /* !XXH_NO_STREAM */13141315/* note : canonical representation of XXH3 is the same as XXH641316* since they both produce XXH64_hash_t values */131713181319/*-**********************************************************************1320* XXH3 128-bit variant1321************************************************************************/13221323/*!1324* @brief The return value from 128-bit hashes.1325*1326* Stored in little endian order, although the fields themselves are in native1327* endianness.1328*/1329typedef struct {1330XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */1331XXH64_hash_t high64; /*!< `value >> 64` */1332} XXH128_hash_t;13331334/*!1335* @brief Calculates 128-bit unseeded variant of XXH3 of @p data.1336*1337* @param data The block of data to be hashed, at least @p length bytes in size.1338* @param len The length of @p data, in bytes.1339*1340* @return The calculated 128-bit variant of XXH3 value.1341*1342* The 128-bit variant of XXH3 has more strength, but it has a bit of overhead1343* for shorter inputs.1344*1345* This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however1346* it may have slightly better performance due to constant propagation of the1347* defaults.1348*1349* @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants1350* @see @ref single_shot_example "Single Shot Example" for an example.1351*/1352XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);1353/*! 
@brief Calculates 128-bit seeded variant of XXH3 hash of @p data.1354*1355* @param data The block of data to be hashed, at least @p length bytes in size.1356* @param len The length of @p data, in bytes.1357* @param seed The 64-bit seed to alter the hash result predictably.1358*1359* @return The calculated 128-bit variant of XXH3 value.1360*1361* @note1362* seed == 0 produces the same results as @ref XXH3_64bits().1363*1364* This variant generates a custom secret on the fly based on default secret1365* altered using the @p seed value.1366*1367* While this operation is decently fast, note that it's not completely free.1368*1369* @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants1370* @see @ref single_shot_example "Single Shot Example" for an example.1371*/1372XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);1373/*!1374* @brief Calculates 128-bit variant of XXH3 with a custom "secret".1375*1376* @param data The block of data to be hashed, at least @p len bytes in size.1377* @param len The length of @p data, in bytes.1378* @param secret The secret data.1379* @param secretSize The length of @p secret, in bytes.1380*1381* @return The calculated 128-bit variant of XXH3 value.1382*1383* It's possible to provide any blob of bytes as a "secret" to generate the hash.1384* This makes it more difficult for an external actor to prepare an intentional collision.1385* The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).1386* However, the quality of the secret impacts the dispersion of the hash algorithm.1387* Therefore, the secret _must_ look like a bunch of random bytes.1388* Avoid "trivial" or structured data such as repeated sequences or a text document.1389* Whenever in doubt about the "randomness" of the blob of bytes,1390* consider employing @ref XXH3_generateSecret() instead (see below).1391* It will generate a proper high entropy secret derived from the blob of bytes.1392* Another advantage of using XXH3_generateSecret() is that1393* it guarantees that all bits within the initial blob of bytes1394* will impact every bit of the output.1395* This is not necessarily the case when using the blob of bytes directly1396* because, when hashing _small_ inputs, only a portion of the secret is employed.1397*1398* @see @ref single_shot_example "Single Shot Example" for an example.1399*/1400XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);14011402/******* Streaming *******/1403#ifndef XXH_NO_STREAM1404/*1405* Streaming requires state maintenance.1406* This operation costs memory and CPU.1407* As a consequence, streaming is slower than one-shot hashing.1408* For better performance, prefer one-shot functions whenever applicable.1409*1410* XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().1411* Use already declared XXH3_createState() and XXH3_freeState().1412*1413* All reset and streaming functions have same meaning as their 64-bit counterpart.1414*/14151416/*!1417* @brief Resets an @ref XXH3_state_t to begin a new hash.1418*1419* @param statePtr The state struct to reset.1420*1421* @pre1422* @p statePtr must not be `NULL`.1423*1424* @return @ref XXH_OK on success.1425* @return @ref XXH_ERROR on failure.1426*1427* @note1428* - This function resets `statePtr` and generate a secret with default parameters.1429* - Call it before @ref XXH3_128bits_update().1430* - 
Digest will be equivalent to `XXH3_128bits()`.1431*/1432XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);14331434/*!1435* @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.1436*1437* @param statePtr The state struct to reset.1438* @param seed The 64-bit seed to alter the hash result predictably.1439*1440* @pre1441* @p statePtr must not be `NULL`.1442*1443* @return @ref XXH_OK on success.1444* @return @ref XXH_ERROR on failure.1445*1446* @note1447* - This function resets `statePtr` and generate a secret from `seed`.1448* - Call it before @ref XXH3_128bits_update().1449* - Digest will be equivalent to `XXH3_128bits_withSeed()`.1450*/1451XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);1452/*!1453* @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.1454*1455* @param statePtr The state struct to reset.1456* @param secret The secret data.1457* @param secretSize The length of @p secret, in bytes.1458*1459* @pre1460* @p statePtr must not be `NULL`.1461*1462* @return @ref XXH_OK on success.1463* @return @ref XXH_ERROR on failure.1464*1465* `secret` is referenced, it _must outlive_ the hash streaming session.1466* Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,1467* and the quality of produced hash values depends on secret's entropy1468* (secret's content should look like a bunch of random bytes).1469* When in doubt about the randomness of a candidate `secret`,1470* consider employing `XXH3_generateSecret()` instead (see below).1471*/1472XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);14731474/*!1475* @brief Consumes a block of @p input to an @ref XXH3_state_t.1476*1477* Call this to incrementally consume blocks of data.1478*1479* @param statePtr The state struct to update.1480* @param input The block of data to be hashed, at least @p length bytes in size.1481* @param length The length of @p input, in bytes.1482*1483* @pre1484* @p statePtr must not be `NULL`.1485*1486* @return @ref XXH_OK on success.1487* @return @ref XXH_ERROR on failure.1488*1489* @note1490* The memory between @p input and @p input + @p length must be valid,1491* readable, contiguous memory. However, if @p length is `0`, @p input may be1492* `NULL`. 
In C++, this also must be *TriviallyCopyable*.1493*1494*/1495XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);14961497/*!1498* @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.1499*1500* @param statePtr The state struct to calculate the hash from.1501*1502* @pre1503* @p statePtr must not be `NULL`.1504*1505* @return The calculated XXH3 128-bit hash value from that state.1506*1507* @note1508* Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,1509* digest, and update again.1510*1511*/1512XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);1513#endif /* !XXH_NO_STREAM */15141515/* Following helper functions make it possible to compare XXH128_hast_t values.1516* Since XXH128_hash_t is a structure, this capability is not offered by the language.1517* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */15181519/*!1520* @brief Check equality of two XXH128_hash_t values1521*1522* @param h1 The 128-bit hash value.1523* @param h2 Another 128-bit hash value.1524*1525* @return `1` if `h1` and `h2` are equal.1526* @return `0` if they are not.1527*/1528XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);15291530/*!1531* @brief Compares two @ref XXH128_hash_t1532*1533* This comparator is compatible with stdlib's `qsort()`/`bsearch()`.1534*1535* @param h128_1 Left-hand side value1536* @param h128_2 Right-hand side value1537*1538* @return >0 if @p h128_1 > @p h128_21539* @return =0 if @p h128_1 == @p h128_21540* @return <0 if @p h128_1 < @p h128_21541*/1542XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);154315441545/******* Canonical representation *******/1546typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;154715481549/*!1550* @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.1551*1552* @param dst The @ref XXH128_canonical_t pointer to be stored to.1553* @param hash The @ref XXH128_hash_t to be converted.1554*1555* @pre1556* @p dst must not be `NULL`.1557* @see @ref canonical_representation_example "Canonical Representation Example"1558*/1559XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);15601561/*!1562* @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.1563*1564* @param src The @ref XXH128_canonical_t to convert.1565*1566* @pre1567* @p src must not be `NULL`.1568*1569* @return The converted hash.1570* @see @ref canonical_representation_example "Canonical Representation Example"1571*/1572XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);157315741575#endif /* !XXH_NO_XXH3 */15761577#if defined (__cplusplus)1578} /* extern "C" */1579#endif15801581#endif /* XXH_NO_LONG_LONG */15821583/*!1584* @}1585*/1586#endif /* XXHASH_H_5627135585666179 */1587158815891590#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)1591#define XXHASH_H_STATIC_138792387421592/* ****************************************************************************1593* This section contains declarations which are not guaranteed to remain stable.1594* They may change in future versions, becoming incompatible with a different1595* version of the library.1596* These declarations should only be used with static 
linking.1597* Never use them in association with dynamic linking!1598***************************************************************************** */15991600/*1601* These definitions are only present to allow static allocation1602* of XXH states, on stack or in a struct, for example.1603* Never **ever** access their members directly.1604*/16051606/*!1607* @internal1608* @brief Structure for XXH32 streaming API.1609*1610* @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,1611* @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is1612* an opaque type. This allows fields to safely be changed.1613*1614* Typedef'd to @ref XXH32_state_t.1615* Do not access the members of this struct directly.1616* @see XXH64_state_s, XXH3_state_s1617*/1618struct XXH32_state_s {1619XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */1620XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */1621XXH32_hash_t v[4]; /*!< Accumulator lanes */1622XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */1623XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */1624XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */1625}; /* typedef'd to XXH32_state_t */162616271628#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */16291630/*!1631* @internal1632* @brief Structure for XXH64 streaming API.1633*1634* @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,1635* @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is1636* an opaque type. This allows fields to safely be changed.1637*1638* Typedef'd to @ref XXH64_state_t.1639* Do not access the members of this struct directly.1640* @see XXH32_state_s, XXH3_state_s1641*/1642struct XXH64_state_s {1643XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */1644XXH64_hash_t v[4]; /*!< Accumulator lanes */1645XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */1646XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */1647XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/1648XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */1649}; /* typedef'd to XXH64_state_t */16501651#ifndef XXH_NO_XXH316521653#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */1654# include <stdalign.h>1655# define XXH_ALIGN(n) alignas(n)1656#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */1657/* In C++ alignas() is a keyword */1658# define XXH_ALIGN(n) alignas(n)1659#elif defined(__GNUC__)1660# define XXH_ALIGN(n) __attribute__ ((aligned(n)))1661#elif defined(_MSC_VER)1662# define XXH_ALIGN(n) __declspec(align(n))1663#else1664# define XXH_ALIGN(n) /* disabled */1665#endif16661667/* Old GCC versions only accept the attribute after the type in structures. */1668#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \1669&& ! 
(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \1670&& defined(__GNUC__)1671# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)1672#else1673# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type1674#endif16751676/*!1677* @brief The size of the internal XXH3 buffer.1678*1679* This is the optimal update size for incremental hashing.1680*1681* @see XXH3_64b_update(), XXH3_128b_update().1682*/1683#define XXH3_INTERNALBUFFER_SIZE 25616841685/*!1686* @internal1687* @brief Default size of the secret buffer (and @ref XXH3_kSecret).1688*1689* This is the size used in @ref XXH3_kSecret and the seeded functions.1690*1691* Not to be confused with @ref XXH3_SECRET_SIZE_MIN.1692*/1693#define XXH3_SECRET_DEFAULT_SIZE 19216941695/*!1696* @internal1697* @brief Structure for XXH3 streaming API.1698*1699* @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,1700* @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.1701* Otherwise it is an opaque type.1702* Never use this definition in combination with dynamic library.1703* This allows fields to safely be changed in the future.1704*1705* @note ** This structure has a strict alignment requirement of 64 bytes!! **1706* Do not allocate this with `malloc()` or `new`,1707* it will not be sufficiently aligned.1708* Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.1709*1710* Typedef'd to @ref XXH3_state_t.1711* Do never access the members of this struct directly.1712*1713* @see XXH3_INITSTATE() for stack initialization.1714* @see XXH3_createState(), XXH3_freeState().1715* @see XXH32_state_s, XXH64_state_s1716*/1717struct XXH3_state_s {1718XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);1719/*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */1720XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);1721/*!< Used to store a custom secret generated from a seed. */1722XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);1723/*!< The internal buffer. @see XXH32_state_s::mem32 */1724XXH32_hash_t bufferedSize;1725/*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */1726XXH32_hash_t useSeed;1727/*!< Reserved field. Needed for padding on 64-bit. */1728size_t nbStripesSoFar;1729/*!< Number or stripes processed. */1730XXH64_hash_t totalLen;1731/*!< Total length hashed. 64-bit even on 32-bit targets. */1732size_t nbStripesPerBlock;1733/*!< Number of stripes per block. */1734size_t secretLimit;1735/*!< Size of @ref customSecret or @ref extSecret */1736XXH64_hash_t seed;1737/*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */1738XXH64_hash_t reserved64;1739/*!< Reserved field. */1740const unsigned char* extSecret;1741/*!< Reference to an external secret for the _withSecret variants, NULL1742* for other variants. 
*/1743/* note: there may be some padding at the end due to alignment on 64 bytes */1744}; /* typedef'd to XXH3_state_t */17451746#undef XXH_ALIGN_MEMBER17471748/*!1749* @brief Initializes a stack-allocated `XXH3_state_s`.1750*1751* When the @ref XXH3_state_t structure is merely emplaced on stack,1752* it should be initialized with XXH3_INITSTATE() or a memset()1753* in case its first reset uses XXH3_NNbits_reset_withSeed().1754* This init can be omitted if the first reset uses default or _withSecret mode.1755* This operation isn't necessary when the state is created with XXH3_createState().1756* Note that this doesn't prepare the state for a streaming operation,1757* it's still necessary to use XXH3_NNbits_reset*() afterwards.1758*/1759#define XXH3_INITSTATE(XXH3_state_ptr) \1760do { \1761XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \1762tmp_xxh3_state_ptr->seed = 0; \1763tmp_xxh3_state_ptr->extSecret = NULL; \1764} while(0)176517661767#if defined (__cplusplus)1768extern "C" {1769#endif17701771/*!1772* @brief Calculates the 128-bit hash of @p data using XXH3.1773*1774* @param data The block of data to be hashed, at least @p len bytes in size.1775* @param len The length of @p data, in bytes.1776* @param seed The 64-bit seed to alter the hash's output predictably.1777*1778* @pre1779* The memory between @p data and @p data + @p len must be valid,1780* readable, contiguous memory. However, if @p len is `0`, @p data may be1781* `NULL`. In C++, this also must be *TriviallyCopyable*.1782*1783* @return The calculated 128-bit XXH3 value.1784*1785* @see @ref single_shot_example "Single Shot Example" for an example.1786*/1787XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);178817891790/* === Experimental API === */1791/* Symbols defined below must be considered tied to a specific library version. */17921793/*!1794* @brief Derive a high-entropy secret from any user-defined content, named customSeed.1795*1796* @param secretBuffer A writable buffer for derived high-entropy secret data.1797* @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_DEFAULT_SIZE.1798* @param customSeed A user-defined content.1799* @param customSeedSize Size of customSeed, in bytes.1800*1801* @return @ref XXH_OK on success.1802* @return @ref XXH_ERROR on failure.1803*1804* The generated secret can be used in combination with `*_withSecret()` functions.1805* The `_withSecret()` variants are useful to provide a higher level of protection1806* than 64-bit seed, as it becomes much more difficult for an external actor to1807* guess how to impact the calculation logic.1808*1809* The function accepts as input a custom seed of any length and any content,1810* and derives from it a high-entropy secret of length @p secretSize into an1811* already allocated buffer @p secretBuffer.1812*1813* The generated secret can then be used with any `*_withSecret()` variant.1814* The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),1815* @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()1816* are part of this list. They all accept a `secret` parameter1817* which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)1818* _and_ feature very high entropy (consist of random-looking bytes).1819* These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can1820* be employed to ensure proper quality.1821*1822* @p customSeed can be anything. 
It can have any size, even small ones,
 * and its content can be anything, even "poor entropy" sources such as a bunch
 * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
 *
 * @pre
 *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
 *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
 *
 * Example code:
 * @code{.c}
 *    #include <stdio.h>
 *    #include <stdlib.h>
 *    #include <string.h>
 *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
 *    #include "xxhash.h"
 *    // Hashes argv[2] using the entropy from argv[1].
 *    int main(int argc, char* argv[])
 *    {
 *        char secret[XXH3_SECRET_SIZE_MIN];
 *        if (argc != 3) { return 1; }
 *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
 *        XXH64_hash_t h = XXH3_64bits_withSecret(
 *             argv[2], strlen(argv[2]),
 *             secret, sizeof(secret)
 *        );
 *        printf("%016llx\n", (unsigned long long) h);
 *    }
 * @endcode
 */
XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);

/*!
 * @brief Generates the same secret as the _withSeed() variants.
 *
 * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
 * @param seed The 64-bit seed to alter the hash result predictably.
 *
 * The generated secret can be used in combination with
 * `*_withSecret()` and `_withSecretandSeed()` variants.
 *
 * Example C++ `std::string` hash class:
 * @code{.cpp}
 *    #include <string>
 *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
 *    #include "xxhash.h"
 *    // Slow, seeds each time
 *    class HashSlow {
 *        XXH64_hash_t seed;
 *    public:
 *        HashSlow(XXH64_hash_t s) : seed{s} {}
 *        size_t operator()(const std::string& x) const {
 *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
 *        }
 *    };
 *    // Fast, caches the seeded secret for future uses.
 *    class HashFast {
 *        unsigned char secret[XXH3_SECRET_SIZE_MIN];
 *    public:
 *        HashFast(XXH64_hash_t s) {
 *            XXH3_generateSecret_fromSeed(secret, s);
 *        }
 *        size_t operator()(const std::string& x) const {
 *            return size_t{
 *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
 *            };
 *        }
 *    };
 * @endcode
 */
XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
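
/*!
 * Illustrative usage sketch (editorial addition, not part of the upstream xxHash
 * documentation): caching the secret derived by XXH3_generateSecret_fromSeed()
 * and reusing it with the `_withSecretandSeed()` variant described below.
 * The xxHash functions and macros named here are declared in this header;
 * the `SeededSecret` struct and the two helper functions are hypothetical.
 *
 * @code{.c}
 * #define XXH_STATIC_LINKING_ONLY   // expose the unstable API (XXH3_SECRET_DEFAULT_SIZE)
 * #include "xxhash.h"
 *
 * typedef struct {
 *     // 192 bytes: large enough for the derived secret, and >= XXH3_SECRET_SIZE_MIN
 *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *     XXH64_hash_t  seed;
 * } SeededSecret;
 *
 * // Derive the seeded secret once.
 * static void seededSecret_init(SeededSecret* s, XXH64_hash_t seed)
 * {
 *     s->seed = seed;
 *     XXH3_generateSecret_fromSeed(s->secret, seed);
 * }
 *
 * // Hash any input with the cached secret: same result as
 * // XXH3_64bits_withSeed(data, len, s->seed), but the secret is not
 * // regenerated for every "large" input.
 * static XXH64_hash_t seededSecret_hash64(const SeededSecret* s,
 *                                         const void* data, size_t len)
 * {
 *     return XXH3_64bits_withSecretandSeed(data, len,
 *                                          s->secret, sizeof(s->secret),
 *                                          s->seed);
 * }
 * @endcode
 */
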
/*!
 * @brief Calculates the 64-bit seeded variant of the XXH3 hash of @p data.
 *
 * @param data The block of data to be hashed, at least @p len bytes in size.
 * @param len The length of @p data, in bytes.
 * @param secret The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed The 64-bit seed to alter the hash result predictably.
 *
 * @return The calculated 64-bit XXH3 hash value.
 *
 * These variants generate hash values using either
 * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
 * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
 *
 * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
 * `_withSeed()` has to generate the secret on the fly for "large" keys.
 * It's fast, but the cost can be perceptible for "not so large" keys (< 1 KB).
 * `_withSecret()` has to generate the masks on the fly for "small" keys,
 * which requires more instructions than the _withSeed() variants.
 * Therefore, the _withSecretandSeed() variants combine the best of both worlds.
 *
 * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
 * this variant produces *exactly* the same results as the `_withSeed()` variant,
 * hence offering only a pure speed benefit on "large" input,
 * by skipping the need to regenerate the secret for every large input.
 *
 * Another usage scenario is to hash the secret to a 64-bit hash value,
 * for example with XXH3_64bits(), which then becomes the seed,
 * and then employ both the seed and the secret in _withSecretandSeed().
 * On top of speed, an added benefit is that each bit in the secret
 * has a 50% chance to swap each bit in the output, via its impact on the seed.
 *
 * This is not guaranteed when using the secret directly in "small data" scenarios,
 * because only portions of the secret are employed for small data.
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
                              XXH_NOESCAPE const void* secret, size_t secretSize,
                              XXH64_hash_t seed);
/*!
 * @brief Calculates the 128-bit seeded variant of the XXH3 hash of @p input.
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param secret The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed64 The 64-bit seed to alter the hash result predictably.
 *
 * @return The calculated 128-bit XXH3 hash value.
 *
 * @see XXH3_64bits_withSecretandSeed()
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
                               XXH_NOESCAPE const void* secret, size_t secretSize,
                               XXH64_hash_t seed64);
#ifndef XXH_NO_STREAM
/*!
 * @brief Resets an @ref XXH3_state_t with secret data to begin a new 64-bit hash.
 *
 * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
 * @param secret The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed64 The 64-bit seed to alter the hash result predictably.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @see XXH3_64bits_withSecretandSeed()
 */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
                                    XXH_NOESCAPE const void* secret, size_t secretSize,
                                    XXH64_hash_t seed64);
/*!
 * @brief Resets an @ref XXH3_state_t with secret data to begin a new 128-bit hash.
 *
 * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
 * @param secret The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed64 The 64-bit seed to alter the hash result predictably.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @see XXH3_64bits_withSecretandSeed()
 */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
                                     XXH_NOESCAPE const void* secret, size_t secretSize,
                                     XXH64_hash_t seed64);
#endif /* !XXH_NO_STREAM */

#if defined (__cplusplus)
} /* extern "C" */
#endif

#endif /* !XXH_NO_XXH3 */
#endif /* XXH_NO_LONG_LONG */

#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#  define XXH_IMPLEMENTATION
#endif

#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */


/* ======================================================================== */
/* 
======================================================================== */2002/* ======================================================================== */200320042005/*-**********************************************************************2006* xxHash implementation2007*-**********************************************************************2008* xxHash's implementation used to be hosted inside xxhash.c.2009*2010* However, inlining requires implementation to be visible to the compiler,2011* hence be included alongside the header.2012* Previously, implementation was hosted inside xxhash.c,2013* which was then #included when inlining was activated.2014* This construction created issues with a few build and install systems,2015* as it required xxhash.c to be stored in /include directory.2016*2017* xxHash implementation is now directly integrated within xxhash.h.2018* As a consequence, xxhash.c is no longer needed in /include.2019*2020* xxhash.c is still available and is still useful.2021* In a "normal" setup, when xxhash is not inlined,2022* xxhash.h only exposes the prototypes and public symbols,2023* while xxhash.c can be built into an object file xxhash.o2024* which can then be linked into the final binary.2025************************************************************************/20262027#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \2028|| defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)2029# define XXH_IMPLEM_13a873738720302031/* *************************************2032* Tuning parameters2033***************************************/20342035/*!2036* @defgroup tuning Tuning parameters2037* @{2038*2039* Various macros to control xxHash's behavior.2040*/2041#ifdef XXH_DOXYGEN2042/*!2043* @brief Define this to disable 64-bit code.2044*2045* Useful if only using the @ref XXH32_family and you have a strict C90 compiler.2046*/2047# define XXH_NO_LONG_LONG2048# undef XXH_NO_LONG_LONG /* don't actually */2049/*!2050* @brief Controls how unaligned memory is accessed.2051*2052* By default, access to unaligned memory is controlled by `memcpy()`, which is2053* safe and portable.2054*2055* Unfortunately, on some target/compiler combinations, the generated assembly2056* is sub-optimal.2057*2058* The below switch allow selection of a different access method2059* in the search for improved performance.2060*2061* @par Possible options:2062*2063* - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`2064* @par2065* Use `memcpy()`. Safe and portable. Note that most modern compilers will2066* eliminate the function call and treat it as an unaligned access.2067*2068* - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`2069* @par2070* Depends on compiler extensions and is therefore not portable.2071* This method is safe _if_ your compiler supports it,2072* and *generally* as fast or faster than `memcpy`.2073*2074* - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast2075* @par2076* Casts directly and dereferences. This method doesn't depend on the2077* compiler, but it violates the C standard as it directly dereferences an2078* unaligned pointer. It can generate buggy code on targets which do not2079* support unaligned memory accesses, but in some circumstances, it's the2080* only known way to get the most performance.2081*2082* - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift2083* @par2084* Also portable. 
This can generate the best code on old compilers which don't2085* inline small `memcpy()` calls, and it might also be faster on big-endian2086* systems which lack a native byteswap instruction. However, some compilers2087* will emit literal byteshifts even if the target supports unaligned access.2088*2089*2090* @warning2091* Methods 1 and 2 rely on implementation-defined behavior. Use these with2092* care, as what works on one compiler/platform/optimization level may cause2093* another to read garbage data or even crash.2094*2095* See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.2096*2097* Prefer these methods in priority order (0 > 3 > 1 > 2)2098*/2099# define XXH_FORCE_MEMORY_ACCESS 021002101/*!2102* @def XXH_SIZE_OPT2103* @brief Controls how much xxHash optimizes for size.2104*2105* xxHash, when compiled, tends to result in a rather large binary size. This2106* is mostly due to heavy usage to forced inlining and constant folding of the2107* @ref XXH3_family to increase performance.2108*2109* However, some developers prefer size over speed. This option can2110* significantly reduce the size of the generated code. When using the `-Os`2111* or `-Oz` options on GCC or Clang, this is defined to 1 by default,2112* otherwise it is defined to 0.2113*2114* Most of these size optimizations can be controlled manually.2115*2116* This is a number from 0-2.2117* - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed2118* comes first.2119* - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more2120* conservative and disables hacks that increase code size. It implies the2121* options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,2122* and @ref XXH3_NEON_LANES == 8 if they are not already defined.2123* - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.2124* Performance may cry. For example, the single shot functions just use the2125* streaming API.2126*/2127# define XXH_SIZE_OPT 021282129/*!2130* @def XXH_FORCE_ALIGN_CHECK2131* @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()2132* and XXH64() only).2133*2134* This is an important performance trick for architectures without decent2135* unaligned memory access performance.2136*2137* It checks for input alignment, and when conditions are met, uses a "fast2138* path" employing direct 32-bit/64-bit reads, resulting in _dramatically2139* faster_ read speed.2140*2141* The check costs one initial branch per hash, which is generally negligible,2142* but not zero.2143*2144* Moreover, it's not useful to generate an additional code path if memory2145* access uses the same instruction for both aligned and unaligned2146* addresses (e.g. 
x86 and aarch64).2147*2148* In these cases, the alignment check can be removed by setting this macro to 0.2149* Then the code will always use unaligned memory access.2150* Align check is automatically disabled on x86, x64, ARM64, and some ARM chips2151* which are platforms known to offer good unaligned memory accesses performance.2152*2153* It is also disabled by default when @ref XXH_SIZE_OPT >= 1.2154*2155* This option does not affect XXH3 (only XXH32 and XXH64).2156*/2157# define XXH_FORCE_ALIGN_CHECK 021582159/*!2160* @def XXH_NO_INLINE_HINTS2161* @brief When non-zero, sets all functions to `static`.2162*2163* By default, xxHash tries to force the compiler to inline almost all internal2164* functions.2165*2166* This can usually improve performance due to reduced jumping and improved2167* constant folding, but significantly increases the size of the binary which2168* might not be favorable.2169*2170* Additionally, sometimes the forced inlining can be detrimental to performance,2171* depending on the architecture.2172*2173* XXH_NO_INLINE_HINTS marks all internal functions as static, giving the2174* compiler full control on whether to inline or not.2175*2176* When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if2177* @ref XXH_SIZE_OPT >= 1, this will automatically be defined.2178*/2179# define XXH_NO_INLINE_HINTS 021802181/*!2182* @def XXH3_INLINE_SECRET2183* @brief Determines whether to inline the XXH3 withSecret code.2184*2185* When the secret size is known, the compiler can improve the performance2186* of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().2187*2188* However, if the secret size is not known, it doesn't have any benefit. This2189* happens when xxHash is compiled into a global symbol. Therefore, if2190* @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.2191*2192* Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers2193* that are *sometimes* force inline on -Og, and it is impossible to automatically2194* detect this optimization level.2195*/2196# define XXH3_INLINE_SECRET 021972198/*!2199* @def XXH32_ENDJMP2200* @brief Whether to use a jump for `XXH32_finalize`.2201*2202* For performance, `XXH32_finalize` uses multiple branches in the finalizer.2203* This is generally preferable for performance,2204* but depending on exact architecture, a jmp may be preferable.2205*2206* This setting is only possibly making a difference for very small inputs.2207*/2208# define XXH32_ENDJMP 022092210/*!2211* @internal2212* @brief Redefines old internal names.2213*2214* For compatibility with code that uses xxHash's internals before the names2215* were changed to improve namespacing. There is no other reason to use this.2216*/2217# define XXH_OLD_NAMES2218# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */22192220/*!2221* @def XXH_NO_STREAM2222* @brief Disables the streaming API.2223*2224* When xxHash is not inlined and the streaming functions are not used, disabling2225* the streaming functions can improve code size significantly, especially with2226* the @ref XXH3_family which tends to make constant folded copies of itself.2227*/2228# define XXH_NO_STREAM2229# undef XXH_NO_STREAM /* don't actually */2230#endif /* XXH_DOXYGEN */2231/*!2232* @}2233*/22342235#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */2236/* prefer __packed__ structures (method 1) for GCC2237* < ARMv7 with unaligned access (e.g. 
Raspbian armhf) still uses byte shifting, so we use memcpy2238* which for some reason does unaligned loads. */2239# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))2240# define XXH_FORCE_MEMORY_ACCESS 12241# endif2242#endif22432244#ifndef XXH_SIZE_OPT2245/* default to 1 for -Os or -Oz */2246# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)2247# define XXH_SIZE_OPT 12248# else2249# define XXH_SIZE_OPT 02250# endif2251#endif22522253#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */2254/* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */2255# if XXH_SIZE_OPT >= 1 || \2256defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \2257|| defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */2258# define XXH_FORCE_ALIGN_CHECK 02259# else2260# define XXH_FORCE_ALIGN_CHECK 12261# endif2262#endif22632264#ifndef XXH_NO_INLINE_HINTS2265# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */2266# define XXH_NO_INLINE_HINTS 12267# else2268# define XXH_NO_INLINE_HINTS 02269# endif2270#endif22712272#ifndef XXH3_INLINE_SECRET2273# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \2274|| !defined(XXH_INLINE_ALL)2275# define XXH3_INLINE_SECRET 02276# else2277# define XXH3_INLINE_SECRET 12278# endif2279#endif22802281#ifndef XXH32_ENDJMP2282/* generally preferable for performance */2283# define XXH32_ENDJMP 02284#endif22852286/*!2287* @defgroup impl Implementation2288* @{2289*/22902291/* *************************************2292* Includes & Memory related functions2293***************************************/2294#include <string.h> /* memcmp, memcpy */2295#include <limits.h> /* ULLONG_MAX */22962297#if defined(XXH_NO_STREAM)2298/* nothing */2299#elif defined(XXH_NO_STDLIB)23002301/* When requesting to disable any mention of stdlib,2302* the library loses the ability to invoked malloc / free.2303* In practice, it means that functions like `XXH*_createState()`2304* will always fail, and return NULL.2305* This flag is useful in situations where2306* xxhash.h is integrated into some kernel, embedded or limited environment2307* without access to dynamic allocation.2308*/23092310#if defined (__cplusplus)2311extern "C" {2312#endif23132314static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }2315static void XXH_free(void* p) { (void)p; }23162317#if defined (__cplusplus)2318} /* extern "C" */2319#endif23202321#else23222323/*2324* Modify the local functions below should you wish to use2325* different memory routines for malloc() and free()2326*/2327#include <stdlib.h>23282329#if defined (__cplusplus)2330extern "C" {2331#endif2332/*!2333* @internal2334* @brief Modify this function to use a different routine than malloc().2335*/2336static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }23372338/*!2339* @internal2340* @brief Modify this function to use a different routine than free().2341*/2342static void XXH_free(void* p) { free(p); }23432344#if defined (__cplusplus)2345} /* extern "C" */2346#endif23472348#endif /* XXH_NO_STDLIB */23492350#if defined (__cplusplus)2351extern "C" {2352#endif2353/*!2354* @internal2355* @brief Modify this function to use a different routine than memcpy().2356*/2357static void* XXH_memcpy(void* dest, const void* src, size_t size)2358{2359return memcpy(dest,src,size);2360}23612362#if defined (__cplusplus)2363} /* extern "C" 
*/2364#endif23652366/* *************************************2367* Compiler Specific Options2368***************************************/2369#ifdef _MSC_VER /* Visual Studio warning fix */2370# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */2371#endif23722373#if XXH_NO_INLINE_HINTS /* disable inlining hints */2374# if defined(__GNUC__) || defined(__clang__)2375# define XXH_FORCE_INLINE static __attribute__((unused))2376# else2377# define XXH_FORCE_INLINE static2378# endif2379# define XXH_NO_INLINE static2380/* enable inlining hints */2381#elif defined(__GNUC__) || defined(__clang__)2382# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))2383# define XXH_NO_INLINE static __attribute__((noinline))2384#elif defined(_MSC_VER) /* Visual Studio */2385# define XXH_FORCE_INLINE static __forceinline2386# define XXH_NO_INLINE static __declspec(noinline)2387#elif defined (__cplusplus) \2388|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */2389# define XXH_FORCE_INLINE static inline2390# define XXH_NO_INLINE static2391#else2392# define XXH_FORCE_INLINE static2393# define XXH_NO_INLINE static2394#endif23952396#if XXH3_INLINE_SECRET2397# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE2398#else2399# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE2400#endif240124022403/* *************************************2404* Debug2405***************************************/2406/*!2407* @ingroup tuning2408* @def XXH_DEBUGLEVEL2409* @brief Sets the debugging level.2410*2411* XXH_DEBUGLEVEL is expected to be defined externally, typically via the2412* compiler's command line options. The value must be a number.2413*/2414#ifndef XXH_DEBUGLEVEL2415# ifdef DEBUGLEVEL /* backwards compat */2416# define XXH_DEBUGLEVEL DEBUGLEVEL2417# else2418# define XXH_DEBUGLEVEL 02419# endif2420#endif24212422#if (XXH_DEBUGLEVEL>=1)2423# include <assert.h> /* note: can still be disabled with NDEBUG */2424# define XXH_ASSERT(c) assert(c)2425#else2426# if defined(__INTEL_COMPILER)2427# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c))2428# else2429# define XXH_ASSERT(c) XXH_ASSUME(c)2430# endif2431#endif24322433/* note: use after variable declarations */2434#ifndef XXH_STATIC_ASSERT2435# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */2436# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)2437# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */2438# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)2439# else2440# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)2441# endif2442# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)2443#endif24442445/*!2446* @internal2447* @def XXH_COMPILER_GUARD(var)2448* @brief Used to prevent unwanted optimizations for @p var.2449*2450* It uses an empty GCC inline assembly statement with a register constraint2451* which forces @p var into a general purpose register (eg eax, ebx, ecx2452* on x86) and marks it as modified.2453*2454* This is used in a few places to avoid unwanted autovectorization (e.g.2455* XXH32_round()). 
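 *
 * Illustrative sketch (editorial addition): the guard is a no-op at run time,
 * but the optimizer must assume the variable may have changed. The helper
 * function below and its constant are hypothetical.
 * @code{.c}
 * static unsigned opaque_step(unsigned acc)
 * {
 *     acc *= 2246822519U;       // some work the compiler might try to vectorize
 *     XXH_COMPILER_GUARD(acc);  // expands to __asm__("" : "+r"(acc)) on GCC/Clang
 *     return acc;
 * }
 * @endcode
 *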
All vectorization we want is explicit via intrinsics,2456* and _usually_ isn't wanted elsewhere.2457*2458* We also use it to prevent unwanted constant folding for AArch64 in2459* XXH3_initCustomSecret_scalar().2460*/2461#if defined(__GNUC__) || defined(__clang__)2462# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))2463#else2464# define XXH_COMPILER_GUARD(var) ((void)0)2465#endif24662467/* Specifically for NEON vectors which use the "w" constraint, on2468* Clang. */2469#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)2470# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))2471#else2472# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)2473#endif24742475/* *************************************2476* Basic Types2477***************************************/2478#if !defined (__VMS) \2479&& (defined (__cplusplus) \2480|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )2481# ifdef _AIX2482# include <inttypes.h>2483# else2484# include <stdint.h>2485# endif2486typedef uint8_t xxh_u8;2487#else2488typedef unsigned char xxh_u8;2489#endif2490typedef XXH32_hash_t xxh_u32;24912492#ifdef XXH_OLD_NAMES2493# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"2494# define BYTE xxh_u82495# define U8 xxh_u82496# define U32 xxh_u322497#endif24982499#if defined (__cplusplus)2500extern "C" {2501#endif25022503/* *** Memory access *** */25042505/*!2506* @internal2507* @fn xxh_u32 XXH_read32(const void* ptr)2508* @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.2509*2510* Affected by @ref XXH_FORCE_MEMORY_ACCESS.2511*2512* @param ptr The pointer to read from.2513* @return The 32-bit native endian integer from the bytes at @p ptr.2514*/25152516/*!2517* @internal2518* @fn xxh_u32 XXH_readLE32(const void* ptr)2519* @brief Reads an unaligned 32-bit little endian integer from @p ptr.2520*2521* Affected by @ref XXH_FORCE_MEMORY_ACCESS.2522*2523* @param ptr The pointer to read from.2524* @return The 32-bit little endian integer from the bytes at @p ptr.2525*/25262527/*!2528* @internal2529* @fn xxh_u32 XXH_readBE32(const void* ptr)2530* @brief Reads an unaligned 32-bit big endian integer from @p ptr.2531*2532* Affected by @ref XXH_FORCE_MEMORY_ACCESS.2533*2534* @param ptr The pointer to read from.2535* @return The 32-bit big endian integer from the bytes at @p ptr.2536*/25372538/*!2539* @internal2540* @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)2541* @brief Like @ref XXH_readLE32(), but has an option for aligned reads.2542*2543* Affected by @ref XXH_FORCE_MEMORY_ACCESS.2544* Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is2545* always @ref XXH_alignment::XXH_unaligned.2546*2547* @param ptr The pointer to read from.2548* @param align Whether @p ptr is aligned.2549* @pre2550* If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte2551* aligned.2552* @return The 32-bit little endian integer from the bytes at @p ptr.2553*/25542555#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))2556/*2557* Manual byteshift. Best for old compilers which don't inline memcpy.2558* We actually directly use XXH_readLE32 and XXH_readBE32.2559*/2560#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))25612562/*2563* Force direct memory access. 
Only works on CPU which support unaligned memory2564* access in hardware.2565*/2566static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }25672568#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))25692570/*2571* __attribute__((aligned(1))) is supported by gcc and clang. Originally the2572* documentation claimed that it only increased the alignment, but actually it2573* can decrease it on gcc, clang, and icc:2574* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,2575* https://gcc.godbolt.org/z/xYez1j67Y.2576*/2577#ifdef XXH_OLD_NAMES2578typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;2579#endif2580static xxh_u32 XXH_read32(const void* ptr)2581{2582typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;2583return *((const xxh_unalign32*)ptr);2584}25852586#else25872588/*2589* Portable and safe solution. Generally efficient.2590* see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html2591*/2592static xxh_u32 XXH_read32(const void* memPtr)2593{2594xxh_u32 val;2595XXH_memcpy(&val, memPtr, sizeof(val));2596return val;2597}25982599#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */260026012602/* *** Endianness *** */26032604/*!2605* @ingroup tuning2606* @def XXH_CPU_LITTLE_ENDIAN2607* @brief Whether the target is little endian.2608*2609* Defined to 1 if the target is little endian, or 0 if it is big endian.2610* It can be defined externally, for example on the compiler command line.2611*2612* If it is not defined,2613* a runtime check (which is usually constant folded) is used instead.2614*2615* @note2616* This is not necessarily defined to an integer constant.2617*2618* @see XXH_isLittleEndian() for the runtime check.2619*/2620#ifndef XXH_CPU_LITTLE_ENDIAN2621/*2622* Try to detect endianness automatically, to avoid the nonstandard behavior2623* in `XXH_isLittleEndian()`2624*/2625# if defined(_WIN32) /* Windows is always little endian */ \2626|| defined(__LITTLE_ENDIAN__) \2627|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)2628# define XXH_CPU_LITTLE_ENDIAN 12629# elif defined(__BIG_ENDIAN__) \2630|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)2631# define XXH_CPU_LITTLE_ENDIAN 02632# else2633/*!2634* @internal2635* @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.2636*2637* Most compilers will constant fold this.2638*/2639static int XXH_isLittleEndian(void)2640{2641/*2642* Portable and well-defined behavior.2643* Don't use static: it is detrimental to performance.2644*/2645const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };2646return one.c[0];2647}2648# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()2649# endif2650#endif26512652265326542655/* ****************************************2656* Compiler-specific Functions and Macros2657******************************************/2658#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)26592660#ifdef __has_builtin2661# define XXH_HAS_BUILTIN(x) __has_builtin(x)2662#else2663# define XXH_HAS_BUILTIN(x) 02664#endif2665266626672668/*2669* C23 and future versions have standard "unreachable()".2670* Once it has been implemented reliably we can add it as an2671* additional case:2672*2673* ```2674* #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)2675* # include <stddef.h>2676* # ifdef unreachable2677* # define XXH_UNREACHABLE() unreachable()2678* # endif2679* #endif2680* ```2681*2682* Note C++23 also has std::unreachable() which can be detected2683* as follows:2684* ```2685* #if 
defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)2686* # include <utility>2687* # define XXH_UNREACHABLE() std::unreachable()2688* #endif2689* ```2690* NB: `__cpp_lib_unreachable` is defined in the `<version>` header.2691* We don't use that as including `<utility>` in `extern "C"` blocks2692* doesn't work on GCC122693*/26942695#if XXH_HAS_BUILTIN(__builtin_unreachable)2696# define XXH_UNREACHABLE() __builtin_unreachable()26972698#elif defined(_MSC_VER)2699# define XXH_UNREACHABLE() __assume(0)27002701#else2702# define XXH_UNREACHABLE()2703#endif27042705#if XXH_HAS_BUILTIN(__builtin_assume)2706# define XXH_ASSUME(c) __builtin_assume(c)2707#else2708# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }2709#endif27102711/*!2712* @internal2713* @def XXH_rotl32(x,r)2714* @brief 32-bit rotate left.2715*2716* @param x The 32-bit integer to be rotated.2717* @param r The number of bits to rotate.2718* @pre2719* @p r > 0 && @p r < 322720* @note2721* @p x and @p r may be evaluated multiple times.2722* @return The rotated result.2723*/2724#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \2725&& XXH_HAS_BUILTIN(__builtin_rotateleft64)2726# define XXH_rotl32 __builtin_rotateleft322727# define XXH_rotl64 __builtin_rotateleft642728/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */2729#elif defined(_MSC_VER)2730# define XXH_rotl32(x,r) _rotl(x,r)2731# define XXH_rotl64(x,r) _rotl64(x,r)2732#else2733# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))2734# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))2735#endif27362737/*!2738* @internal2739* @fn xxh_u32 XXH_swap32(xxh_u32 x)2740* @brief A 32-bit byteswap.2741*2742* @param x The 32-bit integer to byteswap.2743* @return @p x, byteswapped.2744*/2745#if defined(_MSC_VER) /* Visual Studio */2746# define XXH_swap32 _byteswap_ulong2747#elif XXH_GCC_VERSION >= 4032748# define XXH_swap32 __builtin_bswap322749#else2750static xxh_u32 XXH_swap32 (xxh_u32 x)2751{2752return ((x << 24) & 0xff000000 ) |2753((x << 8) & 0x00ff0000 ) |2754((x >> 8) & 0x0000ff00 ) |2755((x >> 24) & 0x000000ff );2756}2757#endif275827592760/* ***************************2761* Memory reads2762*****************************/27632764/*!2765* @internal2766* @brief Enum to indicate whether a pointer is aligned.2767*/2768typedef enum {2769XXH_aligned, /*!< Aligned */2770XXH_unaligned /*!< Possibly unaligned */2771} XXH_alignment;27722773/*2774* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.2775*2776* This is ideal for older compilers which don't inline memcpy.2777*/2778#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))27792780XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)2781{2782const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2783return bytePtr[0]2784| ((xxh_u32)bytePtr[1] << 8)2785| ((xxh_u32)bytePtr[2] << 16)2786| ((xxh_u32)bytePtr[3] << 24);2787}27882789XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)2790{2791const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;2792return bytePtr[3]2793| ((xxh_u32)bytePtr[2] << 8)2794| ((xxh_u32)bytePtr[1] << 16)2795| ((xxh_u32)bytePtr[0] << 24);2796}27972798#else2799XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)2800{2801return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));2802}28032804static xxh_u32 XXH_readBE32(const void* ptr)2805{2806return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);2807}2808#endif28092810XXH_FORCE_INLINE xxh_u322811XXH_readLE32_align(const void* ptr, XXH_alignment align)2812{2813if (align==XXH_unaligned) {2814return XXH_readLE32(ptr);2815} else {2816return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);2817}2818}281928202821/* *************************************2822* Misc2823***************************************/2824/*! @ingroup public */2825XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }282628272828/* *******************************************************************2829* 32-bit hash functions2830*********************************************************************/2831/*!2832* @}2833* @defgroup XXH32_impl XXH32 implementation2834* @ingroup impl2835*2836* Details on the XXH32 implementation.2837* @{2838*/2839/* #define instead of static const, to be used as initializers */2840#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */2841#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */2842#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */2843#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */2844#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */28452846#ifdef XXH_OLD_NAMES2847# define PRIME32_1 XXH_PRIME32_12848# define PRIME32_2 XXH_PRIME32_22849# define PRIME32_3 XXH_PRIME32_32850# define PRIME32_4 XXH_PRIME32_42851# define PRIME32_5 XXH_PRIME32_52852#endif28532854/*!2855* @internal2856* @brief Normal stripe processing routine.2857*2858* This shuffles the bits so that any bit from @p input impacts several bits in2859* @p acc.2860*2861* @param acc The accumulator lane.2862* @param input The stripe of input to mix.2863* @return The mixed accumulator lane.2864*/2865static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)2866{2867acc += input * XXH_PRIME32_2;2868acc = XXH_rotl32(acc, 13);2869acc *= XXH_PRIME32_1;2870#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)2871/*2872* UGLY HACK:2873* A compiler fence is the only thing that prevents GCC and Clang from2874* autovectorizing the XXH32 loop (pragmas and attributes don't work for some2875* reason) without globally disabling SSE4.1.2876*2877* The reason we want to avoid vectorization is because despite working on2878* 4 integers at a time, there are multiple factors slowing XXH32 down on2879* SSE4:2880* - There's a ridiculous amount of lag from pmulld (10 cycles of latency on2881* newer chips!) making it slightly slower to multiply four integers at2882* once compared to four integers independently. Even when pmulld was2883* fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE2884* just to multiply unless doing a long operation.2885*2886* - Four instructions are required to rotate,2887* movqda tmp, v // not required with VEX encoding2888* pslld tmp, 13 // tmp <<= 132889* psrld v, 19 // x >>= 192890* por v, tmp // x |= tmp2891* compared to one for scalar:2892* roll v, 13 // reliably fast across the board2893* shldl v, v, 13 // Sandy Bridge and later prefer this for some reason2894*2895* - Instruction level parallelism is actually more beneficial here because2896* the SIMD actually serializes this operation: While v1 is rotating, v22897* can load data, while v3 can multiply. 
SSE forces them to operate2898* together.2899*2900* This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing2901* the loop. NEON is only faster on the A53, and with the newer cores, it is less2902* than half the speed.2903*2904* Additionally, this is used on WASM SIMD128 because it JITs to the same2905* SIMD instructions and has the same issue.2906*/2907XXH_COMPILER_GUARD(acc);2908#endif2909return acc;2910}29112912/*!2913* @internal2914* @brief Mixes all bits to finalize the hash.2915*2916* The final mix ensures that all input bits have a chance to impact any bit in2917* the output digest, resulting in an unbiased distribution.2918*2919* @param hash The hash to avalanche.2920* @return The avalanched hash.2921*/2922static xxh_u32 XXH32_avalanche(xxh_u32 hash)2923{2924hash ^= hash >> 15;2925hash *= XXH_PRIME32_2;2926hash ^= hash >> 13;2927hash *= XXH_PRIME32_3;2928hash ^= hash >> 16;2929return hash;2930}29312932#define XXH_get32bits(p) XXH_readLE32_align(p, align)29332934/*!2935* @internal2936* @brief Processes the last 0-15 bytes of @p ptr.2937*2938* There may be up to 15 bytes remaining to consume from the input.2939* This final stage will digest them to ensure that all input bytes are present2940* in the final mix.2941*2942* @param hash The hash to finalize.2943* @param ptr The pointer to the remaining input.2944* @param len The remaining length, modulo 16.2945* @param align Whether @p ptr is aligned.2946* @return The finalized hash.2947* @see XXH64_finalize().2948*/2949static XXH_PUREF xxh_u322950XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)2951{2952#define XXH_PROCESS1 do { \2953hash += (*ptr++) * XXH_PRIME32_5; \2954hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \2955} while (0)29562957#define XXH_PROCESS4 do { \2958hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \2959ptr += 4; \2960hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \2961} while (0)29622963if (ptr==NULL) XXH_ASSERT(len == 0);29642965/* Compact rerolled version; generally faster */2966if (!XXH32_ENDJMP) {2967len &= 15;2968while (len >= 4) {2969XXH_PROCESS4;2970len -= 4;2971}2972while (len > 0) {2973XXH_PROCESS1;2974--len;2975}2976return XXH32_avalanche(hash);2977} else {2978switch(len&15) /* or switch(bEnd - p) */ {2979case 12: XXH_PROCESS4;2980XXH_FALLTHROUGH; /* fallthrough */2981case 8: XXH_PROCESS4;2982XXH_FALLTHROUGH; /* fallthrough */2983case 4: XXH_PROCESS4;2984return XXH32_avalanche(hash);29852986case 13: XXH_PROCESS4;2987XXH_FALLTHROUGH; /* fallthrough */2988case 9: XXH_PROCESS4;2989XXH_FALLTHROUGH; /* fallthrough */2990case 5: XXH_PROCESS4;2991XXH_PROCESS1;2992return XXH32_avalanche(hash);29932994case 14: XXH_PROCESS4;2995XXH_FALLTHROUGH; /* fallthrough */2996case 10: XXH_PROCESS4;2997XXH_FALLTHROUGH; /* fallthrough */2998case 6: XXH_PROCESS4;2999XXH_PROCESS1;3000XXH_PROCESS1;3001return XXH32_avalanche(hash);30023003case 15: XXH_PROCESS4;3004XXH_FALLTHROUGH; /* fallthrough */3005case 11: XXH_PROCESS4;3006XXH_FALLTHROUGH; /* fallthrough */3007case 7: XXH_PROCESS4;3008XXH_FALLTHROUGH; /* fallthrough */3009case 3: XXH_PROCESS1;3010XXH_FALLTHROUGH; /* fallthrough */3011case 2: XXH_PROCESS1;3012XXH_FALLTHROUGH; /* fallthrough */3013case 1: XXH_PROCESS1;3014XXH_FALLTHROUGH; /* fallthrough */3015case 0: return XXH32_avalanche(hash);3016}3017XXH_ASSERT(0);3018return hash; /* reaching this point is deemed impossible */3019}3020}30213022#ifdef XXH_OLD_NAMES3023# define PROCESS1 XXH_PROCESS13024# define PROCESS4 XXH_PROCESS43025#else3026# undef XXH_PROCESS13027# undef 
XXH_PROCESS43028#endif30293030/*!3031* @internal3032* @brief The implementation for @ref XXH32().3033*3034* @param input , len , seed Directly passed from @ref XXH32().3035* @param align Whether @p input is aligned.3036* @return The calculated hash.3037*/3038XXH_FORCE_INLINE XXH_PUREF xxh_u323039XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)3040{3041xxh_u32 h32;30423043if (input==NULL) XXH_ASSERT(len == 0);30443045if (len>=16) {3046const xxh_u8* const bEnd = input + len;3047const xxh_u8* const limit = bEnd - 15;3048xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;3049xxh_u32 v2 = seed + XXH_PRIME32_2;3050xxh_u32 v3 = seed + 0;3051xxh_u32 v4 = seed - XXH_PRIME32_1;30523053do {3054v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;3055v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;3056v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;3057v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;3058} while (input < limit);30593060h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)3061+ XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);3062} else {3063h32 = seed + XXH_PRIME32_5;3064}30653066h32 += (xxh_u32)len;30673068return XXH32_finalize(h32, input, len&15, align);3069}30703071/*! @ingroup XXH32_family */3072XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)3073{3074#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 23075/* Simple version, good for code maintenance, but unfortunately slow for small inputs */3076XXH32_state_t state;3077XXH32_reset(&state, seed);3078XXH32_update(&state, (const xxh_u8*)input, len);3079return XXH32_digest(&state);3080#else3081if (XXH_FORCE_ALIGN_CHECK) {3082if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */3083return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);3084} }30853086return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);3087#endif3088}3089309030913092/******* Hash streaming *******/3093#ifndef XXH_NO_STREAM3094/*! @ingroup XXH32_family */3095XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)3096{3097return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));3098}3099/*! @ingroup XXH32_family */3100XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)3101{3102XXH_free(statePtr);3103return XXH_OK;3104}31053106/*! @ingroup XXH32_family */3107XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)3108{3109XXH_memcpy(dstState, srcState, sizeof(*dstState));3110}31113112/*! @ingroup XXH32_family */3113XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)3114{3115XXH_ASSERT(statePtr != NULL);3116memset(statePtr, 0, sizeof(*statePtr));3117statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;3118statePtr->v[1] = seed + XXH_PRIME32_2;3119statePtr->v[2] = seed + 0;3120statePtr->v[3] = seed - XXH_PRIME32_1;3121return XXH_OK;3122}312331243125/*! 
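 * Editorial sketch (not part of the upstream documentation): the XXH32 streaming
 * functions implemented below are typically driven as follows; the helper
 * function and its `chunk` / `chunkSize` arguments are hypothetical.
 * @code{.c}
 *   static XXH32_hash_t hash_one_chunk(const void* chunk, size_t chunkSize)
 *   {
 *       XXH32_hash_t h = 0;
 *       XXH32_state_t* const state = XXH32_createState();
 *       if (state != NULL) {
 *           XXH32_reset(state, 0);                  // seed = 0
 *           XXH32_update(state, chunk, chunkSize);  // repeat for each chunk
 *           h = XXH32_digest(state);                // state is not modified
 *           XXH32_freeState(state);
 *       }
 *       return h;
 *   }
 * @endcode
 *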
@ingroup XXH32_family */3126XXH_PUBLIC_API XXH_errorcode3127XXH32_update(XXH32_state_t* state, const void* input, size_t len)3128{3129if (input==NULL) {3130XXH_ASSERT(len == 0);3131return XXH_OK;3132}31333134{ const xxh_u8* p = (const xxh_u8*)input;3135const xxh_u8* const bEnd = p + len;31363137state->total_len_32 += (XXH32_hash_t)len;3138state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));31393140if (state->memsize + len < 16) { /* fill in tmp buffer */3141XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);3142state->memsize += (XXH32_hash_t)len;3143return XXH_OK;3144}31453146if (state->memsize) { /* some data left from previous update */3147XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);3148{ const xxh_u32* p32 = state->mem32;3149state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;3150state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;3151state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;3152state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));3153}3154p += 16-state->memsize;3155state->memsize = 0;3156}31573158if (p <= bEnd-16) {3159const xxh_u8* const limit = bEnd - 16;31603161do {3162state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;3163state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;3164state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;3165state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;3166} while (p<=limit);31673168}31693170if (p < bEnd) {3171XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));3172state->memsize = (unsigned)(bEnd-p);3173}3174}31753176return XXH_OK;3177}317831793180/*! @ingroup XXH32_family */3181XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)3182{3183xxh_u32 h32;31843185if (state->large_len) {3186h32 = XXH_rotl32(state->v[0], 1)3187+ XXH_rotl32(state->v[1], 7)3188+ XXH_rotl32(state->v[2], 12)3189+ XXH_rotl32(state->v[3], 18);3190} else {3191h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;3192}31933194h32 += state->total_len_32;31953196return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);3197}3198#endif /* !XXH_NO_STREAM */31993200/******* Canonical representation *******/32013202/*! @ingroup XXH32_family */3203XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)3204{3205XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));3206if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);3207XXH_memcpy(dst, &hash, sizeof(*dst));3208}3209/*! @ingroup XXH32_family */3210XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)3211{3212return XXH_readBE32(src);3213}321432153216#ifndef XXH_NO_LONG_LONG32173218/* *******************************************************************3219* 64-bit hash functions3220*********************************************************************/3221/*!3222* @}3223* @ingroup impl3224* @{3225*/3226/******* Memory access *******/32273228typedef XXH64_hash_t xxh_u64;32293230#ifdef XXH_OLD_NAMES3231# define U64 xxh_u643232#endif32333234#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))3235/*3236* Manual byteshift. Best for old compilers which don't inline memcpy.3237* We actually directly use XXH_readLE64 and XXH_readBE64.3238*/3239#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))32403241/* Force direct memory access. 
Only works on CPU which support unaligned memory access in hardware */3242static xxh_u64 XXH_read64(const void* memPtr)3243{3244return *(const xxh_u64*) memPtr;3245}32463247#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))32483249/*3250* __attribute__((aligned(1))) is supported by gcc and clang. Originally the3251* documentation claimed that it only increased the alignment, but actually it3252* can decrease it on gcc, clang, and icc:3253* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,3254* https://gcc.godbolt.org/z/xYez1j67Y.3255*/3256#ifdef XXH_OLD_NAMES3257typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;3258#endif3259static xxh_u64 XXH_read64(const void* ptr)3260{3261typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;3262return *((const xxh_unalign64*)ptr);3263}32643265#else32663267/*3268* Portable and safe solution. Generally efficient.3269* see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html3270*/3271static xxh_u64 XXH_read64(const void* memPtr)3272{3273xxh_u64 val;3274XXH_memcpy(&val, memPtr, sizeof(val));3275return val;3276}32773278#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */32793280#if defined(_MSC_VER) /* Visual Studio */3281# define XXH_swap64 _byteswap_uint643282#elif XXH_GCC_VERSION >= 4033283# define XXH_swap64 __builtin_bswap643284#else3285static xxh_u64 XXH_swap64(xxh_u64 x)3286{3287return ((x << 56) & 0xff00000000000000ULL) |3288((x << 40) & 0x00ff000000000000ULL) |3289((x << 24) & 0x0000ff0000000000ULL) |3290((x << 8) & 0x000000ff00000000ULL) |3291((x >> 8) & 0x00000000ff000000ULL) |3292((x >> 24) & 0x0000000000ff0000ULL) |3293((x >> 40) & 0x000000000000ff00ULL) |3294((x >> 56) & 0x00000000000000ffULL);3295}3296#endif329732983299/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */3300#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))33013302XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)3303{3304const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;3305return bytePtr[0]3306| ((xxh_u64)bytePtr[1] << 8)3307| ((xxh_u64)bytePtr[2] << 16)3308| ((xxh_u64)bytePtr[3] << 24)3309| ((xxh_u64)bytePtr[4] << 32)3310| ((xxh_u64)bytePtr[5] << 40)3311| ((xxh_u64)bytePtr[6] << 48)3312| ((xxh_u64)bytePtr[7] << 56);3313}33143315XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)3316{3317const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;3318return bytePtr[7]3319| ((xxh_u64)bytePtr[6] << 8)3320| ((xxh_u64)bytePtr[5] << 16)3321| ((xxh_u64)bytePtr[4] << 24)3322| ((xxh_u64)bytePtr[3] << 32)3323| ((xxh_u64)bytePtr[2] << 40)3324| ((xxh_u64)bytePtr[1] << 48)3325| ((xxh_u64)bytePtr[0] << 56);3326}33273328#else3329XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)3330{3331return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));3332}33333334static xxh_u64 XXH_readBE64(const void* ptr)3335{3336return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);3337}3338#endif33393340XXH_FORCE_INLINE xxh_u643341XXH_readLE64_align(const void* ptr, XXH_alignment align)3342{3343if (align==XXH_unaligned)3344return XXH_readLE64(ptr);3345else3346return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);3347}334833493350/******* xxh64 *******/3351/*!3352* @}3353* @defgroup XXH64_impl XXH64 implementation3354* @ingroup impl3355*3356* Details on the XXH64 implementation.3357* @{3358*/3359/* #define rather that static const, to be used as initializers */3360#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */3361#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */3362#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */3363#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */3364#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */33653366#ifdef XXH_OLD_NAMES3367# define PRIME64_1 XXH_PRIME64_13368# define PRIME64_2 XXH_PRIME64_23369# define PRIME64_3 XXH_PRIME64_33370# define PRIME64_4 XXH_PRIME64_43371# define PRIME64_5 XXH_PRIME64_53372#endif33733374/*! @copydoc XXH32_round */3375static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)3376{3377acc += input * XXH_PRIME64_2;3378acc = XXH_rotl64(acc, 31);3379acc *= XXH_PRIME64_1;3380#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)3381/*3382* DISABLE AUTOVECTORIZATION:3383* A compiler fence is used to prevent GCC and Clang from3384* autovectorizing the XXH64 loop (pragmas and attributes don't work for some3385* reason) without globally disabling AVX512.3386*3387* Autovectorization of XXH64 tends to be detrimental,3388* though the exact outcome may change depending on exact cpu and compiler version.3389* For information, it has been reported as detrimental for Skylake-X,3390* but possibly beneficial for Zen4.3391*3392* The default is to disable auto-vectorization,3393* but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.3394*/3395XXH_COMPILER_GUARD(acc);3396#endif3397return acc;3398}33993400static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)3401{3402val = XXH64_round(0, val);3403acc ^= val;3404acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;3405return acc;3406}34073408/*! 
@copydoc XXH32_avalanche */3409static xxh_u64 XXH64_avalanche(xxh_u64 hash)3410{3411hash ^= hash >> 33;3412hash *= XXH_PRIME64_2;3413hash ^= hash >> 29;3414hash *= XXH_PRIME64_3;3415hash ^= hash >> 32;3416return hash;3417}341834193420#define XXH_get64bits(p) XXH_readLE64_align(p, align)34213422/*!3423* @internal3424* @brief Processes the last 0-31 bytes of @p ptr.3425*3426* There may be up to 31 bytes remaining to consume from the input.3427* This final stage will digest them to ensure that all input bytes are present3428* in the final mix.3429*3430* @param hash The hash to finalize.3431* @param ptr The pointer to the remaining input.3432* @param len The remaining length, modulo 32.3433* @param align Whether @p ptr is aligned.3434* @return The finalized hash3435* @see XXH32_finalize().3436*/3437static XXH_PUREF xxh_u643438XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)3439{3440if (ptr==NULL) XXH_ASSERT(len == 0);3441len &= 31;3442while (len >= 8) {3443xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));3444ptr += 8;3445hash ^= k1;3446hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;3447len -= 8;3448}3449if (len >= 4) {3450hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;3451ptr += 4;3452hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;3453len -= 4;3454}3455while (len > 0) {3456hash ^= (*ptr++) * XXH_PRIME64_5;3457hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;3458--len;3459}3460return XXH64_avalanche(hash);3461}34623463#ifdef XXH_OLD_NAMES3464# define PROCESS1_64 XXH_PROCESS1_643465# define PROCESS4_64 XXH_PROCESS4_643466# define PROCESS8_64 XXH_PROCESS8_643467#else3468# undef XXH_PROCESS1_643469# undef XXH_PROCESS4_643470# undef XXH_PROCESS8_643471#endif34723473/*!3474* @internal3475* @brief The implementation for @ref XXH64().3476*3477* @param input , len , seed Directly passed from @ref XXH64().3478* @param align Whether @p input is aligned.3479* @return The calculated hash.3480*/3481XXH_FORCE_INLINE XXH_PUREF xxh_u643482XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)3483{3484xxh_u64 h64;3485if (input==NULL) XXH_ASSERT(len == 0);34863487if (len>=32) {3488const xxh_u8* const bEnd = input + len;3489const xxh_u8* const limit = bEnd - 31;3490xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;3491xxh_u64 v2 = seed + XXH_PRIME64_2;3492xxh_u64 v3 = seed + 0;3493xxh_u64 v4 = seed - XXH_PRIME64_1;34943495do {3496v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;3497v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;3498v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;3499v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;3500} while (input<limit);35013502h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);3503h64 = XXH64_mergeRound(h64, v1);3504h64 = XXH64_mergeRound(h64, v2);3505h64 = XXH64_mergeRound(h64, v3);3506h64 = XXH64_mergeRound(h64, v4);35073508} else {3509h64 = seed + XXH_PRIME64_5;3510}35113512h64 += (xxh_u64) len;35133514return XXH64_finalize(h64, input, len, align);3515}351635173518/*! 
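* Single-shot usage sketch (illustrative only; the message and seed shown here
* are arbitrary placeholders, not part of the library):
* @code{.c}
* const char msg[] = "example";
* XXH64_hash_t const h = XXH64(msg, sizeof(msg) - 1, 0);
* @endcode
*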
@ingroup XXH64_family */3519XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)3520{3521#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 23522/* Simple version, good for code maintenance, but unfortunately slow for small inputs */3523XXH64_state_t state;3524XXH64_reset(&state, seed);3525XXH64_update(&state, (const xxh_u8*)input, len);3526return XXH64_digest(&state);3527#else3528if (XXH_FORCE_ALIGN_CHECK) {3529if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */3530return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);3531} }35323533return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);35343535#endif3536}35373538/******* Hash Streaming *******/3539#ifndef XXH_NO_STREAM3540/*! @ingroup XXH64_family*/3541XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)3542{3543return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));3544}3545/*! @ingroup XXH64_family */3546XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)3547{3548XXH_free(statePtr);3549return XXH_OK;3550}35513552/*! @ingroup XXH64_family */3553XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)3554{3555XXH_memcpy(dstState, srcState, sizeof(*dstState));3556}35573558/*! @ingroup XXH64_family */3559XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)3560{3561XXH_ASSERT(statePtr != NULL);3562memset(statePtr, 0, sizeof(*statePtr));3563statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;3564statePtr->v[1] = seed + XXH_PRIME64_2;3565statePtr->v[2] = seed + 0;3566statePtr->v[3] = seed - XXH_PRIME64_1;3567return XXH_OK;3568}35693570/*! @ingroup XXH64_family */3571XXH_PUBLIC_API XXH_errorcode3572XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)3573{3574if (input==NULL) {3575XXH_ASSERT(len == 0);3576return XXH_OK;3577}35783579{ const xxh_u8* p = (const xxh_u8*)input;3580const xxh_u8* const bEnd = p + len;35813582state->total_len += len;35833584if (state->memsize + len < 32) { /* fill in tmp buffer */3585XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);3586state->memsize += (xxh_u32)len;3587return XXH_OK;3588}35893590if (state->memsize) { /* tmp buffer is full */3591XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);3592state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));3593state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));3594state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));3595state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));3596p += 32 - state->memsize;3597state->memsize = 0;3598}35993600if (p+32 <= bEnd) {3601const xxh_u8* const limit = bEnd - 32;36023603do {3604state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;3605state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;3606state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;3607state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;3608} while (p<=limit);36093610}36113612if (p < bEnd) {3613XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));3614state->memsize = (unsigned)(bEnd-p);3615}3616}36173618return XXH_OK;3619}362036213622/*! 
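* Streaming usage sketch (illustrative only: `buf`/`bufSize` stand for any
* caller-provided data, and error handling is omitted for brevity):
* @code{.c}
* XXH64_state_t* const st = XXH64_createState();
* XXH64_reset(st, 0);                        // start a session with seed 0
* XXH64_update(st, buf, bufSize);            // call as many times as needed
* XXH64_hash_t const h = XXH64_digest(st);   // does not modify the state
* XXH64_freeState(st);
* @endcode
*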
@ingroup XXH64_family */3623XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)3624{3625xxh_u64 h64;36263627if (state->total_len >= 32) {3628h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);3629h64 = XXH64_mergeRound(h64, state->v[0]);3630h64 = XXH64_mergeRound(h64, state->v[1]);3631h64 = XXH64_mergeRound(h64, state->v[2]);3632h64 = XXH64_mergeRound(h64, state->v[3]);3633} else {3634h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;3635}36363637h64 += (xxh_u64) state->total_len;36383639return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);3640}3641#endif /* !XXH_NO_STREAM */36423643/******* Canonical representation *******/36443645/*! @ingroup XXH64_family */3646XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)3647{3648XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));3649if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);3650XXH_memcpy(dst, &hash, sizeof(*dst));3651}36523653/*! @ingroup XXH64_family */3654XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)3655{3656return XXH_readBE64(src);3657}36583659#if defined (__cplusplus)3660}3661#endif36623663#ifndef XXH_NO_XXH336643665/* *********************************************************************3666* XXH33667* New generation hash designed for speed on small keys and vectorization3668************************************************************************ */3669/*!3670* @}3671* @defgroup XXH3_impl XXH3 implementation3672* @ingroup impl3673* @{3674*/36753676/* === Compiler specifics === */36773678#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. 
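Without this guard, the C99 branch below could select the `restrict` keyword, which is not valid in C++.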
Tested with GCC 5.5 */3679# define XXH_RESTRICT /* disable */3680#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */3681# define XXH_RESTRICT restrict3682#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \3683|| (defined (__clang__)) \3684|| (defined (_MSC_VER) && (_MSC_VER >= 1400)) \3685|| (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))3686/*3687* There are a LOT more compilers that recognize __restrict but this3688* covers the major ones.3689*/3690# define XXH_RESTRICT __restrict3691#else3692# define XXH_RESTRICT /* disable */3693#endif36943695#if (defined(__GNUC__) && (__GNUC__ >= 3)) \3696|| (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \3697|| defined(__clang__)3698# define XXH_likely(x) __builtin_expect(x, 1)3699# define XXH_unlikely(x) __builtin_expect(x, 0)3700#else3701# define XXH_likely(x) (x)3702# define XXH_unlikely(x) (x)3703#endif37043705#ifndef XXH_HAS_INCLUDE3706# ifdef __has_include3707/*3708* Not defined as XXH_HAS_INCLUDE(x) (function-like) because3709* this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)3710*/3711# define XXH_HAS_INCLUDE __has_include3712# else3713# define XXH_HAS_INCLUDE(x) 03714# endif3715#endif37163717#if defined(__GNUC__) || defined(__clang__)3718# if defined(__ARM_FEATURE_SVE)3719# include <arm_sve.h>3720# endif3721# if defined(__ARM_NEON__) || defined(__ARM_NEON) \3722|| (defined(_M_ARM) && _M_ARM >= 7) \3723|| defined(_M_ARM64) || defined(_M_ARM64EC) \3724|| (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */3725# define inline __inline__ /* circumvent a clang bug */3726# include <arm_neon.h>3727# undef inline3728# elif defined(__AVX2__)3729# include <immintrin.h>3730# elif defined(__SSE2__)3731# include <emmintrin.h>3732# endif3733#endif37343735#if defined(_MSC_VER)3736# include <intrin.h>3737#endif37383739/*3740* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while3741* remaining a true 64-bit/128-bit hash function.3742*3743* This is done by prioritizing a subset of 64-bit operations that can be3744* emulated without too many steps on the average 32-bit machine.3745*3746* For example, these two lines seem similar, and run equally fast on 64-bit:3747*3748* xxh_u64 x;3749* x ^= (x >> 47); // good3750* x ^= (x >> 13); // bad3751*3752* However, to a 32-bit machine, there is a major difference.3753*3754* x ^= (x >> 47) looks like this:3755*3756* x.lo ^= (x.hi >> (47 - 32));3757*3758* while x ^= (x >> 13) looks like this:3759*3760* // note: funnel shifts are not usually cheap.3761* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));3762* x.hi ^= (x.hi >> 13);3763*3764* The first one is significantly faster than the second, simply because the3765* shift is larger than 32. 
This means:3766* - All the bits we need are in the upper 32 bits, so we can ignore the lower3767* 32 bits in the shift.3768* - The shift result will always fit in the lower 32 bits, and therefore,3769* we can ignore the upper 32 bits in the xor.3770*3771* Thanks to this optimization, XXH3 only requires these features to be efficient:3772*3773* - Usable unaligned access3774* - A 32-bit or 64-bit ALU3775* - If 32-bit, a decent ADC instruction3776* - A 32 or 64-bit multiply with a 64-bit result3777* - For the 128-bit variant, a decent byteswap helps short inputs.3778*3779* The first two are already required by XXH32, and almost all 32-bit and 64-bit3780* platforms which can run XXH32 can run XXH3 efficiently.3781*3782* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one3783* notable exception.3784*3785* First of all, Thumb-1 lacks support for the UMULL instruction which3786* performs the important long multiply. This means numerous __aeabi_lmul3787* calls.3788*3789* Second of all, the 8 functional registers are just not enough.3790* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need3791* Lo registers, and this shuffling results in thousands more MOVs than A32.3792*3793* A32 and T32 don't have this limitation. They can access all 14 registers,3794* do a 32->64 multiply with UMULL, and the flexible operand allowing free3795* shifts is helpful, too.3796*3797* Therefore, we do a quick sanity check.3798*3799* If compiling Thumb-1 for a target which supports ARM instructions, we will3800* emit a warning, as it is not a "sane" platform to compile for.3801*3802* Usually, if this happens, it is because of an accident and you probably need3803* to specify -march, as you likely meant to compile for a newer architecture.3804*3805* Credit: large sections of the vectorial and asm source code paths3806* have been contributed by @easyaspi3143807*/3808#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)3809# warning "XXH3 is highly inefficient without ARM or Thumb-2."3810#endif38113812/* ==========================================3813* Vectorization detection3814* ========================================== */38153816#ifdef XXH_DOXYGEN3817/*!3818* @ingroup tuning3819* @brief Overrides the vectorization implementation chosen for XXH3.3820*3821* Can be defined to 0 to disable SIMD or any of the values mentioned in3822* @ref XXH_VECTOR_TYPE.3823*3824* If this is not defined, it uses predefined macros to determine the best3825* implementation.3826*/3827# define XXH_VECTOR XXH_SCALAR3828/*!3829* @ingroup tuning3830* @brief Possible values for @ref XXH_VECTOR.3831*3832* Note that these are actually implemented as macros.3833*3834* If this is not defined, it is detected automatically.3835* internal macro XXH_X86DISPATCH overrides this.3836*/3837enum XXH_VECTOR_TYPE /* fake enum */ {3838XXH_SCALAR = 0, /*!< Portable scalar version */3839XXH_SSE2 = 1, /*!<3840* SSE2 for Pentium 4, Opteron, all x86_64.3841*3842* @note SSE2 is also guaranteed on Windows 10, macOS, and3843* Android x86.3844*/3845XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */3846XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */3847XXH_NEON = 4, /*!<3848* NEON for most ARMv7-A, all AArch64, and WASM SIMD1283849* via the SIMDeverywhere polyfill provided with the3850* Emscripten SDK.3851*/3852XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */3853XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */3854};3855/*!3856* @ingroup tuning3857* @brief Selects the minimum 
alignment for XXH3's accumulators.3858*3859* When using SIMD, this should match the alignment required for said vector3860* type, so, for example, 32 for AVX2.3861*3862* Default: Auto detected.3863*/3864# define XXH_ACC_ALIGN 83865#endif38663867/* Actual definition */3868#ifndef XXH_DOXYGEN3869# define XXH_SCALAR 03870# define XXH_SSE2 13871# define XXH_AVX2 23872# define XXH_AVX512 33873# define XXH_NEON 43874# define XXH_VSX 53875# define XXH_SVE 63876#endif38773878#ifndef XXH_VECTOR /* can be defined on command line */3879# if defined(__ARM_FEATURE_SVE)3880# define XXH_VECTOR XXH_SVE3881# elif ( \3882defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \3883|| defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \3884|| (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \3885) && ( \3886defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \3887|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \3888)3889# define XXH_VECTOR XXH_NEON3890# elif defined(__AVX512F__)3891# define XXH_VECTOR XXH_AVX5123892# elif defined(__AVX2__)3893# define XXH_VECTOR XXH_AVX23894# elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))3895# define XXH_VECTOR XXH_SSE23896# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \3897|| (defined(__s390x__) && defined(__VEC__)) \3898&& defined(__GNUC__) /* TODO: IBM XL */3899# define XXH_VECTOR XXH_VSX3900# else3901# define XXH_VECTOR XXH_SCALAR3902# endif3903#endif39043905/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */3906#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)3907# ifdef _MSC_VER3908# pragma warning(once : 4606)3909# else3910# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."3911# endif3912# undef XXH_VECTOR3913# define XXH_VECTOR XXH_SCALAR3914#endif39153916/*3917* Controls the alignment of the accumulator,3918* for compatibility with aligned vector loads, which are usually faster.3919*/3920#ifndef XXH_ACC_ALIGN3921# if defined(XXH_X86DISPATCH)3922# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */3923# elif XXH_VECTOR == XXH_SCALAR /* scalar */3924# define XXH_ACC_ALIGN 83925# elif XXH_VECTOR == XXH_SSE2 /* sse2 */3926# define XXH_ACC_ALIGN 163927# elif XXH_VECTOR == XXH_AVX2 /* avx2 */3928# define XXH_ACC_ALIGN 323929# elif XXH_VECTOR == XXH_NEON /* neon */3930# define XXH_ACC_ALIGN 163931# elif XXH_VECTOR == XXH_VSX /* vsx */3932# define XXH_ACC_ALIGN 163933# elif XXH_VECTOR == XXH_AVX512 /* avx512 */3934# define XXH_ACC_ALIGN 643935# elif XXH_VECTOR == XXH_SVE /* sve */3936# define XXH_ACC_ALIGN 643937# endif3938#endif39393940#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \3941|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX5123942# define XXH_SEC_ALIGN XXH_ACC_ALIGN3943#elif XXH_VECTOR == XXH_SVE3944# define XXH_SEC_ALIGN XXH_ACC_ALIGN3945#else3946# define XXH_SEC_ALIGN 83947#endif39483949#if defined(__GNUC__) || defined(__clang__)3950# define XXH_ALIASING __attribute__((may_alias))3951#else3952# define XXH_ALIASING /* nothing */3953#endif39543955/*3956* UGLY HACK:3957* GCC usually generates the best code with -O3 for xxHash.3958*3959* However, when targeting AVX2, it is overzealous in its unrolling resulting3960* in code roughly 3/4 the speed of Clang.3961*3962* There are other issues, such as GCC splitting _mm256_loadu_si256 into3963* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which3964* only applies to Sandy and Ivy Bridge... 
which don't even support AVX2.3965*3966* That is why when compiling the AVX2 version, it is recommended to use either3967* -O2 -mavx2 -march=haswell3968* or3969* -O2 -mavx2 -mno-avx256-split-unaligned-load3970* for decent performance, or to use Clang instead.3971*3972* Fortunately, we can control the first one with a pragma that forces GCC into3973* -O2, but the other one we can't control without "failed to inline always3974* inline function due to target mismatch" warnings.3975*/3976#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \3977&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \3978&& defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */3979# pragma GCC push_options3980# pragma GCC optimize("-O2")3981#endif39823983#if defined (__cplusplus)3984extern "C" {3985#endif39863987#if XXH_VECTOR == XXH_NEON39883989/*3990* UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O33991* optimizes out the entire hashLong loop because of the aliasing violation.3992*3993* However, GCC is also inefficient at load-store optimization with vld1q/vst1q,3994* so the only option is to mark it as aliasing.3995*/3996typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;39973998/*!3999* @internal4000* @brief `vld1q_u64` but faster and alignment-safe.4001*4002* On AArch64, unaligned access is always safe, but on ARMv7-a, it is only4003* *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).4004*4005* GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it4006* prohibits load-store optimizations. Therefore, a direct dereference is used.4007*4008* Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe4009* unaligned load.4010*/4011#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)4012XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */4013{4014return *(xxh_aliasing_uint64x2_t const *)ptr;4015}4016#else4017XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)4018{4019return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));4020}4021#endif40224023/*!4024* @internal4025* @brief `vmlal_u32` on low and high halves of a vector.4026*4027* This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with4028* inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`4029* with `vmlal_u32`.4030*/4031#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 114032XXH_FORCE_INLINE uint64x2_t4033XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)4034{4035/* Inline assembly is the only way */4036__asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));4037return acc;4038}4039XXH_FORCE_INLINE uint64x2_t4040XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)4041{4042/* This intrinsic works as expected */4043return vmlal_high_u32(acc, lhs, rhs);4044}4045#else4046/* Portable intrinsic versions */4047XXH_FORCE_INLINE uint64x2_t4048XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)4049{4050return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));4051}4052/*! 
@copydoc XXH_vmlal_low_u324053* Assume the compiler converts this to vmlal_high_u32 on aarch64 */4054XXH_FORCE_INLINE uint64x2_t4055XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)4056{4057return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));4058}4059#endif40604061/*!4062* @ingroup tuning4063* @brief Controls the NEON to scalar ratio for XXH34064*4065* This can be set to 2, 4, 6, or 8.4066*4067* ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.4068*4069* For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those4070* can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU4071* bandwidth.4072*4073* This is even more noticeable on the more advanced cores like the Cortex-A76 which4074* can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.4075*4076* Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes4077* and 2 scalar lanes, which is chosen by default.4078*4079* This does not apply to Apple processors or 32-bit processors, which run better with4080* full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.4081*4082* This change benefits CPUs with large micro-op buffers without negatively affecting4083* most other CPUs:4084*4085* | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |4086* |:----------------------|:--------------------|----------:|-----------:|------:|4087* | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |4088* | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |4089* | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |4090* | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% |4091*4092* It also seems to fix some bad codegen on GCC, making it almost as fast as clang.4093*4094* When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning4095* it effectively becomes worse 4.4096*4097* @see XXH3_accumulate_512_neon()4098*/4099# ifndef XXH3_NEON_LANES4100# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \4101&& !defined(__APPLE__) && XXH_SIZE_OPT <= 04102# define XXH3_NEON_LANES 64103# else4104# define XXH3_NEON_LANES XXH_ACC_NB4105# endif4106# endif4107#endif /* XXH_VECTOR == XXH_NEON */41084109#if defined (__cplusplus)4110} /* extern "C" */4111#endif41124113/*4114* VSX and Z Vector helpers.4115*4116* This is very messy, and any pull requests to clean this up are welcome.4117*4118* There are a lot of problems with supporting VSX and s390x, due to4119* inconsistent intrinsics, spotty coverage, and multiple endiannesses.4120*/4121#if XXH_VECTOR == XXH_VSX4122/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,4123* and `pixel`. This is a problem for obvious reasons.4124*4125* These keywords are unnecessary; the spec literally says they are4126* equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd4127* after including the header.4128*4129* We use pragma push_macro/pop_macro to keep the namespace clean. */4130# pragma push_macro("bool")4131# pragma push_macro("vector")4132# pragma push_macro("pixel")4133/* silence potential macro redefined warnings */4134# undef bool4135# undef vector4136# undef pixel41374138# if defined(__s390x__)4139# include <s390intrin.h>4140# else4141# include <altivec.h>4142# endif41434144/* Restore the original macro values, if applicable. 
*/4145# pragma pop_macro("pixel")4146# pragma pop_macro("vector")4147# pragma pop_macro("bool")41484149typedef __vector unsigned long long xxh_u64x2;4150typedef __vector unsigned char xxh_u8x16;4151typedef __vector unsigned xxh_u32x4;41524153/*4154* UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.4155*/4156typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;41574158# ifndef XXH_VSX_BE4159# if defined(__BIG_ENDIAN__) \4160|| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)4161# define XXH_VSX_BE 14162# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__4163# warning "-maltivec=be is not recommended. Please use native endianness."4164# define XXH_VSX_BE 14165# else4166# define XXH_VSX_BE 04167# endif4168# endif /* !defined(XXH_VSX_BE) */41694170# if XXH_VSX_BE4171# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))4172# define XXH_vec_revb vec_revb4173# else4174#if defined (__cplusplus)4175extern "C" {4176#endif4177/*!4178* A polyfill for POWER9's vec_revb().4179*/4180XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)4181{4182xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,41830x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };4184return vec_perm(val, val, vByteSwap);4185}4186#if defined (__cplusplus)4187} /* extern "C" */4188#endif4189# endif4190# endif /* XXH_VSX_BE */41914192#if defined (__cplusplus)4193extern "C" {4194#endif4195/*!4196* Performs an unaligned vector load and byte swaps it on big endian.4197*/4198XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)4199{4200xxh_u64x2 ret;4201XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));4202# if XXH_VSX_BE4203ret = XXH_vec_revb(ret);4204# endif4205return ret;4206}42074208/*4209* vec_mulo and vec_mule are very problematic intrinsics on PowerPC4210*4211* These intrinsics weren't added until GCC 8, despite existing for a while,4212* and they are endian dependent. Also, their meaning swap depending on version.4213* */4214# if defined(__s390x__)4215/* s390x is always big endian, no issue on this platform */4216# define XXH_vec_mulo vec_mulo4217# define XXH_vec_mule vec_mule4218# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)4219/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */4220/* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */4221# define XXH_vec_mulo __builtin_altivec_vmulouw4222# define XXH_vec_mule __builtin_altivec_vmuleuw4223# else4224/* gcc needs inline assembly */4225/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/4226XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)4227{4228xxh_u64x2 result;4229__asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));4230return result;4231}4232XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)4233{4234xxh_u64x2 result;4235__asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));4236return result;4237}4238# endif /* XXH_vec_mulo, XXH_vec_mule */42394240#if defined (__cplusplus)4241} /* extern "C" */4242#endif42434244#endif /* XXH_VECTOR == XXH_VSX */42454246#if XXH_VECTOR == XXH_SVE4247#define ACCRND(acc, offset) \4248do { \4249svuint64_t input_vec = svld1_u64(mask, xinput + offset); \4250svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \4251svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \4252svuint64_t swapped = svtbl_u64(input_vec, kSwap); \4253svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \4254svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \4255svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \4256acc = svadd_u64_x(mask, acc, mul); \4257} while (0)4258#endif /* XXH_VECTOR == XXH_SVE */42594260/* prefetch4261* can be disabled, by declaring XXH_NO_PREFETCH build macro */4262#if defined(XXH_NO_PREFETCH)4263# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */4264#else4265# if XXH_SIZE_OPT >= 14266# define XXH_PREFETCH(ptr) (void)(ptr)4267# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */4268# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */4269# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)4270# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )4271# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)4272# else4273# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */4274# endif4275#endif /* XXH_NO_PREFETCH */42764277#if defined (__cplusplus)4278extern "C" {4279#endif4280/* ==========================================4281* XXH3 default settings4282* ========================================== */42834284#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */42854286#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)4287# error "default keyset is not large enough"4288#endif42894290/*! Pseudorandom secret taken directly from FARSH. 
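* It provides XXH_SECRET_DEFAULT_SIZE (192) bytes of key material: the seedless
* variants read it directly, and the XXH3_initCustomSecret_*() routines below
* combine it with a 64-bit seed to derive a custom secret.
*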
*/4291XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {42920xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,42930xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,42940xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,42950xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,42960x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,42970x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,42980xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,42990x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,43000xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,43010x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,43020x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,43030x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,4304};43054306static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */4307static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */43084309#ifdef XXH_OLD_NAMES4310# define kSecret XXH3_kSecret4311#endif43124313#ifdef XXH_DOXYGEN4314/*!4315* @brief Calculates a 32-bit to 64-bit long multiply.4316*4317* Implemented as a macro.4318*4319* Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't4320* need to (but it shouldn't need to anyways, it is about 7 instructions to do4321* a 64x64 multiply...). 
Since we know that this will _always_ emit `MULL`, we4322* use that instead of the normal method.4323*4324* If you are compiling for platforms like Thumb-1 and don't have a better option,4325* you may also want to write your own long multiply routine here.4326*4327* @param x, y Numbers to be multiplied4328* @return 64-bit product of the low 32 bits of @p x and @p y.4329*/4330XXH_FORCE_INLINE xxh_u644331XXH_mult32to64(xxh_u64 x, xxh_u64 y)4332{4333return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);4334}4335#elif defined(_MSC_VER) && defined(_M_IX86)4336# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))4337#else4338/*4339* Downcast + upcast is usually better than masking on older compilers like4340* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.4341*4342* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands4343* and perform a full 64x64 multiply -- entirely redundant on 32-bit.4344*/4345# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))4346#endif43474348/*!4349* @brief Calculates a 64->128-bit long multiply.4350*4351* Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar4352* version.4353*4354* @param lhs , rhs The 64-bit integers to be multiplied4355* @return The 128-bit result represented in an @ref XXH128_hash_t.4356*/4357static XXH128_hash_t4358XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)4359{4360/*4361* GCC/Clang __uint128_t method.4362*4363* On most 64-bit targets, GCC and Clang define a __uint128_t type.4364* This is usually the best way as it usually uses a native long 64-bit4365* multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.4366*4367* Usually.4368*4369* Despite being a 32-bit platform, Clang (and emscripten) define this type4370* despite not having the arithmetic for it. This results in a laggy4371* compiler builtin call which calculates a full 128-bit multiply.4372* In that case it is best to use the portable one.4373* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-5155756774374*/4375#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \4376&& defined(__SIZEOF_INT128__) \4377|| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)43784379__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;4380XXH128_hash_t r128;4381r128.low64 = (xxh_u64)(product);4382r128.high64 = (xxh_u64)(product >> 64);4383return r128;43844385/*4386* MSVC for x64's _umul128 method.4387*4388* xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);4389*4390* This compiles to single operand MUL on x64.4391*/4392#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)43934394#ifndef _MSC_VER4395# pragma intrinsic(_umul128)4396#endif4397xxh_u64 product_high;4398xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);4399XXH128_hash_t r128;4400r128.low64 = product_low;4401r128.high64 = product_high;4402return r128;44034404/*4405* MSVC for ARM64's __umulh method.4406*4407* This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.4408*/4409#elif defined(_M_ARM64) || defined(_M_ARM64EC)44104411#ifndef _MSC_VER4412# pragma intrinsic(__umulh)4413#endif4414XXH128_hash_t r128;4415r128.low64 = lhs * rhs;4416r128.high64 = __umulh(lhs, rhs);4417return r128;44184419#else4420/*4421* Portable scalar method. 
Optimized for 32-bit and 64-bit ALUs.4422*4423* This is a fast and simple grade school multiply, which is shown below4424* with base 10 arithmetic instead of base 0x100000000.4425*4426* 9 3 // D2 lhs = 934427* x 7 5 // D2 rhs = 754428* ----------4429* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 154430* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 454431* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 214432* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 634433* ---------4434* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 274435* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 674436* ---------4437* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 69754438*4439* The reasons for adding the products like this are:4440* 1. It avoids manual carry tracking. Just like how4441* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.4442* This avoids a lot of complexity.4443*4444* 2. It hints for, and on Clang, compiles to, the powerful UMAAL4445* instruction available in ARM's Digital Signal Processing extension4446* in 32-bit ARMv6 and later, which is shown below:4447*4448* void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)4449* {4450* xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;4451* *RdLo = (xxh_u32)(product & 0xFFFFFFFF);4452* *RdHi = (xxh_u32)(product >> 32);4453* }4454*4455* This instruction was designed for efficient long multiplication, and4456* allows this to be calculated in only 4 instructions at speeds4457* comparable to some 64-bit ALUs.4458*4459* 3. It isn't terrible on other platforms. Usually this will be a couple4460* of 32-bit ADD/ADCs.4461*/44624463/* First calculate all of the cross products. */4464xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);4465xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);4466xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);4467xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);44684469/* Now add the products together. These will never overflow. */4470xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;4471xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;4472xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);44734474XXH128_hash_t r128;4475r128.low64 = lower;4476r128.high64 = upper;4477return r128;4478#endif4479}44804481/*!4482* @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.4483*4484* The reason for the separate function is to prevent passing too many structs4485* around by value. This will hopefully inline the multiply, but we don't force it.4486*4487* @param lhs , rhs The 64-bit integers to multiply4488* @return The low 64 bits of the product XOR'd by the high 64 bits.4489* @see XXH_mult64to128()4490*/4491static xxh_u644492XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)4493{4494XXH128_hash_t product = XXH_mult64to128(lhs, rhs);4495return product.low64 ^ product.high64;4496}44974498/*! Seems to produce slightly better code on GCC for some reason. 
*/4499XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)4500{4501XXH_ASSERT(0 <= shift && shift < 64);4502return v64 ^ (v64 >> shift);4503}45044505/*4506* This is a fast avalanche stage,4507* suitable when input bits are already partially mixed4508*/4509static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)4510{4511h64 = XXH_xorshift64(h64, 37);4512h64 *= PRIME_MX1;4513h64 = XXH_xorshift64(h64, 32);4514return h64;4515}45164517/*4518* This is a stronger avalanche,4519* inspired by Pelle Evensen's rrmxmx4520* preferable when input has not been previously mixed4521*/4522static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)4523{4524/* this mix is inspired by Pelle Evensen's rrmxmx */4525h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);4526h64 *= PRIME_MX2;4527h64 ^= (h64 >> 35) + len ;4528h64 *= PRIME_MX2;4529return XXH_xorshift64(h64, 28);4530}453145324533/* ==========================================4534* Short keys4535* ==========================================4536* One of the shortcomings of XXH32 and XXH64 was that their performance was4537* sub-optimal on short lengths. It used an iterative algorithm which strongly4538* favored lengths that were a multiple of 4 or 8.4539*4540* Instead of iterating over individual inputs, we use a set of single shot4541* functions which piece together a range of lengths and operate in constant time.4542*4543* Additionally, the number of multiplies has been significantly reduced. This4544* reduces latency, especially when emulating 64-bit multiplies on 32-bit.4545*4546* Depending on the platform, this may or may not be faster than XXH32, but it4547* is almost guaranteed to be faster than XXH64.4548*/45494550/*4551* At very short lengths, there isn't enough input to fully hide secrets, or use4552* the entire secret.4553*4554* There is also only a limited amount of mixing we can do before significantly4555* impacting performance.4556*4557* Therefore, we use different sections of the secret and always mix two secret4558* samples with an XOR. 
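* (For example, XXH3_len_1to3_64b() below builds its bitflip constant from
* XXH_readLE32(secret) ^ XXH_readLE32(secret+4) before the seed is added.)
*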
This should have no effect on performance on the4559* seedless or withSeed variants because everything _should_ be constant folded4560* by modern compilers.4561*4562* The XOR mixing hides individual parts of the secret and increases entropy.4563*4564* This adds an extra layer of strength for custom secrets.4565*/4566XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t4567XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)4568{4569XXH_ASSERT(input != NULL);4570XXH_ASSERT(1 <= len && len <= 3);4571XXH_ASSERT(secret != NULL);4572/*4573* len = 1: combined = { input[0], 0x01, input[0], input[0] }4574* len = 2: combined = { input[1], 0x02, input[0], input[1] }4575* len = 3: combined = { input[2], 0x03, input[0], input[1] }4576*/4577{ xxh_u8 const c1 = input[0];4578xxh_u8 const c2 = input[len >> 1];4579xxh_u8 const c3 = input[len - 1];4580xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)4581| ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);4582xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;4583xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;4584return XXH64_avalanche(keyed);4585}4586}45874588XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t4589XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)4590{4591XXH_ASSERT(input != NULL);4592XXH_ASSERT(secret != NULL);4593XXH_ASSERT(4 <= len && len <= 8);4594seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;4595{ xxh_u32 const input1 = XXH_readLE32(input);4596xxh_u32 const input2 = XXH_readLE32(input + len - 4);4597xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;4598xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);4599xxh_u64 const keyed = input64 ^ bitflip;4600return XXH3_rrmxmx(keyed, len);4601}4602}46034604XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t4605XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)4606{4607XXH_ASSERT(input != NULL);4608XXH_ASSERT(secret != NULL);4609XXH_ASSERT(9 <= len && len <= 16);4610{ xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;4611xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;4612xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;4613xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;4614xxh_u64 const acc = len4615+ XXH_swap64(input_lo) + input_hi4616+ XXH3_mul128_fold64(input_lo, input_hi);4617return XXH3_avalanche(acc);4618}4619}46204621XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t4622XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)4623{4624XXH_ASSERT(len <= 16);4625{ if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);4626if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);4627if (len) return XXH3_len_1to3_64b(input, len, secret, seed);4628return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));4629}4630}46314632/*4633* DISCLAIMER: There are known *seed-dependent* multicollisions here due to4634* multiplication by zero, affecting hashes of lengths 17 to 240.4635*4636* However, they are very unlikely.4637*4638* Keep this in mind when using the unseeded XXH3_64bits() variant: As with all4639* unseeded non-cryptographic hashes, it does not attempt to defend itself4640* against specially crafted inputs, only random inputs.4641*4642* Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes4643* cancelling out the secret is 
taken an arbitrary number of times (addressed4644* in XXH3_accumulate_512), this collision is very unlikely with random inputs4645* and/or proper seeding:4646*4647* This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a4648* function that is only called up to 16 times per hash with up to 240 bytes of4649* input.4650*4651* This is not too bad for a non-cryptographic hash function, especially with4652* only 64 bit outputs.4653*4654* The 128-bit variant (which trades some speed for strength) is NOT affected4655* by this, although it is always a good idea to use a proper seed if you care4656* about strength.4657*/4658XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,4659const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)4660{4661#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \4662&& defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \4663&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */4664/*4665* UGLY HACK:4666* GCC for x86 tends to autovectorize the 128-bit multiply, resulting in4667* slower code.4668*4669* By forcing seed64 into a register, we disrupt the cost model and4670* cause it to scalarize. See `XXH32_round()`4671*4672* FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,4673* XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on4674* GCC 9.2, despite both emitting scalar code.4675*4676* GCC generates much better scalar code than Clang for the rest of XXH3,4677* which is why finding a more optimal codepath is an interest.4678*/4679XXH_COMPILER_GUARD(seed64);4680#endif4681{ xxh_u64 const input_lo = XXH_readLE64(input);4682xxh_u64 const input_hi = XXH_readLE64(input+8);4683return XXH3_mul128_fold64(4684input_lo ^ (XXH_readLE64(secret) + seed64),4685input_hi ^ (XXH_readLE64(secret+8) - seed64)4686);4687}4688}46894690/* For mid range keys, XXH3 uses a Mum-hash variant. */4691XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t4692XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,4693const xxh_u8* XXH_RESTRICT secret, size_t secretSize,4694XXH64_hash_t seed)4695{4696XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;4697XXH_ASSERT(16 < len && len <= 128);46984699{ xxh_u64 acc = len * XXH_PRIME64_1;4700#if XXH_SIZE_OPT >= 14701/* Smaller and cleaner, but slightly slower. 
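* Each iteration mixes one 16-byte block addressed from the start of the input
* and one addressed from the end, using the same secret offsets as the
* unrolled path below.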
*/4702unsigned int i = (unsigned int)(len - 1) / 32;4703do {4704acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);4705acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);4706} while (i-- != 0);4707#else4708if (len > 32) {4709if (len > 64) {4710if (len > 96) {4711acc += XXH3_mix16B(input+48, secret+96, seed);4712acc += XXH3_mix16B(input+len-64, secret+112, seed);4713}4714acc += XXH3_mix16B(input+32, secret+64, seed);4715acc += XXH3_mix16B(input+len-48, secret+80, seed);4716}4717acc += XXH3_mix16B(input+16, secret+32, seed);4718acc += XXH3_mix16B(input+len-32, secret+48, seed);4719}4720acc += XXH3_mix16B(input+0, secret+0, seed);4721acc += XXH3_mix16B(input+len-16, secret+16, seed);4722#endif4723return XXH3_avalanche(acc);4724}4725}47264727/*!4728* @brief Maximum size of "short" key in bytes.4729*/4730#define XXH3_MIDSIZE_MAX 24047314732XXH_NO_INLINE XXH_PUREF XXH64_hash_t4733XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,4734const xxh_u8* XXH_RESTRICT secret, size_t secretSize,4735XXH64_hash_t seed)4736{4737XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;4738XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);47394740#define XXH3_MIDSIZE_STARTOFFSET 34741#define XXH3_MIDSIZE_LASTOFFSET 1747424743{ xxh_u64 acc = len * XXH_PRIME64_1;4744xxh_u64 acc_end;4745unsigned int const nbRounds = (unsigned int)len / 16;4746unsigned int i;4747XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);4748for (i=0; i<8; i++) {4749acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);4750}4751/* last bytes */4752acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);4753XXH_ASSERT(nbRounds >= 8);4754acc = XXH3_avalanche(acc);4755#if defined(__clang__) /* Clang */ \4756&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \4757&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */4758/*4759* UGLY HACK:4760* Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.4761* In everywhere else, it uses scalar code.4762*4763* For 64->128-bit multiplies, even if the NEON was 100% optimal, it4764* would still be slower than UMAAL (see XXH_mult64to128).4765*4766* Unfortunately, Clang doesn't handle the long multiplies properly and4767* converts them to the nonexistent "vmulq_u64" intrinsic, which is then4768* scalarized into an ugly mess of VMOV.32 instructions.4769*4770* This mess is difficult to avoid without turning autovectorization4771* off completely, but they are usually relatively minor and/or not4772* worth it to fix.4773*4774* This loop is the easiest to fix, as unlike XXH32, this pragma4775* _actually works_ because it is a loop vectorization instead of an4776* SLP vectorization.4777*/4778#pragma clang loop vectorize(disable)4779#endif4780for (i=8 ; i < nbRounds; i++) {4781/*4782* Prevents clang for unrolling the acc loop and interleaving with this one.4783*/4784XXH_COMPILER_GUARD(acc);4785acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);4786}4787return XXH3_avalanche(acc + acc_end);4788}4789}479047914792/* ======= Long Keys ======= */47934794#define XXH_STRIPE_LEN 644795#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */4796#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))47974798#ifdef XXH_OLD_NAMES4799# define STRIPE_LEN XXH_STRIPE_LEN4800# define ACC_NB XXH_ACC_NB4801#endif48024803#ifndef XXH_PREFETCH_DIST4804# ifdef __clang__4805# define XXH_PREFETCH_DIST 3204806# else4807# if (XXH_VECTOR == XXH_AVX512)4808# define 
XXH_PREFETCH_DIST 5124809# else4810# define XXH_PREFETCH_DIST 3844811# endif4812# endif /* __clang__ */4813#endif /* XXH_PREFETCH_DIST */48144815/*4816* These macros are to generate an XXH3_accumulate() function.4817* The two arguments select the name suffix and target attribute.4818*4819* The name of this symbol is XXH3_accumulate_<name>() and it calls4820* XXH3_accumulate_512_<name>().4821*4822* It may be useful to hand implement this function if the compiler fails to4823* optimize the inline function.4824*/4825#define XXH3_ACCUMULATE_TEMPLATE(name) \4826void \4827XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \4828const xxh_u8* XXH_RESTRICT input, \4829const xxh_u8* XXH_RESTRICT secret, \4830size_t nbStripes) \4831{ \4832size_t n; \4833for (n = 0; n < nbStripes; n++ ) { \4834const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \4835XXH_PREFETCH(in + XXH_PREFETCH_DIST); \4836XXH3_accumulate_512_##name( \4837acc, \4838in, \4839secret + n*XXH_SECRET_CONSUME_RATE); \4840} \4841}484248434844XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)4845{4846if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);4847XXH_memcpy(dst, &v64, sizeof(v64));4848}48494850/* Several intrinsic functions below are supposed to accept __int64 as argument,4851* as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .4852* However, several environments do not define __int64 type,4853* requiring a workaround.4854*/4855#if !defined (__VMS) \4856&& (defined (__cplusplus) \4857|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )4858typedef int64_t xxh_i64;4859#else4860/* the following type must have a width of 64-bit */4861typedef long long xxh_i64;4862#endif486348644865/*4866* XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.4867*4868* It is a hardened version of UMAC, based off of FARSH's implementation.4869*4870* This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD4871* implementations, and it is ridiculously fast.4872*4873* We harden it by mixing the original input to the accumulators as well as the product.4874*4875* This means that in the (relatively likely) case of a multiply by zero, the4876* original input is preserved.4877*4878* On 128-bit inputs, we swap 64-bit pairs when we add the input to improve4879* cross-pollination, as otherwise the upper and lower halves would be4880* essentially independent.4881*4882* This doesn't matter on 64-bit hashes since they all get merged together in4883* the end, so we skip the extra step.4884*4885* Both XXH3_64bits and XXH3_128bits use this subroutine.4886*/48874888#if (XXH_VECTOR == XXH_AVX512) \4889|| (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)48904891#ifndef XXH_TARGET_AVX5124892# define XXH_TARGET_AVX512 /* disable attribute target */4893#endif48944895XXH_FORCE_INLINE XXH_TARGET_AVX512 void4896XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,4897const void* XXH_RESTRICT input,4898const void* XXH_RESTRICT secret)4899{4900__m512i* const xacc = (__m512i *) acc;4901XXH_ASSERT((((size_t)acc) & 63) == 0);4902XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));49034904{4905/* data_vec = input[0]; */4906__m512i const data_vec = _mm512_loadu_si512 (input);4907/* key_vec = secret[0]; */4908__m512i const key_vec = _mm512_loadu_si512 (secret);4909/* data_key = data_vec ^ key_vec; */4910__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);4911/* data_key_lo = data_key >> 32; */4912__m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);4913/* 
product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */4914__m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);4915/* xacc[0] += swap(data_vec); */4916__m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));4917__m512i const sum = _mm512_add_epi64(*xacc, data_swap);4918/* xacc[0] += product; */4919*xacc = _mm512_add_epi64(product, sum);4920}4921}4922XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)49234924/*4925* XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.4926*4927* Multiplication isn't perfect, as explained by Google in HighwayHash:4928*4929* // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to4930* // varying degrees. In descending order of goodness, bytes4931* // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.4932* // As expected, the upper and lower bytes are much worse.4933*4934* Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L2914935*4936* Since our algorithm uses a pseudorandom secret to add some variance into the4937* mix, we don't need to (or want to) mix as often or as much as HighwayHash does.4938*4939* This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid4940* extraction.4941*4942* Both XXH3_64bits and XXH3_128bits use this subroutine.4943*/49444945XXH_FORCE_INLINE XXH_TARGET_AVX512 void4946XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)4947{4948XXH_ASSERT((((size_t)acc) & 63) == 0);4949XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));4950{ __m512i* const xacc = (__m512i*) acc;4951const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);49524953/* xacc[0] ^= (xacc[0] >> 47) */4954__m512i const acc_vec = *xacc;4955__m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);4956/* xacc[0] ^= secret; */4957__m512i const key_vec = _mm512_loadu_si512 (secret);4958__m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);49594960/* xacc[0] *= XXH_PRIME32_1; */4961__m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);4962__m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);4963__m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);4964*xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));4965}4966}49674968XXH_FORCE_INLINE XXH_TARGET_AVX512 void4969XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)4970{4971XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);4972XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);4973XXH_ASSERT(((size_t)customSecret & 63) == 0);4974(void)(&XXH_writeLE64);4975{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);4976__m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);4977__m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);49784979const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);4980__m512i* const dest = ( __m512i*) customSecret;4981int i;4982XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */4983XXH_ASSERT(((size_t)dest & 63) == 0);4984for (i=0; i < nbRounds; ++i) {4985dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);4986} }4987}49884989#endif49904991#if (XXH_VECTOR == XXH_AVX2) \4992|| (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)49934994#ifndef XXH_TARGET_AVX24995# define XXH_TARGET_AVX2 /* disable attribute target */4996#endif49974998XXH_FORCE_INLINE XXH_TARGET_AVX2 void4999XXH3_accumulate_512_avx2( void* 
XXH_RESTRICT acc,
                          const void* XXH_RESTRICT input,
                          const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   __m256i* const xacc = (__m256i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const __m256i* const xinput  = (const __m256i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const __m256i* const xsecret = (const __m256i *) secret;

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
            /* data_vec    = xinput[i]; */
            __m256i const data_vec    = _mm256_loadu_si256 (xinput+i);
            /* key_vec     = xsecret[i]; */
            __m256i const key_vec     = _mm256_loadu_si256 (xsecret+i);
            /* data_key    = data_vec ^ key_vec; */
            __m256i const data_key    = _mm256_xor_si256   (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m256i const data_key_lo = _mm256_srli_epi64  (data_key, 32);
            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m256i const product     = _mm256_mul_epu32   (data_key, data_key_lo);
            /* xacc[i] += swap(data_vec); */
            __m256i const data_swap   = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
            __m256i const sum         = _mm256_add_epi64(xacc[i], data_swap);
            /* xacc[i] += product; */
            xacc[i] = _mm256_add_epi64(product, sum);
    }   }
}
XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)

XXH_FORCE_INLINE XXH_TARGET_AVX2 void
XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   __m256i* const xacc = (__m256i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason.
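         *
         * As a reading aid (a scalar restatement, not part of the reference code):
         * each 64-bit accumulator lane below computes
         *   acc64 = (acc64 ^ (acc64 >> 47) ^ secret64) * (xxh_u64)XXH_PRIME32_1;
         * where secret64 is the matching 64-bit word of the secret. The two
         * _mm256_mul_epu32 calls split that 64x32 multiply into its low and high
         * 32-bit halves, since AVX2 has no full 64x64 vector multiply.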
*/5040const __m256i* const xsecret = (const __m256i *) secret;5041const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);50425043size_t i;5044for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {5045/* xacc[i] ^= (xacc[i] >> 47) */5046__m256i const acc_vec = xacc[i];5047__m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);5048__m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);5049/* xacc[i] ^= xsecret; */5050__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);5051__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);50525053/* xacc[i] *= XXH_PRIME32_1; */5054__m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);5055__m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);5056__m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);5057xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));5058}5059}5060}50615062XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)5063{5064XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);5065XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);5066XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);5067(void)(&XXH_writeLE64);5068XXH_PREFETCH(customSecret);5069{ __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);50705071const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret);5072__m256i* dest = ( __m256i*) customSecret;50735074# if defined(__GNUC__) || defined(__clang__)5075/*5076* On GCC & Clang, marking 'dest' as modified will cause the compiler:5077* - do not extract the secret from sse registers in the internal loop5078* - use less common registers, and avoid pushing these reg into stack5079*/5080XXH_COMPILER_GUARD(dest);5081# endif5082XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */5083XXH_ASSERT(((size_t)dest & 31) == 0);50845085/* GCC -O2 need unroll loop manually */5086dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);5087dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);5088dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);5089dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);5090dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);5091dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);5092}5093}50945095#endif50965097/* x86dispatch always generates SSE2 */5098#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)50995100#ifndef XXH_TARGET_SSE25101# define XXH_TARGET_SSE2 /* disable attribute target */5102#endif51035104XXH_FORCE_INLINE XXH_TARGET_SSE2 void5105XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,5106const void* XXH_RESTRICT input,5107const void* XXH_RESTRICT secret)5108{5109/* SSE2 is just a half-scale version of the AVX2 version. */5110XXH_ASSERT((((size_t)acc) & 15) == 0);5111{ __m128i* const xacc = (__m128i *) acc;5112/* Unaligned. This is mainly for pointer arithmetic, and because5113* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */5114const __m128i* const xinput = (const __m128i *) input;5115/* Unaligned. This is mainly for pointer arithmetic, and because5116* _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/5117const __m128i* const xsecret = (const __m128i *) secret;51185119size_t i;5120for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {5121/* data_vec = xinput[i]; */5122__m128i const data_vec = _mm_loadu_si128 (xinput+i);5123/* key_vec = xsecret[i]; */5124__m128i const key_vec = _mm_loadu_si128 (xsecret+i);5125/* data_key = data_vec ^ key_vec; */5126__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);5127/* data_key_lo = data_key >> 32; */5128__m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));5129/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */5130__m128i const product = _mm_mul_epu32 (data_key, data_key_lo);5131/* xacc[i] += swap(data_vec); */5132__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));5133__m128i const sum = _mm_add_epi64(xacc[i], data_swap);5134/* xacc[i] += product; */5135xacc[i] = _mm_add_epi64(product, sum);5136} }5137}5138XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)51395140XXH_FORCE_INLINE XXH_TARGET_SSE2 void5141XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)5142{5143XXH_ASSERT((((size_t)acc) & 15) == 0);5144{ __m128i* const xacc = (__m128i*) acc;5145/* Unaligned. This is mainly for pointer arithmetic, and because5146* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */5147const __m128i* const xsecret = (const __m128i *) secret;5148const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);51495150size_t i;5151for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {5152/* xacc[i] ^= (xacc[i] >> 47) */5153__m128i const acc_vec = xacc[i];5154__m128i const shifted = _mm_srli_epi64 (acc_vec, 47);5155__m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);5156/* xacc[i] ^= xsecret[i]; */5157__m128i const key_vec = _mm_loadu_si128 (xsecret+i);5158__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);51595160/* xacc[i] *= XXH_PRIME32_1; */5161__m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));5162__m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);5163__m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);5164xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));5165}5166}5167}51685169XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)5170{5171XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);5172(void)(&XXH_writeLE64);5173{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);51745175# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 19005176/* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */5177XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };5178__m128i const seed = _mm_load_si128((__m128i const*)seed64x2);5179# else5180__m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);5181# endif5182int i;51835184const void* const src16 = XXH3_kSecret;5185__m128i* dst16 = (__m128i*) customSecret;5186# if defined(__GNUC__) || defined(__clang__)5187/*5188* On GCC & Clang, marking 'dest' as modified will cause the compiler:5189* - do not extract the secret from sse registers in the internal loop5190* - use less common registers, and avoid pushing these reg into stack5191*/5192XXH_COMPILER_GUARD(dst16);5193# endif5194XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */5195XXH_ASSERT(((size_t)dst16 & 15) == 0);51965197for (i=0; i < nbRounds; ++i) {5198dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), 
seed);5199} }5200}52015202#endif52035204#if (XXH_VECTOR == XXH_NEON)52055206/* forward declarations for the scalar routines */5207XXH_FORCE_INLINE void5208XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,5209void const* XXH_RESTRICT secret, size_t lane);52105211XXH_FORCE_INLINE void5212XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,5213void const* XXH_RESTRICT secret, size_t lane);52145215/*!5216* @internal5217* @brief The bulk processing loop for NEON and WASM SIMD128.5218*5219* The NEON code path is actually partially scalar when running on AArch64. This5220* is to optimize the pipelining and can have up to 15% speedup depending on the5221* CPU, and it also mitigates some GCC codegen issues.5222*5223* @see XXH3_NEON_LANES for configuring this and details about this optimization.5224*5225* NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit5226* integers instead of the other platforms which mask full 64-bit vectors,5227* so the setup is more complicated than just shifting right.5228*5229* Additionally, there is an optimization for 4 lanes at once noted below.5230*5231* Since, as stated, the most optimal amount of lanes for Cortexes is 6,5232* there needs to be *three* versions of the accumulate operation used5233* for the remaining 2 lanes.5234*5235* WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap5236* nearly perfectly.5237*/52385239XXH_FORCE_INLINE void5240XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,5241const void* XXH_RESTRICT input,5242const void* XXH_RESTRICT secret)5243{5244XXH_ASSERT((((size_t)acc) & 15) == 0);5245XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);5246{ /* GCC for darwin arm64 does not like aliasing here */5247xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;5248/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */5249uint8_t const* xinput = (const uint8_t *) input;5250uint8_t const* xsecret = (const uint8_t *) secret;52515252size_t i;5253#ifdef __wasm_simd128__5254/*5255* On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret5256* is constant propagated, which results in it converting it to this5257* inside the loop:5258*5259* a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0)5260* b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)5261* ...5262*5263* This requires a full 32-bit address immediate (and therefore a 6 byte5264* instruction) as well as an add for each offset.5265*5266* Putting an asm guard prevents it from folding (at the cost of losing5267* the alignment hint), and uses the free offset in `v128.load` instead5268* of adding secret_offset each time which overall reduces code size by5269* about a kilobyte and improves performance.5270*/5271XXH_COMPILER_GUARD(xsecret);5272#endif5273/* Scalar lanes use the normal scalarRound routine */5274for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {5275XXH3_scalarRound(acc, input, secret, i);5276}5277i = 0;5278/* 4 NEON lanes at a time. 
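 * (Each uint64x2_t below holds two 64-bit accumulator lanes, so the pair of
 * vectors at index i and i+1 handled per iteration covers four lanes at once.)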
*/5279for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {5280/* data_vec = xinput[i]; */5281uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16));5282uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16));5283/* key_vec = xsecret[i]; */5284uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16));5285uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16));5286/* data_swap = swap(data_vec) */5287uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);5288uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);5289/* data_key = data_vec ^ key_vec; */5290uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);5291uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);52925293/*5294* If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a5295* de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to5296* get one vector with the low 32 bits of each lane, and one vector5297* with the high 32 bits of each lane.5298*5299* The intrinsic returns a double vector because the original ARMv7-a5300* instruction modified both arguments in place. AArch64 and SIMD128 emit5301* two instructions from this intrinsic.5302*5303* [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]5304* [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]5305*/5306uint32x4x2_t unzipped = vuzpq_u32(5307vreinterpretq_u32_u64(data_key_1),5308vreinterpretq_u32_u64(data_key_2)5309);5310/* data_key_lo = data_key & 0xFFFFFFFF */5311uint32x4_t data_key_lo = unzipped.val[0];5312/* data_key_hi = data_key >> 32 */5313uint32x4_t data_key_hi = unzipped.val[1];5314/*5315* Then, we can split the vectors horizontally and multiply which, as for most5316* widening intrinsics, have a variant that works on both high half vectors5317* for free on AArch64. A similar instruction is available on SIMD128.5318*5319* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi5320*/5321uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);5322uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);5323/*5324* Clang reorders5325* a += b * c; // umlal swap.2d, dkl.2s, dkh.2s5326* c += a; // add acc.2d, acc.2d, swap.2d5327* to5328* c += a; // add acc.2d, acc.2d, swap.2d5329* c += b * c; // umlal acc.2d, dkl.2s, dkh.2s5330*5331* While it would make sense in theory since the addition is faster,5332* for reasons likely related to umlal being limited to certain NEON5333* pipelines, this is worse. A compiler guard fixes this.5334*/5335XXH_COMPILER_GUARD_CLANG_NEON(sum_1);5336XXH_COMPILER_GUARD_CLANG_NEON(sum_2);5337/* xacc[i] = acc_vec + sum; */5338xacc[i] = vaddq_u64(xacc[i], sum_1);5339xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);5340}5341/* Operate on the remaining NEON lanes 2 at a time. */5342for (; i < XXH3_NEON_LANES / 2; i++) {5343/* data_vec = xinput[i]; */5344uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));5345/* key_vec = xsecret[i]; */5346uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));5347/* acc_vec_2 = swap(data_vec) */5348uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);5349/* data_key = data_vec ^ key_vec; */5350uint64x2_t data_key = veorq_u64(data_vec, key_vec);5351/* For two lanes, just use VMOVN and VSHRN. 
*/5352/* data_key_lo = data_key & 0xFFFFFFFF; */5353uint32x2_t data_key_lo = vmovn_u64(data_key);5354/* data_key_hi = data_key >> 32; */5355uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);5356/* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */5357uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);5358/* Same Clang workaround as before */5359XXH_COMPILER_GUARD_CLANG_NEON(sum);5360/* xacc[i] = acc_vec + sum; */5361xacc[i] = vaddq_u64 (xacc[i], sum);5362}5363}5364}5365XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)53665367XXH_FORCE_INLINE void5368XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)5369{5370XXH_ASSERT((((size_t)acc) & 15) == 0);53715372{ xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc;5373uint8_t const* xsecret = (uint8_t const*) secret;53745375size_t i;5376/* WASM uses operator overloads and doesn't need these. */5377#ifndef __wasm_simd128__5378/* { prime32_1, prime32_1 } */5379uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);5380/* { 0, prime32_1, 0, prime32_1 } */5381uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));5382#endif53835384/* AArch64 uses both scalar and neon at the same time */5385for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {5386XXH3_scalarScrambleRound(acc, secret, i);5387}5388for (i=0; i < XXH3_NEON_LANES / 2; i++) {5389/* xacc[i] ^= (xacc[i] >> 47); */5390uint64x2_t acc_vec = xacc[i];5391uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);5392uint64x2_t data_vec = veorq_u64(acc_vec, shifted);53935394/* xacc[i] ^= xsecret[i]; */5395uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));5396uint64x2_t data_key = veorq_u64(data_vec, key_vec);5397/* xacc[i] *= XXH_PRIME32_1 */5398#ifdef __wasm_simd128__5399/* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */5400xacc[i] = data_key * XXH_PRIME32_1;5401#else5402/*5403* Expanded version with portable NEON intrinsics5404*5405* lo(x) * lo(y) + (hi(x) * lo(y) << 32)5406*5407* prod_hi = hi(data_key) * lo(prime) << 325408*5409* Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector5410* as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits5411* and avoid the shift.5412*/5413uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);5414/* Extract low bits for vmlal_u32 */5415uint32x2_t data_key_lo = vmovn_u64(data_key);5416/* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */5417xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);5418#endif5419}5420}5421}5422#endif54235424#if (XXH_VECTOR == XXH_VSX)54255426XXH_FORCE_INLINE void5427XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,5428const void* XXH_RESTRICT input,5429const void* XXH_RESTRICT secret)5430{5431/* presumed aligned */5432xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;5433xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */5434xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */5435xxh_u64x2 const v32 = { 32, 32 };5436size_t i;5437for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {5438/* data_vec = xinput[i]; */5439xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);5440/* key_vec = xsecret[i]; */5441xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);5442xxh_u64x2 const data_key = data_vec ^ key_vec;5443/* shuffled = (data_key << 32) | (data_key >> 32); */5444xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, 
v32);5445/* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */5446xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);5447/* acc_vec = xacc[i]; */5448xxh_u64x2 acc_vec = xacc[i];5449acc_vec += product;54505451/* swap high and low halves */5452#ifdef __s390x__5453acc_vec += vec_permi(data_vec, data_vec, 2);5454#else5455acc_vec += vec_xxpermdi(data_vec, data_vec, 2);5456#endif5457xacc[i] = acc_vec;5458}5459}5460XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)54615462XXH_FORCE_INLINE void5463XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)5464{5465XXH_ASSERT((((size_t)acc) & 15) == 0);54665467{ xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;5468const xxh_u8* const xsecret = (const xxh_u8*) secret;5469/* constants */5470xxh_u64x2 const v32 = { 32, 32 };5471xxh_u64x2 const v47 = { 47, 47 };5472xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };5473size_t i;5474for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {5475/* xacc[i] ^= (xacc[i] >> 47); */5476xxh_u64x2 const acc_vec = xacc[i];5477xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);54785479/* xacc[i] ^= xsecret[i]; */5480xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);5481xxh_u64x2 const data_key = data_vec ^ key_vec;54825483/* xacc[i] *= XXH_PRIME32_1 */5484/* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */5485xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);5486/* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */5487xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);5488xacc[i] = prod_odd + (prod_even << v32);5489} }5490}54915492#endif54935494#if (XXH_VECTOR == XXH_SVE)54955496XXH_FORCE_INLINE void5497XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,5498const void* XXH_RESTRICT input,5499const void* XXH_RESTRICT secret)5500{5501uint64_t *xacc = (uint64_t *)acc;5502const uint64_t *xinput = (const uint64_t *)(const void *)input;5503const uint64_t *xsecret = (const uint64_t *)(const void *)secret;5504svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);5505uint64_t element_count = svcntd();5506if (element_count >= 8) {5507svbool_t mask = svptrue_pat_b64(SV_VL8);5508svuint64_t vacc = svld1_u64(mask, xacc);5509ACCRND(vacc, 0);5510svst1_u64(mask, xacc, vacc);5511} else if (element_count == 2) { /* sve128 */5512svbool_t mask = svptrue_pat_b64(SV_VL2);5513svuint64_t acc0 = svld1_u64(mask, xacc + 0);5514svuint64_t acc1 = svld1_u64(mask, xacc + 2);5515svuint64_t acc2 = svld1_u64(mask, xacc + 4);5516svuint64_t acc3 = svld1_u64(mask, xacc + 6);5517ACCRND(acc0, 0);5518ACCRND(acc1, 2);5519ACCRND(acc2, 4);5520ACCRND(acc3, 6);5521svst1_u64(mask, xacc + 0, acc0);5522svst1_u64(mask, xacc + 2, acc1);5523svst1_u64(mask, xacc + 4, acc2);5524svst1_u64(mask, xacc + 6, acc3);5525} else {5526svbool_t mask = svptrue_pat_b64(SV_VL4);5527svuint64_t acc0 = svld1_u64(mask, xacc + 0);5528svuint64_t acc1 = svld1_u64(mask, xacc + 4);5529ACCRND(acc0, 0);5530ACCRND(acc1, 4);5531svst1_u64(mask, xacc + 0, acc0);5532svst1_u64(mask, xacc + 4, acc1);5533}5534}55355536XXH_FORCE_INLINE void5537XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,5538const xxh_u8* XXH_RESTRICT input,5539const xxh_u8* XXH_RESTRICT secret,5540size_t nbStripes)5541{5542if (nbStripes != 0) {5543uint64_t *xacc = (uint64_t *)acc;5544const uint64_t *xinput = (const uint64_t *)(const void *)input;5545const uint64_t *xsecret = (const uint64_t *)(const void 
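/* Note: svcntd() returns the number of 64-bit elements per SVE vector
 * (2 for 128-bit, 4 for 256-bit, 8 for 512-bit or wider implementations).
 * The element_count branches below use a single register when all 8 lanes fit
 * (element_count >= 8), four registers of 2 lanes on 128-bit SVE, and two
 * registers of 4 lanes otherwise, so that all 8 accumulator lanes are covered. */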
*)secret;5546svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);5547uint64_t element_count = svcntd();5548if (element_count >= 8) {5549svbool_t mask = svptrue_pat_b64(SV_VL8);5550svuint64_t vacc = svld1_u64(mask, xacc + 0);5551do {5552/* svprfd(svbool_t, void *, enum svfprop); */5553svprfd(mask, xinput + 128, SV_PLDL1STRM);5554ACCRND(vacc, 0);5555xinput += 8;5556xsecret += 1;5557nbStripes--;5558} while (nbStripes != 0);55595560svst1_u64(mask, xacc + 0, vacc);5561} else if (element_count == 2) { /* sve128 */5562svbool_t mask = svptrue_pat_b64(SV_VL2);5563svuint64_t acc0 = svld1_u64(mask, xacc + 0);5564svuint64_t acc1 = svld1_u64(mask, xacc + 2);5565svuint64_t acc2 = svld1_u64(mask, xacc + 4);5566svuint64_t acc3 = svld1_u64(mask, xacc + 6);5567do {5568svprfd(mask, xinput + 128, SV_PLDL1STRM);5569ACCRND(acc0, 0);5570ACCRND(acc1, 2);5571ACCRND(acc2, 4);5572ACCRND(acc3, 6);5573xinput += 8;5574xsecret += 1;5575nbStripes--;5576} while (nbStripes != 0);55775578svst1_u64(mask, xacc + 0, acc0);5579svst1_u64(mask, xacc + 2, acc1);5580svst1_u64(mask, xacc + 4, acc2);5581svst1_u64(mask, xacc + 6, acc3);5582} else {5583svbool_t mask = svptrue_pat_b64(SV_VL4);5584svuint64_t acc0 = svld1_u64(mask, xacc + 0);5585svuint64_t acc1 = svld1_u64(mask, xacc + 4);5586do {5587svprfd(mask, xinput + 128, SV_PLDL1STRM);5588ACCRND(acc0, 0);5589ACCRND(acc1, 4);5590xinput += 8;5591xsecret += 1;5592nbStripes--;5593} while (nbStripes != 0);55945595svst1_u64(mask, xacc + 0, acc0);5596svst1_u64(mask, xacc + 4, acc1);5597}5598}5599}56005601#endif56025603/* scalar variants - universal */56045605#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))5606/*5607* In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they5608* emit an excess mask and a full 64-bit multiply-add (MADD X-form).5609*5610* While this might not seem like much, as AArch64 is a 64-bit architecture, only5611* big Cortex designs have a full 64-bit multiplier.5612*5613* On the little cores, the smaller 32-bit multiplier is used, and full 64-bit5614* multiplies expand to 2-3 multiplies in microcode. 
This has a major penalty
 * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
 *
 * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
 * not have this penalty and does the mask automatically.
 */
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
{
    xxh_u64 ret;
    /* note: %x = 64-bit register, %w = 32-bit register */
    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
    return ret;
}
#else
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
{
    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
}
#endif

/*!
 * @internal
 * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
 *
 * This is extracted to its own function because the NEON path uses a combination
 * of NEON and scalar.
 */
XXH_FORCE_INLINE void
XXH3_scalarRound(void* XXH_RESTRICT acc,
                 void const* XXH_RESTRICT input,
                 void const* XXH_RESTRICT secret,
                 size_t lane)
{
    xxh_u64* xacc = (xxh_u64*) acc;
    xxh_u8 const* xinput  = (xxh_u8 const*) input;
    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
    XXH_ASSERT(lane < XXH_ACC_NB);
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    {
        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
    }
}

/*!
 * @internal
 * @brief Processes a 64 byte block of data using the scalar path.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT input,
                     const void* XXH_RESTRICT secret)
{
    size_t i;
    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6.
*/5673#if defined(__GNUC__) && !defined(__clang__) \5674&& (defined(__arm__) || defined(__thumb2__)) \5675&& defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \5676&& XXH_SIZE_OPT <= 05677# pragma GCC unroll 85678#endif5679for (i=0; i < XXH_ACC_NB; i++) {5680XXH3_scalarRound(acc, input, secret, i);5681}5682}5683XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)56845685/*!5686* @internal5687* @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().5688*5689* This is extracted to its own function because the NEON path uses a combination5690* of NEON and scalar.5691*/5692XXH_FORCE_INLINE void5693XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,5694void const* XXH_RESTRICT secret,5695size_t lane)5696{5697xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */5698const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */5699XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);5700XXH_ASSERT(lane < XXH_ACC_NB);5701{5702xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);5703xxh_u64 acc64 = xacc[lane];5704acc64 = XXH_xorshift64(acc64, 47);5705acc64 ^= key64;5706acc64 *= XXH_PRIME32_1;5707xacc[lane] = acc64;5708}5709}57105711/*!5712* @internal5713* @brief Scrambles the accumulators after a large chunk has been read5714*/5715XXH_FORCE_INLINE void5716XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)5717{5718size_t i;5719for (i=0; i < XXH_ACC_NB; i++) {5720XXH3_scalarScrambleRound(acc, secret, i);5721}5722}57235724XXH_FORCE_INLINE void5725XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)5726{5727/*5728* We need a separate pointer for the hack below,5729* which requires a non-const pointer.5730* Any decent compiler will optimize this out otherwise.5731*/5732const xxh_u8* kSecretPtr = XXH3_kSecret;5733XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);57345735#if defined(__GNUC__) && defined(__aarch64__)5736/*5737* UGLY HACK:5738* GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are5739* placed sequentially, in order, at the top of the unrolled loop.5740*5741* While MOVK is great for generating constants (2 cycles for a 64-bit5742* constant compared to 4 cycles for LDR), it fights for bandwidth with5743* the arithmetic instructions.5744*5745* I L S5746* MOVK5747* MOVK5748* MOVK5749* MOVK5750* ADD5751* SUB STR5752* STR5753* By forcing loads from memory (as the asm line causes the compiler to assume5754* that XXH3_kSecretPtr has been changed), the pipelines are used more5755* efficiently:5756* I L S5757* LDR5758* ADD LDR5759* SUB STR5760* STR5761*5762* See XXH3_NEON_LANES for details on the pipsline.5763*5764* XXH3_64bits_withSeed, len == 256, Snapdragon 8355765* without hack: 2654.4 MB/s5766* with hack: 3202.9 MB/s5767*/5768XXH_COMPILER_GUARD(kSecretPtr);5769#endif5770{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;5771int i;5772for (i=0; i < nbRounds; i++) {5773/*5774* The asm hack causes the compiler to assume that kSecretPtr aliases with5775* customSecret, and on aarch64, this prevented LDP from merging two5776* loads together for free. 
Putting the loads together before the stores5777* properly generates LDP.5778*/5779xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;5780xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;5781XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo);5782XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);5783} }5784}578557865787typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);5788typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);5789typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);579057915792#if (XXH_VECTOR == XXH_AVX512)57935794#define XXH3_accumulate_512 XXH3_accumulate_512_avx5125795#define XXH3_accumulate XXH3_accumulate_avx5125796#define XXH3_scrambleAcc XXH3_scrambleAcc_avx5125797#define XXH3_initCustomSecret XXH3_initCustomSecret_avx51257985799#elif (XXH_VECTOR == XXH_AVX2)58005801#define XXH3_accumulate_512 XXH3_accumulate_512_avx25802#define XXH3_accumulate XXH3_accumulate_avx25803#define XXH3_scrambleAcc XXH3_scrambleAcc_avx25804#define XXH3_initCustomSecret XXH3_initCustomSecret_avx258055806#elif (XXH_VECTOR == XXH_SSE2)58075808#define XXH3_accumulate_512 XXH3_accumulate_512_sse25809#define XXH3_accumulate XXH3_accumulate_sse25810#define XXH3_scrambleAcc XXH3_scrambleAcc_sse25811#define XXH3_initCustomSecret XXH3_initCustomSecret_sse258125813#elif (XXH_VECTOR == XXH_NEON)58145815#define XXH3_accumulate_512 XXH3_accumulate_512_neon5816#define XXH3_accumulate XXH3_accumulate_neon5817#define XXH3_scrambleAcc XXH3_scrambleAcc_neon5818#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar58195820#elif (XXH_VECTOR == XXH_VSX)58215822#define XXH3_accumulate_512 XXH3_accumulate_512_vsx5823#define XXH3_accumulate XXH3_accumulate_vsx5824#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx5825#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar58265827#elif (XXH_VECTOR == XXH_SVE)5828#define XXH3_accumulate_512 XXH3_accumulate_512_sve5829#define XXH3_accumulate XXH3_accumulate_sve5830#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar5831#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar58325833#else /* scalar */58345835#define XXH3_accumulate_512 XXH3_accumulate_512_scalar5836#define XXH3_accumulate XXH3_accumulate_scalar5837#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar5838#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar58395840#endif58415842#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */5843# undef XXH3_initCustomSecret5844# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar5845#endif58465847XXH_FORCE_INLINE void5848XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,5849const xxh_u8* XXH_RESTRICT input, size_t len,5850const xxh_u8* XXH_RESTRICT secret, size_t secretSize,5851XXH3_f_accumulate f_acc,5852XXH3_f_scrambleAcc f_scramble)5853{5854size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;5855size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;5856size_t const nb_blocks = (len - 1) / block_len;58575858size_t n;58595860XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);58615862for (n = 0; n < nb_blocks; n++) {5863f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);5864f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);5865}58665867/* last partial block */5868XXH_ASSERT(len > XXH_STRIPE_LEN);5869{ size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;5870XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));5871f_acc(acc, input + 
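/* Worked numbers, assuming the default parameters: with
 * XXH_SECRET_DEFAULT_SIZE = 192, XXH_STRIPE_LEN = 64 and
 * XXH_SECRET_CONSUME_RATE = 8, nbStripesPerBlock = (192 - 64) / 8 = 16,
 * so each full block above spans 16 * 64 = 1024 bytes and f_scramble
 * runs once per KB of input. */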
nb_blocks*block_len, secret, nbStripes);58725873/* last stripe */5874{ const xxh_u8* const p = input + len - XXH_STRIPE_LEN;5875#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */5876XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);5877} }5878}58795880XXH_FORCE_INLINE xxh_u645881XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)5882{5883return XXH3_mul128_fold64(5884acc[0] ^ XXH_readLE64(secret),5885acc[1] ^ XXH_readLE64(secret+8) );5886}58875888static XXH64_hash_t5889XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)5890{5891xxh_u64 result64 = start;5892size_t i = 0;58935894for (i = 0; i < 4; i++) {5895result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);5896#if defined(__clang__) /* Clang */ \5897&& (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \5898&& (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \5899&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */5900/*5901* UGLY HACK:5902* Prevent autovectorization on Clang ARMv7-a. Exact same problem as5903* the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.5904* XXH3_64bits, len == 256, Snapdragon 835:5905* without hack: 2063.7 MB/s5906* with hack: 2560.7 MB/s5907*/5908XXH_COMPILER_GUARD(result64);5909#endif5910}59115912return XXH3_avalanche(result64);5913}59145915#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \5916XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }59175918XXH_FORCE_INLINE XXH64_hash_t5919XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,5920const void* XXH_RESTRICT secret, size_t secretSize,5921XXH3_f_accumulate f_acc,5922XXH3_f_scrambleAcc f_scramble)5923{5924XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;59255926XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);59275928/* converge into final hash */5929XXH_STATIC_ASSERT(sizeof(acc) == 64);5930/* do not align on 8, so that the secret is different from the accumulator */5931#define XXH_SECRET_MERGEACCS_START 115932XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);5933return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);5934}59355936/*5937* It's important for performance to transmit secret's size (when it's static)5938* so that the compiler can properly optimize the vectorized loop.5939* This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.5940* When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE5941* breaks -Og, this is XXH_NO_INLINE.5942*/5943XXH3_WITH_SECRET_INLINE XXH64_hash_t5944XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,5945XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)5946{5947(void)seed64;5948return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);5949}59505951/*5952* It's preferable for performance that XXH3_hashLong is not inlined,5953* as it results in a smaller function for small data, easier to the instruction cache.5954* Note that inside this no_inline function, we do inline the internal loop,5955* and provide a statically defined secret size to allow optimization of vector loop.5956*/5957XXH_NO_INLINE XXH_PUREF XXH64_hash_t5958XXH3_hashLong_64b_default(const 
void* XXH_RESTRICT input, size_t len,5959XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)5960{5961(void)seed64; (void)secret; (void)secretLen;5962return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);5963}59645965/*5966* XXH3_hashLong_64b_withSeed():5967* Generate a custom key based on alteration of default XXH3_kSecret with the seed,5968* and then use this key for long mode hashing.5969*5970* This operation is decently fast but nonetheless costs a little bit of time.5971* Try to avoid it whenever possible (typically when seed==0).5972*5973* It's important for performance that XXH3_hashLong is not inlined. Not sure5974* why (uop cache maybe?), but the difference is large and easily measurable.5975*/5976XXH_FORCE_INLINE XXH64_hash_t5977XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,5978XXH64_hash_t seed,5979XXH3_f_accumulate f_acc,5980XXH3_f_scrambleAcc f_scramble,5981XXH3_f_initCustomSecret f_initSec)5982{5983#if XXH_SIZE_OPT <= 05984if (seed == 0)5985return XXH3_hashLong_64b_internal(input, len,5986XXH3_kSecret, sizeof(XXH3_kSecret),5987f_acc, f_scramble);5988#endif5989{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];5990f_initSec(secret, seed);5991return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),5992f_acc, f_scramble);5993}5994}59955996/*5997* It's important for performance that XXH3_hashLong is not inlined.5998*/5999XXH_NO_INLINE XXH64_hash_t6000XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,6001XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)6002{6003(void)secret; (void)secretLen;6004return XXH3_hashLong_64b_withSeed_internal(input, len, seed,6005XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);6006}600760086009typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,6010XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);60116012XXH_FORCE_INLINE XXH64_hash_t6013XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,6014XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,6015XXH3_hashLong64_f f_hashLong)6016{6017XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);6018/*6019* If an action is to be taken if `secretLen` condition is not respected,6020* it should be done here.6021* For now, it's a contract pre-condition.6022* Adding a check and a branch here would cost performance at every hash.6023* Also, note that function signature doesn't offer room to return an error.6024*/6025if (len <= 16)6026return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);6027if (len <= 128)6028return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);6029if (len <= XXH3_MIDSIZE_MAX)6030return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);6031return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);6032}603360346035/* === Public entry point === */60366037/*! @ingroup XXH3_family */6038XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)6039{6040return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);6041}60426043/*! 
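 * Usage sketch for XXH3_64bits_withSecret() (illustrative only, not part of
 * the library):
 * @code{.c}
 * // `secret` must point to at least XXH3_SECRET_SIZE_MIN bytes that look like
 * // random noise, and the same secret must be reused to reproduce a hash.
 * XXH64_hash_t hashWithAppSecret(const void* data, size_t size,
 *                                const unsigned char* secret, size_t secretSize)
 * {
 *     return XXH3_64bits_withSecret(data, size, secret, secretSize);
 * }
 * @endcode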
@ingroup XXH3_family */6044XXH_PUBLIC_API XXH64_hash_t6045XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)6046{6047return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);6048}60496050/*! @ingroup XXH3_family */6051XXH_PUBLIC_API XXH64_hash_t6052XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)6053{6054return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);6055}60566057XXH_PUBLIC_API XXH64_hash_t6058XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)6059{6060if (length <= XXH3_MIDSIZE_MAX)6061return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);6062return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);6063}606460656066/* === XXH3 streaming === */6067#ifndef XXH_NO_STREAM6068/*6069* Malloc's a pointer that is always aligned to align.6070*6071* This must be freed with `XXH_alignedFree()`.6072*6073* malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte6074* alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX26075* or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.6076*6077* This underalignment previously caused a rather obvious crash which went6078* completely unnoticed due to XXH3_createState() not actually being tested.6079* Credit to RedSpah for noticing this bug.6080*6081* The alignment is done manually: Functions like posix_memalign or _mm_malloc6082* are avoided: To maintain portability, we would have to write a fallback6083* like this anyways, and besides, testing for the existence of library6084* functions without relying on external build tools is impossible.6085*6086* The method is simple: Overallocate, manually align, and store the offset6087* to the original behind the returned pointer.6088*6089* Align must be a power of 2 and 8 <= align <= 128.6090*/6091static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)6092{6093XXH_ASSERT(align <= 128 && align >= 8); /* range check */6094XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */6095XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */6096{ /* Overallocate to make room for manual realignment and an offset byte */6097xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);6098if (base != NULL) {6099/*6100* Get the offset needed to align this pointer.6101*6102* Even if the returned pointer is aligned, there will always be6103* at least one byte to store the offset to the original pointer.6104*/6105size_t offset = align - ((size_t)base & (align - 1)); /* base % align */6106/* Add the offset for the now-aligned pointer */6107xxh_u8* ptr = base + offset;61086109XXH_ASSERT((size_t)ptr % align == 0);61106111/* Store the offset immediately before the returned pointer. */6112ptr[-1] = (xxh_u8)offset;6113return ptr;6114}6115return NULL;6116}6117}6118/*6119* Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass6120* normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.6121*/6122static void XXH_alignedFree(void* p)6123{6124if (p != NULL) {6125xxh_u8* ptr = (xxh_u8*)p;6126/* Get the offset byte we added in XXH_malloc. */6127xxh_u8 offset = ptr[-1];6128/* Free the original malloc'd pointer */6129xxh_u8* base = ptr - offset;6130XXH_free(base);6131}6132}6133/*! 
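 * XXH3_createState() below relies on XXH_alignedMalloc() above. As a worked
 * example of that scheme with align = 64: if XXH_malloc() returned
 * base = 0x1005, then offset = 64 - (0x1005 & 63) = 59, the returned pointer
 * is base + 59 = 0x1040 (64-byte aligned), and ptr[-1] stores 59 so that
 * XXH_alignedFree() can recover base.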
@ingroup XXH3_family */6134/*!6135* @brief Allocate an @ref XXH3_state_t.6136*6137* @return An allocated pointer of @ref XXH3_state_t on success.6138* @return `NULL` on failure.6139*6140* @note Must be freed with XXH3_freeState().6141*/6142XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)6143{6144XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);6145if (state==NULL) return NULL;6146XXH3_INITSTATE(state);6147return state;6148}61496150/*! @ingroup XXH3_family */6151/*!6152* @brief Frees an @ref XXH3_state_t.6153*6154* @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().6155*6156* @return @ref XXH_OK.6157*6158* @note Must be allocated with XXH3_createState().6159*/6160XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)6161{6162XXH_alignedFree(statePtr);6163return XXH_OK;6164}61656166/*! @ingroup XXH3_family */6167XXH_PUBLIC_API void6168XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)6169{6170XXH_memcpy(dst_state, src_state, sizeof(*dst_state));6171}61726173static void6174XXH3_reset_internal(XXH3_state_t* statePtr,6175XXH64_hash_t seed,6176const void* secret, size_t secretSize)6177{6178size_t const initStart = offsetof(XXH3_state_t, bufferedSize);6179size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;6180XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);6181XXH_ASSERT(statePtr != NULL);6182/* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */6183memset((char*)statePtr + initStart, 0, initLength);6184statePtr->acc[0] = XXH_PRIME32_3;6185statePtr->acc[1] = XXH_PRIME64_1;6186statePtr->acc[2] = XXH_PRIME64_2;6187statePtr->acc[3] = XXH_PRIME64_3;6188statePtr->acc[4] = XXH_PRIME64_4;6189statePtr->acc[5] = XXH_PRIME32_2;6190statePtr->acc[6] = XXH_PRIME64_5;6191statePtr->acc[7] = XXH_PRIME32_1;6192statePtr->seed = seed;6193statePtr->useSeed = (seed != 0);6194statePtr->extSecret = (const unsigned char*)secret;6195XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);6196statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;6197statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;6198}61996200/*! @ingroup XXH3_family */6201XXH_PUBLIC_API XXH_errorcode6202XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)6203{6204if (statePtr == NULL) return XXH_ERROR;6205XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);6206return XXH_OK;6207}62086209/*! @ingroup XXH3_family */6210XXH_PUBLIC_API XXH_errorcode6211XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)6212{6213if (statePtr == NULL) return XXH_ERROR;6214XXH3_reset_internal(statePtr, 0, secret, secretSize);6215if (secret == NULL) return XXH_ERROR;6216if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;6217return XXH_OK;6218}62196220/*! @ingroup XXH3_family */6221XXH_PUBLIC_API XXH_errorcode6222XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)6223{6224if (statePtr == NULL) return XXH_ERROR;6225if (seed==0) return XXH3_64bits_reset(statePtr);6226if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))6227XXH3_initCustomSecret(statePtr->customSecret, seed);6228XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);6229return XXH_OK;6230}62316232/*! 
@ingroup XXH3_family */6233XXH_PUBLIC_API XXH_errorcode6234XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)6235{6236if (statePtr == NULL) return XXH_ERROR;6237if (secret == NULL) return XXH_ERROR;6238if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;6239XXH3_reset_internal(statePtr, seed64, secret, secretSize);6240statePtr->useSeed = 1; /* always, even if seed64==0 */6241return XXH_OK;6242}62436244/*!6245* @internal6246* @brief Processes a large input for XXH3_update() and XXH3_digest_long().6247*6248* Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.6249*6250* @param acc Pointer to the 8 accumulator lanes6251* @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block*6252* @param nbStripesPerBlock Number of stripes in a block6253* @param input Input pointer6254* @param nbStripes Number of stripes to process6255* @param secret Secret pointer6256* @param secretLimit Offset of the last block in @p secret6257* @param f_acc Pointer to an XXH3_accumulate implementation6258* @param f_scramble Pointer to an XXH3_scrambleAcc implementation6259* @return Pointer past the end of @p input after processing6260*/6261XXH_FORCE_INLINE const xxh_u8 *6262XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,6263size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,6264const xxh_u8* XXH_RESTRICT input, size_t nbStripes,6265const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,6266XXH3_f_accumulate f_acc,6267XXH3_f_scrambleAcc f_scramble)6268{6269const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;6270/* Process full blocks */6271if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {6272/* Process the initial partial block... */6273size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;62746275do {6276/* Accumulate and scramble */6277f_acc(acc, input, initialSecret, nbStripesThisIter);6278f_scramble(acc, secret + secretLimit);6279input += nbStripesThisIter * XXH_STRIPE_LEN;6280nbStripes -= nbStripesThisIter;6281/* Then continue the loop with the full block size */6282nbStripesThisIter = nbStripesPerBlock;6283initialSecret = secret;6284} while (nbStripes >= nbStripesPerBlock);6285*nbStripesSoFarPtr = 0;6286}6287/* Process a partial block */6288if (nbStripes > 0) {6289f_acc(acc, input, initialSecret, nbStripes);6290input += nbStripes * XXH_STRIPE_LEN;6291*nbStripesSoFarPtr += nbStripes;6292}6293/* Return end pointer */6294return input;6295}62966297#ifndef XXH3_STREAM_USE_STACK6298# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */6299# define XXH3_STREAM_USE_STACK 16300# endif6301#endif6302/*6303* Both XXH3_64bits_update and XXH3_128bits_update use this routine.6304*/6305XXH_FORCE_INLINE XXH_errorcode6306XXH3_update(XXH3_state_t* XXH_RESTRICT const state,6307const xxh_u8* XXH_RESTRICT input, size_t len,6308XXH3_f_accumulate f_acc,6309XXH3_f_scrambleAcc f_scramble)6310{6311if (input==NULL) {6312XXH_ASSERT(len == 0);6313return XXH_OK;6314}63156316XXH_ASSERT(state != NULL);6317{ const xxh_u8* const bEnd = input + len;6318const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret;6319#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 16320/* For some reason, gcc and MSVC seem to suffer greatly6321* when operating accumulators directly into state.6322* Operating into stack space seems to enable proper optimization.6323* clang, on the other hand, doesn't seem to need this trick */6324XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];6325XXH_memcpy(acc, state->acc, sizeof(acc));6326#else6327xxh_u64* XXH_RESTRICT const acc = state->acc;6328#endif6329state->totalLen += len;6330XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);63316332/* small input : just fill in tmp buffer */6333if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {6334XXH_memcpy(state->buffer + state->bufferedSize, input, len);6335state->bufferedSize += (XXH32_hash_t)len;6336return XXH_OK;6337}63386339/* total input is now > XXH3_INTERNALBUFFER_SIZE */6340#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)6341XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */63426343/*6344* Internal buffer is partially filled (always, except at beginning)6345* Complete it, then consume it.6346*/6347if (state->bufferedSize) {6348size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;6349XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);6350input += loadSize;6351XXH3_consumeStripes(acc,6352&state->nbStripesSoFar, state->nbStripesPerBlock,6353state->buffer, XXH3_INTERNALBUFFER_STRIPES,6354secret, state->secretLimit,6355f_acc, f_scramble);6356state->bufferedSize = 0;6357}6358XXH_ASSERT(input < bEnd);6359if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {6360size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;6361input = XXH3_consumeStripes(acc,6362&state->nbStripesSoFar, state->nbStripesPerBlock,6363input, nbStripes,6364secret, state->secretLimit,6365f_acc, f_scramble);6366XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);63676368}6369/* Some remaining input (always) : buffer it */6370XXH_ASSERT(input < bEnd);6371XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);6372XXH_ASSERT(state->bufferedSize == 0);6373XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));6374state->bufferedSize = (XXH32_hash_t)(bEnd-input);6375#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 16376/* save stack accumulators into state */6377XXH_memcpy(state->acc, acc, sizeof(acc));6378#endif6379}63806381return XXH_OK;6382}63836384/*! @ingroup XXH3_family */6385XXH_PUBLIC_API XXH_errorcode6386XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)6387{6388return XXH3_update(state, (const xxh_u8*)input, len,6389XXH3_accumulate, XXH3_scrambleAcc);6390}639163926393XXH_FORCE_INLINE void6394XXH3_digest_long (XXH64_hash_t* acc,6395const XXH3_state_t* state,6396const unsigned char* secret)6397{6398xxh_u8 lastStripe[XXH_STRIPE_LEN];6399const xxh_u8* lastStripePtr;64006401/*6402* Digest on a local copy. 
This way, the state remains unaltered, and it can6403* continue ingesting more input afterwards.6404*/6405XXH_memcpy(acc, state->acc, sizeof(state->acc));6406if (state->bufferedSize >= XXH_STRIPE_LEN) {6407/* Consume remaining stripes then point to remaining data in buffer */6408size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;6409size_t nbStripesSoFar = state->nbStripesSoFar;6410XXH3_consumeStripes(acc,6411&nbStripesSoFar, state->nbStripesPerBlock,6412state->buffer, nbStripes,6413secret, state->secretLimit,6414XXH3_accumulate, XXH3_scrambleAcc);6415lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;6416} else { /* bufferedSize < XXH_STRIPE_LEN */6417/* Copy to temp buffer */6418size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;6419XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */6420XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);6421XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);6422lastStripePtr = lastStripe;6423}6424/* Last stripe */6425XXH3_accumulate_512(acc,6426lastStripePtr,6427secret + state->secretLimit - XXH_SECRET_LASTACC_START);6428}64296430/*! @ingroup XXH3_family */6431XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)6432{6433const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;6434if (state->totalLen > XXH3_MIDSIZE_MAX) {6435XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];6436XXH3_digest_long(acc, state, secret);6437return XXH3_mergeAccs(acc,6438secret + XXH_SECRET_MERGEACCS_START,6439(xxh_u64)state->totalLen * XXH_PRIME64_1);6440}6441/* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */6442if (state->useSeed)6443return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);6444return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),6445secret, state->secretLimit + XXH_STRIPE_LEN);6446}6447#endif /* !XXH_NO_STREAM */644864496450/* ==========================================6451* XXH3 128 bits (a.k.a XXH128)6452* ==========================================6453* XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,6454* even without counting the significantly larger output size.6455*6456* For example, extra steps are taken to avoid the seed-dependent collisions6457* in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).6458*6459* This strength naturally comes at the cost of some speed, especially on short6460* lengths. Note that longer hashes are about as fast as the 64-bit version6461* due to it using only a slight modification of the 64-bit loop.6462*6463* XXH128 is also more oriented towards 64-bit machines. It is still extremely6464* fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).6465*/64666467XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t6468XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)6469{6470/* A doubled version of 1to3_64b with different constants. 
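 * As a worked example of the packing below (assuming ASCII input "ab", len == 2):
 * c1 = 0x61, c2 = 0x62, c3 = 0x62, so
 * combinedl = (0x61 << 16) | (0x62 << 24) | 0x62 | (2 << 8) = 0x62610262.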
*/6471XXH_ASSERT(input != NULL);6472XXH_ASSERT(1 <= len && len <= 3);6473XXH_ASSERT(secret != NULL);6474/*6475* len = 1: combinedl = { input[0], 0x01, input[0], input[0] }6476* len = 2: combinedl = { input[1], 0x02, input[0], input[1] }6477* len = 3: combinedl = { input[2], 0x03, input[0], input[1] }6478*/6479{ xxh_u8 const c1 = input[0];6480xxh_u8 const c2 = input[len >> 1];6481xxh_u8 const c3 = input[len - 1];6482xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)6483| ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);6484xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);6485xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;6486xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;6487xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;6488xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;6489XXH128_hash_t h128;6490h128.low64 = XXH64_avalanche(keyed_lo);6491h128.high64 = XXH64_avalanche(keyed_hi);6492return h128;6493}6494}64956496XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t6497XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)6498{6499XXH_ASSERT(input != NULL);6500XXH_ASSERT(secret != NULL);6501XXH_ASSERT(4 <= len && len <= 8);6502seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;6503{ xxh_u32 const input_lo = XXH_readLE32(input);6504xxh_u32 const input_hi = XXH_readLE32(input + len - 4);6505xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);6506xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;6507xxh_u64 const keyed = input_64 ^ bitflip;65086509/* Shift len to the left to ensure it is even, this avoids even multiplies. */6510XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));65116512m128.high64 += (m128.low64 << 1);6513m128.low64 ^= (m128.high64 >> 3);65146515m128.low64 = XXH_xorshift64(m128.low64, 35);6516m128.low64 *= PRIME_MX2;6517m128.low64 = XXH_xorshift64(m128.low64, 28);6518m128.high64 = XXH3_avalanche(m128.high64);6519return m128;6520}6521}65226523XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t6524XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)6525{6526XXH_ASSERT(input != NULL);6527XXH_ASSERT(secret != NULL);6528XXH_ASSERT(9 <= len && len <= 16);6529{ xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;6530xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;6531xxh_u64 const input_lo = XXH_readLE64(input);6532xxh_u64 input_hi = XXH_readLE64(input + len - 8);6533XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);6534/*6535* Put len in the middle of m128 to ensure that the length gets mixed to6536* both the low and high bits in the 128x64 multiply below.6537*/6538m128.low64 += (xxh_u64)(len - 1) << 54;6539input_hi ^= bitfliph;6540/*6541* Add the high 32 bits of input_hi to the high 32 bits of m128, then6542* add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to6543* the high 64 bits of m128.6544*6545* The best approach to this operation is different on 32-bit and 64-bit.6546*/6547if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */6548/*6549* 32-bit optimized version, which is more readable.6550*6551* On 32-bit, it removes an ADC and delays a dependency between the two6552* halves of m128.high64, but it generates an extra mask on 64-bit.6553*/6554m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, 
XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
        xxh_u64 const keyed = input_64 ^ bitflip;

        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));

        m128.high64 += (m128.low64 << 1);
        m128.low64  ^= (m128.high64 >> 3);

        m128.low64   = XXH_xorshift64(m128.low64, 35);
        m128.low64  *= PRIME_MX2;
        m128.low64   = XXH_xorshift64(m128.low64, 28);
        m128.high64  = XXH3_avalanche(m128.high64);
        return m128;
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
        xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
        /*
         * Put len in the middle of m128 to ensure that the length gets mixed to
         * both the low and high bits in the 128x64 multiply below.
         */
        m128.low64 += (xxh_u64)(len - 1) << 54;
        input_hi   ^= bitfliph;
        /*
         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
         * the high 64 bits of m128.
         *
         * The best approach to this operation is different on 32-bit and 64-bit.
         */
        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
            /*
             * 32-bit optimized version, which is more readable.
             *
             * On 32-bit, it removes an ADC and delays a dependency between the two
             * halves of m128.high64, but it generates an extra mask on 64-bit.
             */
            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
        } else {
            /*
             * 64-bit optimized (albeit more confusing) version.
             *
             * Uses some properties of addition and multiplication to remove the mask:
             *
             * Let:
             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
             *    c = XXH_PRIME32_2
             *
             *    b + (a * c)
             * Inverse Property: x + y - x == y
             *    b + (a * (1 + c - 1))
             * Distributive Property: x * (y + z) == (x * y) + (x * z)
             *    b + (a * 1) + (a * (c - 1))
             * Identity Property: x * 1 == x
             *    b + a + (a * (c - 1))
             *
             * Substitute a, b, and c:
             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             *
             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             */
            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
        }
        /* m128 ^= XXH_swap64(m128 >> 64); */
        m128.low64 ^= XXH_swap64(m128.high64);

        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
            h128.high64 += m128.high64 * XXH_PRIME64_2;

            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}
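/*
 * Sketch of a self-check (not part of the library) for the identity used in
 * the 64-bit branch above: with lo = (xxh_u32)x and hi = x & 0xFFFFFFFF00000000,
 * and any 32-bit constant c >= 1,
 *     hi + lo*c == x + lo*(c - 1)   (mod 2^64)
 * because hi + lo*c == hi + lo + lo*(c - 1) == x + lo*(c - 1).
 * The helper name below is invented for this example.
 *
 * @code{.c}
 * #include <assert.h>
 * #include <stdint.h>
 *
 * static void check_hi_lo_identity(uint64_t x, uint32_t c)
 * {
 *     uint64_t const lo = (uint32_t)x;
 *     uint64_t const hi = x & 0xFFFFFFFF00000000ULL;
 *     assert(c >= 1);
 *     assert(hi + lo * c == x + lo * (c - 1));
 * }
 * @endcode
 */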
/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t h128;
            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
            h128.low64  = XXH64_avalanche(seed ^ bitflipl);
            h128.high64 = XXH64_avalanche(seed ^ bitfliph);
            return h128;
    }   }
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
              const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}


XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;

#if XXH_SIZE_OPT >= 1
        {
            /* Smaller, but slightly slower. */
            unsigned int i = (unsigned int)(len - 1) / 32;
            do {
                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
            } while (i-- != 0);
        }
#else
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
#endif
        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_NO_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        unsigned i;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        /*
         * We set `i` to offset + 32. We do this so that unchanged
         * `len` can be used as upper bound. This reaches a sweet spot
         * where both x86 and aarch64 get simple agen and good codegen
         * for the loop.
         */
        for (i = 32; i < 160; i += 32) {
            acc = XXH128_mix32B(acc,
                                input  + i - 32,
                                input  + i - 16,
                                secret + i - 32,
                                seed);
        }
        acc.low64 = XXH3_avalanche(acc.low64);
        acc.high64 = XXH3_avalanche(acc.high64);
        /*
         * NB: `i <= len` will duplicate the last 32 bytes if
         * len % 32 was zero. This is an unfortunate necessity to keep
         * the hash result stable.
         */
        for (i=160; i <= len; i += 32) {
            acc = XXH128_mix32B(acc,
                                input + i - 32,
                                input + i - 16,
                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
                                seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc,
                            input + len - 16,
                            input + len - 32,
                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                            (XXH64_hash_t)0 - seed);

        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate f_acc,
                            XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   XXH128_hash_t h128;
        h128.low64  = XXH3_mergeAccs(acc,
                                     secret + XXH_SECRET_MERGEACCS_START,
                                     (xxh_u64)len * XXH_PRIME64_1);
        h128.high64 = XXH3_mergeAccs(acc,
                                     secret + secretSize
                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                     ~((xxh_u64)len * XXH_PRIME64_2));
        return h128;
    }
}

/*
 * It's important for performance that XXH3_hashLong() is not inlined.
 */
XXH_NO_INLINE XXH_PUREF XXH128_hash_t
XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64,
                           const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
                                       XXH3_accumulate, XXH3_scrambleAcc);
}

/*
 * It's important for performance to pass @p secretLen (when it's static)
 * to the compiler, so that it can properly optimize the vectorized loop.
 *
 * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
 * breaks -Og, this is XXH_NO_INLINE.
 */
XXH3_WITH_SECRET_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64,
                              const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
                                       XXH3_accumulate, XXH3_scrambleAcc);
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                     XXH64_hash_t seed64,
                                     XXH3_f_accumulate f_acc,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
{
    if (seed64 == 0)
        return XXH3_hashLong_128b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
                                           f_acc, f_scramble);
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed64);
        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
                                           f_acc, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void* input, size_t len,
                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
                                                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void* input, size_t len,
                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     */
    if (len <= 16)
        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hl128(input, len, seed64, secret, secretLen);
}


/* === Public XXH128 API === */

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
{
    return XXH3_128bits_internal(input, len, 0,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_default);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
{
    return XXH3_128bits_internal(input, len, 0,
                                 (const xxh_u8*)secret, secretSize,
                                 XXH3_hashLong_128b_withSecret);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_internal(input, len, seed,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_withSeed);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(input, len, seed);
}


/* === XXH3 128-bit streaming === */
#ifndef XXH_NO_STREAM
/*
 * All initialization and update functions are identical to the 64-bit streaming variant.
 * The only difference is the finalization routine.
 */
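/*
 * Usage note (illustrative sketch only): a typical streaming session with the
 * 128-bit variant. The helper name `hash128_two_chunks` is invented for this
 * example; the XXH3_createState()/reset/update/digest/freeState calls are the
 * real public API.
 *
 * @code{.c}
 * #include "xxhash.h"
 *
 * // Feed two separate buffers into a single 128-bit hashing session.
 * static XXH128_hash_t hash128_two_chunks(const void* p1, size_t n1,
 *                                         const void* p2, size_t n2)
 * {
 *     XXH128_hash_t result = { 0, 0 };
 *     XXH3_state_t* const state = XXH3_createState();
 *     if (state == NULL) return result;             // allocation failure
 *     (void)XXH3_128bits_reset(state);              // start a new session
 *     (void)XXH3_128bits_update(state, p1, n1);
 *     (void)XXH3_128bits_update(state, p2, n2);
 *     result = XXH3_128bits_digest(state);          // does not alter the state
 *     XXH3_freeState(state);
 *     return result;
 * }
 * @endcode
 */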
/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
{
    return XXH3_64bits_reset(statePtr);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
{
    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSeed(statePtr, seed);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
{
    return XXH3_64bits_update(state, input, len);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   XXH128_hash_t h128;
            h128.low64  = XXH3_mergeAccs(acc,
                                         secret + XXH_SECRET_MERGEACCS_START,
                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
            h128.high64 = XXH3_mergeAccs(acc,
                                         secret + state->secretLimit + XXH_STRIPE_LEN
                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                   secret, state->secretLimit + XXH_STRIPE_LEN);
}
#endif /* !XXH_NO_STREAM */
/* 128-bit utility functions */

/* return : 1 if equal, 0 if different */
/*! @ingroup XXH3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * @return : >0 if *h128_1  > *h128_2
 *           <0 if *h128_1  < *h128_2
 *           =0 if *h128_1 == *h128_2 */
/*! @ingroup XXH3_family */
XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : this assumes that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}

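/*
 * Usage note (illustrative sketch only): because XXH128_cmp() follows the
 * qsort() comparator contract, an array of XXH128_hash_t can be sorted
 * directly. The helper name `sort_hashes` is invented for this example.
 *
 * @code{.c}
 * #include <stdlib.h>
 * #include "xxhash.h"
 *
 * static void sort_hashes(XXH128_hash_t* hashes, size_t count)
 * {
 *     // Sorts by high64 first, then low64, as implemented above.
 *     qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
 * }
 * @endcode
 */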
/*====== Canonical representation ======*/
/*! @ingroup XXH3_family */
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}



/* ==========================================
 * Secret generators
 * ==========================================
 */
#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))

XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
{
    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
{
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(secretBuffer != NULL);
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
#else
    /* production mode, asserts are disabled */
    if (secretBuffer == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
#endif

    if (customSeedSize == 0) {
        customSeed = XXH3_kSecret;
        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
    }
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(customSeed != NULL);
#else
    if (customSeed == NULL) return XXH_ERROR;
#endif

    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
    {   size_t pos = 0;
        while (pos < secretSize) {
            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
            pos += toCopy;
    }   }

    {   size_t const nbSeg16 = secretSize / 16;
        size_t n;
        XXH128_canonical_t scrambler;
        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
        for (n=0; n<nbSeg16; n++) {
            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
            XXH3_combine16((char*)secretBuffer + n*16, h128);
        }
        /* last segment */
        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
    }
    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API void
XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
{
    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    XXH3_initCustomSecret(secret, seed);
    XXH_ASSERT(secretBuffer != NULL);
    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
}



/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
#  pragma GCC pop_options
#endif


#if defined (__cplusplus)
} /* extern "C" */
#endif

#endif /* XXH_NO_LONG_LONG */
#endif /* XXH_NO_XXH3 */

/*!
 * @}
 */
#endif /* XXH_IMPLEMENTATION */