/* Bit and bytes utilities.12Bytes swap functions, reverse order of bytes:34- _Py_bswap16(uint16_t)5- _Py_bswap32(uint32_t)6- _Py_bswap64(uint64_t)7*/89#ifndef Py_INTERNAL_BITUTILS_H10#define Py_INTERNAL_BITUTILS_H11#ifdef __cplusplus12extern "C" {13#endif1415#ifndef Py_BUILD_CORE16# error "this header requires Py_BUILD_CORE define"17#endif1819#if defined(__GNUC__) \20&& ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8))21/* __builtin_bswap16() is available since GCC 4.8,22__builtin_bswap32() is available since GCC 4.3,23__builtin_bswap64() is available since GCC 4.3. */24# define _PY_HAVE_BUILTIN_BSWAP25#endif2627#ifdef _MSC_VER28/* Get _byteswap_ushort(), _byteswap_ulong(), _byteswap_uint64() */29# include <intrin.h>30#endif3132static inline uint16_t33_Py_bswap16(uint16_t word)34{35#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16)36return __builtin_bswap16(word);37#elif defined(_MSC_VER)38Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));39return _byteswap_ushort(word);40#else41// Portable implementation which doesn't rely on circular bit shift42return ( ((word & UINT16_C(0x00FF)) << 8)43| ((word & UINT16_C(0xFF00)) >> 8));44#endif45}4647static inline uint32_t48_Py_bswap32(uint32_t word)49{50#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32)51return __builtin_bswap32(word);52#elif defined(_MSC_VER)53Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));54return _byteswap_ulong(word);55#else56// Portable implementation which doesn't rely on circular bit shift57return ( ((word & UINT32_C(0x000000FF)) << 24)58| ((word & UINT32_C(0x0000FF00)) << 8)59| ((word & UINT32_C(0x00FF0000)) >> 8)60| ((word & UINT32_C(0xFF000000)) >> 24));61#endif62}6364static inline uint64_t65_Py_bswap64(uint64_t word)66{67#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64)68return __builtin_bswap64(word);69#elif defined(_MSC_VER)70return _byteswap_uint64(word);71#else72// Portable implementation which doesn't rely on circular bit shift73return ( ((word & UINT64_C(0x00000000000000FF)) << 56)74| ((word & UINT64_C(0x000000000000FF00)) << 40)75| ((word & UINT64_C(0x0000000000FF0000)) << 24)76| ((word & UINT64_C(0x00000000FF000000)) << 8)77| ((word & UINT64_C(0x000000FF00000000)) >> 8)78| ((word & UINT64_C(0x0000FF0000000000)) >> 24)79| ((word & UINT64_C(0x00FF000000000000)) >> 40)80| ((word & UINT64_C(0xFF00000000000000)) >> 56));81#endif82}838485// Population count: count the number of 1's in 'x'86// (number of bits set to 1), also known as the hamming weight.87//88// Implementation note. CPUID is not used, to test if x86 POPCNT instruction89// can be used, to keep the implementation simple. For example, Visual Studio90// __popcnt() is not used this reason. The clang and GCC builtin function can91// use the x86 POPCNT instruction if the target architecture has SSE4a or92// newer.93static inline int94_Py_popcount32(uint32_t x)95{96#if (defined(__clang__) || defined(__GNUC__))9798#if SIZEOF_INT >= 499Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));100return __builtin_popcount(x);101#else102// The C standard guarantees that unsigned long will always be big enough103// to hold a uint32_t value without losing information.104Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));105return __builtin_popcountl(x);106#endif107108#else109// 32-bit SWAR (SIMD Within A Register) popcount110111// Binary: 0 1 0 1 ...112const uint32_t M1 = 0x55555555;113// Binary: 00 11 00 11. ..114const uint32_t M2 = 0x33333333;115// Binary: 0000 1111 0000 1111 ...116const uint32_t M4 = 0x0F0F0F0F;117118// Put count of each 2 bits into those 2 bits119x = x - ((x >> 1) & M1);120// Put count of each 4 bits into those 4 bits121x = (x & M2) + ((x >> 2) & M2);122// Put count of each 8 bits into those 8 bits123x = (x + (x >> 4)) & M4;124// Sum of the 4 byte counts.125// Take care when considering changes to the next line. Portability and126// correctness are delicate here, thanks to C's "integer promotions" (C99127// §6.3.1.1p2). On machines where the `int` type has width greater than 32128// bits, `x` will be promoted to an `int`, and following C's "usual129// arithmetic conversions" (C99 §6.3.1.8), the multiplication will be130// performed as a multiplication of two `unsigned int` operands. In this131// case it's critical that we cast back to `uint32_t` in order to keep only132// the least significant 32 bits. On machines where the `int` type has133// width no greater than 32, the multiplication is of two 32-bit unsigned134// integer types, and the (uint32_t) cast is a no-op. In both cases, we135// avoid the risk of undefined behaviour due to overflow of a136// multiplication of signed integer types.137return (uint32_t)(x * 0x01010101U) >> 24;138#endif139}140141142// Return the index of the most significant 1 bit in 'x'. This is the smallest143// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0.144static inline int145_Py_bit_length(unsigned long x)146{147#if (defined(__clang__) || defined(__GNUC__))148if (x != 0) {149// __builtin_clzl() is available since GCC 3.4.150// Undefined behavior for x == 0.151return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x);152}153else {154return 0;155}156#elif defined(_MSC_VER)157// _BitScanReverse() is documented to search 32 bits.158Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);159unsigned long msb;160if (_BitScanReverse(&msb, x)) {161return (int)msb + 1;162}163else {164return 0;165}166#else167const int BIT_LENGTH_TABLE[32] = {1680, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,1695, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5170};171int msb = 0;172while (x >= 32) {173msb += 6;174x >>= 6;175}176msb += BIT_LENGTH_TABLE[x];177return msb;178#endif179}180181182#ifdef __cplusplus183}184#endif185#endif /* !Py_INTERNAL_BITUTILS_H */186187188