Path: blob/master/Utilities/cmliblzma/common/tuklib_integer.h
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////
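
// A usage sketch (illustrative only; the function name, buffer layout, and
// offsets below are hypothetical and not part of this header): read a
// little endian field from a byte buffer and store it elsewhere in
// big endian order.
/*
#include <stdint.h>

static void
example_usage(uint8_t *buf)
{
	// Read a 32-bit little endian value from offset 4.
	uint32_t v = read32le(buf + 4);

	// Store it at offset 8 in big endian order. Since write32be()
	// can be a macro, the arguments must not have side effects.
	write32be(buf + 8, v);
}
*/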

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
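
// A minimal sanity sketch of the macros above (illustrative values; this
// snippet is not part of the original header). Byte swapping reverses the
// byte order, and the conv macros byteswap only when the requested byte
// order differs from the native one:
/*
	uint32_t a = byteswap32(UINT32_C(0x11223344)); // 0x44332211
#ifndef WORDS_BIGENDIAN
	uint32_t b = conv32be(UINT32_C(0x11223344));   // 0x44332211 (swapped)
	uint32_t c = conv32le(UINT32_C(0x11223344));   // 0x11223344 (no-op cast)
#endif
*/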


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) The value may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to the stack, and finally one 32-bit load from
//         the stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
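
// Because the write macros expand to the conv macros, which in turn may
// evaluate their argument several times (see the fallback byteswap macros),
// arguments with side effects are unsafe. A hypothetical example of the
// pitfall:
/*
	write32be(buf, *p++);       // Bad: *p++ may be evaluated more than once.
	write32be(buf, *p);         // OK: increment p separately.
	++p;
*/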

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}
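
// Alignment is entirely the caller's responsibility here. A hypothetical
// sketch of correct usage: memory from malloc() is suitably aligned for any
// basic type, so offsets that are multiples of the access size stay aligned.
/*
	uint8_t *buf = malloc(16);
	if (buf != NULL) {
		aligned_write32ne(buf + 4, 123); // Offset 4 keeps 32-bit alignment.
		uint32_t v = aligned_read32ne(buf + 4);
	}
*/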


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////
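
// A few concrete values for the functions below (illustrative, not part of
// the original header). For any non-zero n, bsr32(n) == 31 - clz32(n), and
// bsf32(n) is the same as ctz32(n):
/*
	bsr32(1) == 0            // The highest set bit of 1 is bit 0.
	bsr32(0x80000000) == 31  // The highest set bit is bit 31.
	clz32(1) == 31
	ctz32(8) == 3            // 8 is binary 1000: three trailing zeros.
*/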

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif