Path: blob/master/Utilities/cmliblzma/liblzma/lzma/fastpos.h
3156 views
// SPDX-License-Identifier: 0BSD12///////////////////////////////////////////////////////////////////////////////3//4/// \file fastpos.h5/// \brief Kind of two-bit version of bit scan reverse6///7// Authors: Igor Pavlov8// Lasse Collin9//10///////////////////////////////////////////////////////////////////////////////1112#ifndef LZMA_FASTPOS_H13#define LZMA_FASTPOS_H1415// LZMA encodes match distances by storing the highest two bits using16// a six-bit value [0, 63], and then the missing lower bits.17// Dictionary size is also stored using this encoding in the .xz18// file format header.19//20// fastpos.h provides a way to quickly find out the correct six-bit21// values. The following table gives some examples of this encoding:22//23// dist return24// 0 025// 1 126// 2 227// 3 328// 4 429// 5 430// 6 531// 7 532// 8 633// 11 634// 12 735// ... ...36// 15 737// 16 838// 17 839// ... ...40// 23 841// 24 942// 25 943// ... ...44//45//46// Provided functions or macros47// ----------------------------48//49// get_dist_slot(dist) is the basic version. get_dist_slot_2(dist)50// assumes that dist >= FULL_DISTANCES, thus the result is at least51// FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of52// get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist)53// should be tiny bit faster due to the assumption being made.54//55//56// Size vs. speed57// --------------58//59// With some CPUs that have fast BSR (bit scan reverse) instruction, the60// size optimized version is slightly faster than the bigger table based61// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III62// and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that63// would still have speed roughly comparable to the table version. Older64// x86 CPUs like the original Pentium have very slow BSR; on those systems65// the table version is a lot faster.66//67// On some CPUs, the table version is a lot faster when using position68// dependent code, but with position independent code the size optimized69// version is slightly faster. This occurs at least on 32-bit SPARC (no70// ASM optimizations).71//72// I'm making the table version the default, because that has good speed73// on all systems I have tried. The size optimized version is sometimes74// slightly faster, but sometimes it is a lot slower.7576#ifdef HAVE_SMALL77# define get_dist_slot(dist) \78((dist) <= 4 ? (dist) : get_dist_slot_2(dist))7980static inline uint32_t81get_dist_slot_2(uint32_t dist)82{83const uint32_t i = bsr32(dist);84return (i + i) + ((dist >> (i - 1)) & 1);85}868788#else8990#define FASTPOS_BITS 139192lzma_attr_visibility_hidden93extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS];949596#define fastpos_shift(extra, n) \97((extra) + (n) * (FASTPOS_BITS - 1))9899#define fastpos_limit(extra, n) \100(UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n)))101102#define fastpos_result(dist, extra, n) \103(uint32_t)(lzma_fastpos[(dist) >> fastpos_shift(extra, n)]) \104+ 2 * fastpos_shift(extra, n)105106107static inline uint32_t108get_dist_slot(uint32_t dist)109{110// If it is small enough, we can pick the result directly from111// the precalculated table.112if (dist < fastpos_limit(0, 0))113return lzma_fastpos[dist];114115if (dist < fastpos_limit(0, 1))116return fastpos_result(dist, 0, 1);117118return fastpos_result(dist, 0, 2);119}120121122#ifdef FULL_DISTANCES_BITS123static inline uint32_t124get_dist_slot_2(uint32_t dist)125{126assert(dist >= FULL_DISTANCES);127128if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0))129return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0);130131if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1))132return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1);133134return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2);135}136#endif137138#endif139140#endif141142143