Path: blob/master/thirdparty/basis_universal/encoder/basisu_enc.h
21461 views
// basisu_enc.h1// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.2//3// Licensed under the Apache License, Version 2.0 (the "License");4// you may not use this file except in compliance with the License.5// You may obtain a copy of the License at6//7// http://www.apache.org/licenses/LICENSE-2.08//9// Unless required by applicable law or agreed to in writing, software10// distributed under the License is distributed on an "AS IS" BASIS,11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.12// See the License for the specific language governing permissions and13// limitations under the License.14#pragma once15#include "../transcoder/basisu.h"16#include "../transcoder/basisu_transcoder_internal.h"1718#include <mutex>19#include <atomic>20#include <condition_variable>21#include <functional>22#include <thread>23#include <unordered_map>24#include <map>25#include <ostream>2627#if !defined(_WIN32) || defined(__MINGW32__)28#include <libgen.h>29#endif3031// This module is really just a huge grab bag of classes and helper functions needed by the encoder.3233// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.34#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)3536#if BASISU_SUPPORT_SSE37// Declared in basisu_kernels_imp.h, but we can't include that here otherwise it would lead to circular type errors.38extern void update_covar_matrix_16x16_sse41(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16);39#endif4041namespace basisu42{43extern uint8_t g_hamming_dist[256];44extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8];4546// true if basisu_encoder_init() has been called and returned.47extern bool g_library_initialized;4849// Encoder library initialization.50// This function MUST be called before encoding anything!51// Returns false if library initialization fails.52bool basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false);53void basisu_encoder_deinit();5455// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=156extern void detect_sse41();5758#if BASISU_SUPPORT_SSE59extern bool g_cpu_supports_sse41;60#else61const bool g_cpu_supports_sse41 = false;62#endif6364void error_vprintf(const char* pFmt, va_list args);65void error_printf(const char *pFmt, ...);6667template <typename... Args>68inline void fmt_error_printf(const char* pFmt, Args&&... args)69{70std::string res;71if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward<Args>(args))... }))72return;73error_printf("%s", res.c_str());74}7576void platform_sleep(uint32_t ms);7778// Helpers7980inline uint8_t clamp255(int32_t i)81{82return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);83}8485inline int left_shift32(int val, int shift)86{87assert((shift >= 0) && (shift < 32));88return static_cast<int>(static_cast<uint32_t>(val) << shift);89}9091inline uint32_t left_shift32(uint32_t val, int shift)92{93assert((shift >= 0) && (shift < 32));94return val << shift;95}9697inline int32_t clampi(int32_t value, int32_t low, int32_t high)98{99if (value < low)100value = low;101else if (value > high)102value = high;103return value;104}105106inline uint8_t mul_8(uint32_t v, uint32_t a)107{108v = v * a + 128;109return (uint8_t)((v + (v >> 8)) >> 8);110}111112inline int fast_roundf_int(float x)113{114return (x >= 0.0f) ? (int)(x + 0.5f) : (int)(x - 0.5f);115}116117inline int fast_floorf_int(float x)118{119int xi = (int)x; // Truncate towards zero120return ((x < 0.0f) && (x != (float)xi)) ? (xi - 1) : xi;121}122123inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)124{125assert(codesize <= 64);126uint64_t bits = 0;127uint32_t total_bits = 0;128129while (total_bits < codesize)130{131uint32_t byte_bit_offset = bit_offset & 7;132uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset);133134uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset;135byte_bits &= ((1 << bits_to_read) - 1);136137bits |= ((uint64_t)(byte_bits) << total_bits);138139total_bits += bits_to_read;140bit_offset += bits_to_read;141}142143return bits;144}145146inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)147{148assert(codesize <= 32);149uint32_t bits = 0;150uint32_t total_bits = 0;151152while (total_bits < codesize)153{154uint32_t byte_bit_offset = bit_offset & 7;155uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset);156157uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset;158byte_bits &= ((1 << bits_to_read) - 1);159160bits |= (byte_bits << total_bits);161162total_bits += bits_to_read;163bit_offset += bits_to_read;164}165166return bits;167}168169// Open interval170inline int bounds_check(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }171inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }172173// Closed interval174inline int bounds_check_incl(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v <= h); return v; }175inline uint32_t bounds_check_incl(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v <= h); return v; }176177inline uint32_t clz(uint32_t x)178{179if (!x)180return 32;181182uint32_t n = 0;183while ((x & 0x80000000) == 0)184{185x <<= 1u;186n++;187}188189return n;190}191192bool string_begins_with(const std::string& str, const char* pPhrase);193194// Case sensitive, returns -1 if can't find195inline int string_find_first(const std::string& str, const char* pPhrase)196{197size_t res = str.find(pPhrase, 0);198if (res == std::string::npos)199return -1;200return (int)res;201}202203// Hashing204205inline uint32_t bitmix32c(uint32_t v)206{207v = (v + 0x7ed55d16) + (v << 12);208v = (v ^ 0xc761c23c) ^ (v >> 19);209v = (v + 0x165667b1) + (v << 5);210v = (v + 0xd3a2646c) ^ (v << 9);211v = (v + 0xfd7046c5) + (v << 3);212v = (v ^ 0xb55a4f09) ^ (v >> 16);213return v;214}215216inline uint32_t bitmix32(uint32_t v)217{218v -= (v << 6);219v ^= (v >> 17);220v -= (v << 9);221v ^= (v << 4);222v -= (v << 3);223v ^= (v << 10);224v ^= (v >> 15);225return v;226}227228inline uint32_t wang_hash(uint32_t seed)229{230seed = (seed ^ 61) ^ (seed >> 16);231seed *= 9;232seed = seed ^ (seed >> 4);233seed *= 0x27d4eb2d;234seed = seed ^ (seed >> 15);235return seed;236}237238uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);239240template <typename Key>241struct bit_hasher242{243inline std::size_t operator()(const Key& k) const244{245return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k));246}247};248249struct string_hasher250{251inline std::size_t operator()(const std::string& k) const252{253size_t l = k.size();254if (!l)255return 0;256return hash_hsieh(reinterpret_cast<const uint8_t*>(k.c_str()), l);257}258};259260class running_stat261{262public:263running_stat() { clear(); }264265void clear()266{267m_n = 0;268m_total = 0;269m_old_m = 0;270m_new_m = 0;271m_old_s = 0;272m_new_s = 0;273m_min = 0;274m_max = 0;275}276277void push(double x)278{279m_n++;280m_total += x;281if (m_n == 1)282{283m_old_m = m_new_m = x;284m_old_s = 0.0;285m_min = x;286m_max = x;287}288else289{290// See Knuth TAOCP vol 2, 3rd edition, page 232291m_new_m = m_old_m + (x - m_old_m) / m_n;292m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m);293m_old_m = m_new_m;294m_old_s = m_new_s;295m_min = basisu::minimum(x, m_min);296m_max = basisu::maximum(x, m_max);297}298}299300uint32_t get_num() const301{302return m_n;303}304305double get_total() const306{307return m_total;308}309310double get_mean() const311{312return (m_n > 0) ? m_new_m : 0.0;313}314315// Returns sample variance316double get_variance() const317{318return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0);319}320321double get_std_dev() const322{323return sqrt(get_variance());324}325326double get_min() const327{328return m_min;329}330331double get_max() const332{333return m_max;334}335336private:337uint32_t m_n;338double m_total, m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;339};340341// Linear algebra342343template <uint32_t N, typename T>344class vec345{346protected:347T m_v[N];348349public:350enum { num_elements = N };351typedef T scalar_type;352353inline vec() { }354inline vec(eZero) { set_zero(); }355356explicit inline vec(T val) { set(val); }357inline vec(T v0, T v1) { set(v0, v1); }358inline vec(T v0, T v1, T v2) { set(v0, v1, v2); }359inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); }360inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; }361template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); }362363inline const T& operator[](uint32_t i) const { assert(i < N); return m_v[i]; }364inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; }365366inline T getX() const { return m_v[0]; }367inline T getY() const { static_assert(N >= 2, "N too small"); return m_v[1]; }368inline T getZ() const { static_assert(N >= 3, "N too small"); return m_v[2]; }369inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; }370371inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false; return true; }372inline bool operator!=(const vec& rhs) const { return !(*this == rhs); }373inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }374375inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }376inline void clear() { set_zero(); }377378template <uint32_t OtherN, typename OtherT>379inline vec &set(const vec<OtherN, OtherT> &other)380{381uint32_t i;382if ((const void *)(&other) == (const void *)(this))383return *this;384const uint32_t m = minimum(OtherN, N);385for (i = 0; i < m; i++)386m_v[i] = static_cast<T>(other[i]);387for (; i < N; i++)388m_v[i] = 0;389return *this;390}391392inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; }393inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; }394inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; }395396inline vec &set(T v0, T v1)397{398m_v[0] = v0;399if (N >= 2)400{401m_v[1] = v1;402clear_elements(2, N);403}404return *this;405}406407inline vec &set(T v0, T v1, T v2)408{409m_v[0] = v0;410if (N >= 2)411{412m_v[1] = v1;413if (N >= 3)414{415m_v[2] = v2;416clear_elements(3, N);417}418}419return *this;420}421422inline vec &set(T v0, T v1, T v2, T v3)423{424m_v[0] = v0;425if (N >= 2)426{427m_v[1] = v1;428if (N >= 3)429{430m_v[2] = v2;431432if (N >= 4)433{434m_v[3] = v3;435clear_elements(5, N);436}437}438}439return *this;440}441442inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; }443template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; }444445inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); }446inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); }447448inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; }449inline vec operator+ () const { return *this; }450inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; }451inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; }452inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; }453inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; }454inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; }455inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; }456457friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; }458friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; }459friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; }460friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; }461friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; }462friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; }463464static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; }465466inline T dot(const vec &rhs) const { return dot_product(*this, rhs); }467468inline T norm() const { return dot_product(*this, *this); }469inline T length() const { return sqrt(norm()); }470471inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; }472inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; }473474inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }475inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }476477inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; }478479inline vec get_normalized() const { vec res(*this); res.normalize_in_place(); return res; }480481inline vec &clamp(T l, T h)482{483for (uint32_t i = 0; i < N; i++)484m_v[i] = basisu::clamp(m_v[i], l, h);485return *this;486}487488static vec component_mul(const vec& a, const vec& b)489{490vec res;491for (uint32_t i = 0; i < N; i++)492res[i] = a[i] * b[i];493return res;494}495496static vec component_min(const vec& a, const vec& b)497{498vec res;499for (uint32_t i = 0; i < N; i++)500res[i] = minimum(a[i], b[i]);501return res;502}503504static vec component_max(const vec& a, const vec& b)505{506vec res;507for (uint32_t i = 0; i < N; i++)508res[i] = maximum(a[i], b[i]);509return res;510}511512static vec lerp(const vec& a, const vec& b, float s)513{514vec res;515for (uint32_t i = 0; i < N; i++)516res[i] = basisu::lerp(a[i], b[i], s);517return res;518}519};520521typedef vec<4, double> vec4D;522typedef vec<3, double> vec3D;523typedef vec<2, double> vec2D;524typedef vec<1, double> vec1D;525526typedef vec<6, float> vec6F;527typedef vec<5, float> vec5F;528typedef vec<4, float> vec4F;529typedef vec<3, float> vec3F;530typedef vec<2, float> vec2F;531typedef vec<1, float> vec1F;532533typedef vec<16, float> vec16F;534535template<uint32_t N, typename T> struct bitwise_copyable< vec<N, T> > { enum { cFlag = true }; };536template<uint32_t N, typename T> struct bitwise_movable< vec<N, T> > { enum { cFlag = true }; };537538template <uint32_t Rows, uint32_t Cols, typename T>539class matrix540{541public:542typedef vec<Rows, T> col_vec;543typedef vec<Cols, T> row_vec;544545typedef T scalar_type;546547enum { rows = Rows, cols = Cols };548549protected:550row_vec m_r[Rows];551552public:553inline matrix() {}554inline matrix(eZero) { set_zero(); }555inline matrix(const matrix &other) { for (uint32_t i = 0; i < Rows; i++) m_r[i] = other.m_r[i]; }556inline matrix &operator=(const matrix &rhs) { if (this != &rhs) for (uint32_t i = 0; i < Rows; i++) m_r[i] = rhs.m_r[i]; return *this; }557558inline T operator()(uint32_t r, uint32_t c) const { assert((r < Rows) && (c < Cols)); return m_r[r][c]; }559inline T &operator()(uint32_t r, uint32_t c) { assert((r < Rows) && (c < Cols)); return m_r[r][c]; }560561inline const row_vec &operator[](uint32_t r) const { assert(r < Rows); return m_r[r]; }562inline row_vec &operator[](uint32_t r) { assert(r < Rows); return m_r[r]; }563564inline matrix &set_zero()565{566for (uint32_t i = 0; i < Rows; i++)567m_r[i].set_zero();568return *this;569}570571inline matrix &set_identity()572{573for (uint32_t i = 0; i < Rows; i++)574{575m_r[i].set_zero();576if (i < Cols)577m_r[i][i] = 1.0f;578}579return *this;580}581};582583template<uint32_t R, uint32_t C, typename T> struct bitwise_copyable< matrix<R, C, T> > { enum { cFlag = true }; };584template<uint32_t R, uint32_t C, typename T> struct bitwise_movable< matrix<R, C, T> > { enum { cFlag = true }; };585586template<uint32_t N, typename VectorType>587inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix)588{589VectorType axis;590if (N == 1)591axis.set(1.0f);592else593{594for (uint32_t i = 0; i < N; i++)595axis[i] = lerp(.75f, 1.25f, i * (1.0f / maximum<int>(N - 1, 1)));596}597598VectorType prev_axis(axis);599600// Power iterations601for (uint32_t power_iter = 0; power_iter < 8; power_iter++)602{603VectorType trial_axis;604double max_sum = 0;605606for (uint32_t i = 0; i < N; i++)607{608double sum = 0;609for (uint32_t j = 0; j < N; j++)610sum += cmatrix[i][j] * axis[j];611612trial_axis[i] = static_cast<float>(sum);613614max_sum = maximum(fabs(sum), max_sum);615}616617if (max_sum != 0.0f)618trial_axis *= static_cast<float>(1.0f / max_sum);619620VectorType delta_axis(prev_axis - trial_axis);621622prev_axis = axis;623axis = trial_axis;624625if (delta_axis.norm() < .0024f)626break;627}628629return axis.normalize_in_place();630}631632template<typename T> inline void indirect_sort(uint32_t num_indices, uint32_t* pIndices, const T* pKeys)633{634for (uint32_t i = 0; i < num_indices; i++)635pIndices[i] = i;636637std::sort(638pIndices,639pIndices + num_indices,640[pKeys](uint32_t a, uint32_t b) { return pKeys[a] < pKeys[b]; }641);642}643644// 1-4 byte direct Radix sort.645template <typename T>646T* radix_sort(uint32_t num_vals, T* pBuf0, T* pBuf1, uint32_t key_ofs, uint32_t key_size)647{648assert(key_ofs < sizeof(T));649assert((key_size >= 1) && (key_size <= 4));650651uint32_t hist[256 * 4];652653memset(hist, 0, sizeof(hist[0]) * 256 * key_size);654655#define BASISU_GET_KEY(p) (*(uint32_t *)((uint8_t *)(p) + key_ofs))656657if (key_size == 4)658{659T* p = pBuf0;660T* q = pBuf0 + num_vals;661for (; p != q; p++)662{663const uint32_t key = BASISU_GET_KEY(p);664665hist[key & 0xFF]++;666hist[256 + ((key >> 8) & 0xFF)]++;667hist[512 + ((key >> 16) & 0xFF)]++;668hist[768 + ((key >> 24) & 0xFF)]++;669}670}671else if (key_size == 3)672{673T* p = pBuf0;674T* q = pBuf0 + num_vals;675for (; p != q; p++)676{677const uint32_t key = BASISU_GET_KEY(p);678679hist[key & 0xFF]++;680hist[256 + ((key >> 8) & 0xFF)]++;681hist[512 + ((key >> 16) & 0xFF)]++;682}683}684else if (key_size == 2)685{686T* p = pBuf0;687T* q = pBuf0 + (num_vals >> 1) * 2;688689for (; p != q; p += 2)690{691const uint32_t key0 = BASISU_GET_KEY(p);692const uint32_t key1 = BASISU_GET_KEY(p + 1);693694hist[key0 & 0xFF]++;695hist[256 + ((key0 >> 8) & 0xFF)]++;696697hist[key1 & 0xFF]++;698hist[256 + ((key1 >> 8) & 0xFF)]++;699}700701if (num_vals & 1)702{703const uint32_t key = BASISU_GET_KEY(p);704705hist[key & 0xFF]++;706hist[256 + ((key >> 8) & 0xFF)]++;707}708}709else710{711assert(key_size == 1);712if (key_size != 1)713return NULL;714715T* p = pBuf0;716T* q = pBuf0 + (num_vals >> 1) * 2;717718for (; p != q; p += 2)719{720const uint32_t key0 = BASISU_GET_KEY(p);721const uint32_t key1 = BASISU_GET_KEY(p + 1);722723hist[key0 & 0xFF]++;724hist[key1 & 0xFF]++;725}726727if (num_vals & 1)728{729const uint32_t key = BASISU_GET_KEY(p);730hist[key & 0xFF]++;731}732}733734T* pCur = pBuf0;735T* pNew = pBuf1;736737for (uint32_t pass = 0; pass < key_size; pass++)738{739const uint32_t* pHist = &hist[pass << 8];740741uint32_t offsets[256];742743uint32_t cur_ofs = 0;744for (uint32_t i = 0; i < 256; i += 2)745{746offsets[i] = cur_ofs;747cur_ofs += pHist[i];748749offsets[i + 1] = cur_ofs;750cur_ofs += pHist[i + 1];751}752753const uint32_t pass_shift = pass << 3;754755T* p = pCur;756T* q = pCur + (num_vals >> 1) * 2;757758for (; p != q; p += 2)759{760uint32_t c0 = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;761uint32_t c1 = (BASISU_GET_KEY(p + 1) >> pass_shift) & 0xFF;762763if (c0 == c1)764{765uint32_t dst_offset0 = offsets[c0];766767offsets[c0] = dst_offset0 + 2;768769pNew[dst_offset0] = p[0];770pNew[dst_offset0 + 1] = p[1];771}772else773{774uint32_t dst_offset0 = offsets[c0]++;775uint32_t dst_offset1 = offsets[c1]++;776777pNew[dst_offset0] = p[0];778pNew[dst_offset1] = p[1];779}780}781782if (num_vals & 1)783{784uint32_t c = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;785786uint32_t dst_offset = offsets[c];787offsets[c] = dst_offset + 1;788789pNew[dst_offset] = *p;790}791792T* t = pCur;793pCur = pNew;794pNew = t;795}796797return pCur;798}799800#undef BASISU_GET_KEY801802// Very simple job pool with no dependencies.803class job_pool804{805BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);806807public:808// num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc.809job_pool(uint32_t num_threads);810~job_pool();811812void add_job(const std::function<void()>& job);813void add_job(std::function<void()>&& job);814815void wait_for_all();816817size_t get_total_threads() const { return 1 + m_threads.size(); }818819private:820std::vector<std::thread> m_threads;821std::vector<std::function<void()> > m_queue;822823std::mutex m_mutex;824std::condition_variable m_has_work;825std::condition_variable m_no_more_jobs;826827uint32_t m_num_active_jobs;828829std::atomic<bool> m_kill_flag;830831std::atomic<int> m_num_active_workers;832833void job_thread(uint32_t index);834};835836// Simple 64-bit color class837838class color_rgba_i16839{840public:841union842{843int16_t m_comps[4];844845struct846{847int16_t r;848int16_t g;849int16_t b;850int16_t a;851};852};853854inline color_rgba_i16()855{856static_assert(sizeof(*this) == sizeof(int16_t)*4, "sizeof(*this) == sizeof(int16_t)*4");857}858859inline color_rgba_i16(int sr, int sg, int sb, int sa)860{861set(sr, sg, sb, sa);862}863864inline color_rgba_i16 &set(int sr, int sg, int sb, int sa)865{866m_comps[0] = (int16_t)clamp<int>(sr, INT16_MIN, INT16_MAX);867m_comps[1] = (int16_t)clamp<int>(sg, INT16_MIN, INT16_MAX);868m_comps[2] = (int16_t)clamp<int>(sb, INT16_MIN, INT16_MAX);869m_comps[3] = (int16_t)clamp<int>(sa, INT16_MIN, INT16_MAX);870return *this;871}872};873874class color_rgba875{876public:877union878{879uint8_t m_comps[4];880881struct882{883uint8_t r;884uint8_t g;885uint8_t b;886uint8_t a;887};888};889890inline color_rgba()891{892static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");893static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)");894}895896// Not too hot about this idea.897inline color_rgba(const basist::color32& other) :898r(other.r),899g(other.g),900b(other.b),901a(other.a)902{903}904905color_rgba& operator= (const basist::color32& rhs)906{907r = rhs.r;908g = rhs.g;909b = rhs.b;910a = rhs.a;911return *this;912}913914inline color_rgba(int y)915{916set(y);917}918919inline color_rgba(int y, int na)920{921set(y, na);922}923924inline color_rgba(int sr, int sg, int sb, int sa)925{926set(sr, sg, sb, sa);927}928929inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)930{931set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);932}933934inline color_rgba& set_noclamp_y(int y)935{936m_comps[0] = (uint8_t)y;937m_comps[1] = (uint8_t)y;938m_comps[2] = (uint8_t)y;939m_comps[3] = (uint8_t)255;940return *this;941}942943inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa)944{945m_comps[0] = (uint8_t)sr;946m_comps[1] = (uint8_t)sg;947m_comps[2] = (uint8_t)sb;948m_comps[3] = (uint8_t)sa;949return *this;950}951952inline color_rgba &set(int y)953{954m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));955m_comps[1] = m_comps[0];956m_comps[2] = m_comps[0];957m_comps[3] = 255;958return *this;959}960961inline color_rgba &set(int y, int na)962{963m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));964m_comps[1] = m_comps[0];965m_comps[2] = m_comps[0];966m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255));967return *this;968}969970inline color_rgba &set(int sr, int sg, int sb, int sa)971{972m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));973m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));974m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));975m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255));976return *this;977}978979inline color_rgba &set_rgb(int sr, int sg, int sb)980{981m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));982m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));983m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));984return *this;985}986987inline color_rgba &set_rgb(const color_rgba &other)988{989r = other.r;990g = other.g;991b = other.b;992return *this;993}994995inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; }996inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }997998inline void clear()999{1000m_comps[0] = 0;1001m_comps[1] = 0;1002m_comps[2] = 0;1003m_comps[3] = 0;1004}10051006inline bool operator== (const color_rgba &rhs) const1007{1008if (m_comps[0] != rhs.m_comps[0]) return false;1009if (m_comps[1] != rhs.m_comps[1]) return false;1010if (m_comps[2] != rhs.m_comps[2]) return false;1011if (m_comps[3] != rhs.m_comps[3]) return false;1012return true;1013}10141015inline bool operator!= (const color_rgba &rhs) const1016{1017return !(*this == rhs);1018}10191020inline bool operator<(const color_rgba &rhs) const1021{1022for (int i = 0; i < 4; i++)1023{1024if (m_comps[i] < rhs.m_comps[i])1025return true;1026else if (m_comps[i] != rhs.m_comps[i])1027return false;1028}1029return false;1030}10311032inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; }1033inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; }1034inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }10351036inline uint32_t get_bgra_uint32() const { return b | (g << 8) | (r << 16) | (a << 24); }1037inline uint32_t get_rgba_uint32() const { return r | (g << 8) | (b << 16) | (a << 24); }10381039inline basist::color32 get_color32() const1040{1041return basist::color32(r, g, b, a);1042}10431044static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); }1045static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); }1046};10471048typedef basisu::vector<color_rgba> color_rgba_vec;10491050const color_rgba g_black_color(0, 0, 0, 255);1051const color_rgba g_black_trans_color(0, 0, 0, 0);1052const color_rgba g_white_color(255, 255, 255, 255);10531054inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1)1055{1056int dr = r0 - r1, dg = g0 - g1, db = b0 - b1;1057return dr * dr + dg * dg + db * db;1058}10591060inline int color_distance(int r0, int g0, int b0, int a0, int r1, int g1, int b1, int a1)1061{1062int dr = r0 - r1, dg = g0 - g1, db = b0 - b1, da = a0 - a1;1063return dr * dr + dg * dg + db * db + da * da;1064}10651066inline int color_distance(const color_rgba &c0, const color_rgba &c1, bool alpha)1067{1068if (alpha)1069return color_distance(c0.r, c0.g, c0.b, c0.a, c1.r, c1.g, c1.b, c1.a);1070else1071return color_distance(c0.r, c0.g, c0.b, c1.r, c1.g, c1.b);1072}10731074// TODO: Allow user to control channel weightings.1075inline uint32_t color_distance(bool perceptual, const color_rgba &e1, const color_rgba &e2, bool alpha)1076{1077if (perceptual)1078{1079#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE1080const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;1081const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;10821083const float cr1 = e1.r - l1;1084const float cr2 = e2.r - l2;10851086const float cb1 = e1.b - l1;1087const float cb2 = e2.b - l2;10881089const float dl = l1 - l2;1090const float dcr = cr1 - cr2;1091const float dcb = cb1 - cb2;10921093uint32_t d = static_cast<uint32_t>(32.0f*4.0f*dl*dl + 32.0f*2.0f*(.5f / (1.0f - .2126f))*(.5f / (1.0f - .2126f))*dcr*dcr + 32.0f*.25f*(.5f / (1.0f - .0722f))*(.5f / (1.0f - .0722f))*dcb*dcb);10941095if (alpha)1096{1097int da = static_cast<int>(e1.a) - static_cast<int>(e2.a);1098d += static_cast<uint32_t>(128.0f*da*da);1099}11001101return d;1102#elif 11103int dr = e1.r - e2.r;1104int dg = e1.g - e2.g;1105int db = e1.b - e2.b;11061107#if 01108int delta_l = dr * 27 + dg * 92 + db * 9;1109int delta_cr = dr * 128 - delta_l;1110int delta_cb = db * 128 - delta_l;11111112uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) +1113((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) +1114((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U);1115#else1116int64_t delta_l = dr * 27 + dg * 92 + db * 9;1117int64_t delta_cr = dr * 128 - delta_l;1118int64_t delta_cb = db * 128 - delta_l;11191120uint32_t id = ((uint32_t)((delta_l * delta_l) >> 7U)) +1121((((uint32_t)((delta_cr * delta_cr) >> 7U)) * 26U) >> 7U) +1122((((uint32_t)((delta_cb * delta_cb) >> 7U)) * 3U) >> 7U);1123#endif11241125if (alpha)1126{1127int da = (e1.a - e2.a) << 7;1128// This shouldn't overflow if da is 255 or -255: 29.99 bits after squaring.1129id += ((uint32_t)(da * da) >> 7U);1130}11311132return id;1133#else1134int dr = e1.r - e2.r;1135int dg = e1.g - e2.g;1136int db = e1.b - e2.b;11371138int64_t delta_l = dr * 27 + dg * 92 + db * 9;1139int64_t delta_cr = dr * 128 - delta_l;1140int64_t delta_cb = db * 128 - delta_l;11411142int64_t id = ((delta_l * delta_l) * 128) +1143((delta_cr * delta_cr) * 26) +1144((delta_cb * delta_cb) * 3);11451146if (alpha)1147{1148int64_t da = (e1.a - e2.a);1149id += (da * da) * 128;1150}11511152int d = (id + 8192) >> 14;11531154return d;1155#endif1156}1157else1158return color_distance(e1, e2, alpha);1159}11601161static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b)1162{1163const int dl = a.r - b.r;1164const int da = a.a - b.a;1165return dl * dl + da * da;1166}11671168// String helpers11691170inline int string_find_right(const std::string& filename, char c)1171{1172size_t result = filename.find_last_of(c);1173return (result == std::string::npos) ? -1 : (int)result;1174}11751176inline std::string string_get_extension(const std::string &filename)1177{1178int sep = -1;1179#ifdef _WIN321180sep = string_find_right(filename, '\\');1181#endif1182if (sep < 0)1183sep = string_find_right(filename, '/');11841185int dot = string_find_right(filename, '.');1186if (dot <= sep)1187return "";11881189std::string result(filename);1190result.erase(0, dot + 1);11911192return result;1193}11941195inline bool string_remove_extension(std::string &filename)1196{1197int sep = -1;1198#ifdef _WIN321199sep = string_find_right(filename, '\\');1200#endif1201if (sep < 0)1202sep = string_find_right(filename, '/');12031204int dot = string_find_right(filename, '.');1205if ((dot < sep) || (dot < 0))1206return false;12071208filename.resize(dot);12091210return true;1211}12121213inline std::string string_tolower(const std::string& s)1214{1215std::string result(s);1216for (size_t i = 0; i < result.size(); i++)1217{1218result[i] = (char)tolower((uint8_t)(result[i]));1219}1220return result;1221}12221223inline char *strcpy_safe(char *pDst, size_t dst_len, const char *pSrc)1224{1225assert(pDst && pSrc && dst_len);1226if (!dst_len)1227return pDst;12281229const size_t src_len = strlen(pSrc);1230const size_t src_len_plus_terminator = src_len + 1;12311232if (src_len_plus_terminator <= dst_len)1233memcpy(pDst, pSrc, src_len_plus_terminator);1234else1235{1236if (dst_len > 1)1237memcpy(pDst, pSrc, dst_len - 1);1238pDst[dst_len - 1] = '\0';1239}12401241return pDst;1242}12431244inline bool string_ends_with(const std::string& s, char c)1245{1246return (s.size() != 0) && (s.back() == c);1247}12481249inline bool string_split_path(const char *p, std::string *pDrive, std::string *pDir, std::string *pFilename, std::string *pExt)1250{1251#ifdef _MSC_VER1252char drive_buf[_MAX_DRIVE] = { 0 };1253char dir_buf[_MAX_DIR] = { 0 };1254char fname_buf[_MAX_FNAME] = { 0 };1255char ext_buf[_MAX_EXT] = { 0 };12561257errno_t error = _splitpath_s(p,1258pDrive ? drive_buf : NULL, pDrive ? _MAX_DRIVE : 0,1259pDir ? dir_buf : NULL, pDir ? _MAX_DIR : 0,1260pFilename ? fname_buf : NULL, pFilename ? _MAX_FNAME : 0,1261pExt ? ext_buf : NULL, pExt ? _MAX_EXT : 0);1262if (error != 0)1263return false;12641265if (pDrive) *pDrive = drive_buf;1266if (pDir) *pDir = dir_buf;1267if (pFilename) *pFilename = fname_buf;1268if (pExt) *pExt = ext_buf;1269return true;1270#else1271char dirtmp[1024], nametmp[1024];1272strcpy_safe(dirtmp, sizeof(dirtmp), p);1273strcpy_safe(nametmp, sizeof(nametmp), p);12741275if (pDrive)1276pDrive->resize(0);12771278const char *pDirName = dirname(dirtmp);1279const char* pBaseName = basename(nametmp);1280if ((!pDirName) || (!pBaseName))1281return false;12821283if (pDir)1284{1285*pDir = pDirName;1286if ((pDir->size()) && (pDir->back() != '/'))1287*pDir += "/";1288}12891290if (pFilename)1291{1292*pFilename = pBaseName;1293string_remove_extension(*pFilename);1294}12951296if (pExt)1297{1298*pExt = pBaseName;1299*pExt = string_get_extension(*pExt);1300if (pExt->size())1301*pExt = "." + *pExt;1302}13031304return true;1305#endif1306}13071308inline bool is_path_separator(char c)1309{1310#ifdef _WIN321311return (c == '/') || (c == '\\');1312#else1313return (c == '/');1314#endif1315}13161317inline bool is_drive_separator(char c)1318{1319#ifdef _WIN321320return (c == ':');1321#else1322(void)c;1323return false;1324#endif1325}13261327inline void string_combine_path(std::string &dst, const char *p, const char *q)1328{1329std::string temp(p);1330if (temp.size() && !is_path_separator(q[0]))1331{1332if (!is_path_separator(temp.back()))1333temp.append(1, BASISU_PATH_SEPERATOR_CHAR);1334}1335temp += q;1336dst.swap(temp);1337}13381339inline void string_combine_path(std::string &dst, const char *p, const char *q, const char *r)1340{1341string_combine_path(dst, p, q);1342string_combine_path(dst, dst.c_str(), r);1343}13441345inline void string_combine_path_and_extension(std::string &dst, const char *p, const char *q, const char *r, const char *pExt)1346{1347string_combine_path(dst, p, q, r);1348if ((!string_ends_with(dst, '.')) && (pExt[0]) && (pExt[0] != '.'))1349dst.append(1, '.');1350dst.append(pExt);1351}13521353inline bool string_get_pathname(const char *p, std::string &path)1354{1355std::string temp_drive, temp_path;1356if (!string_split_path(p, &temp_drive, &temp_path, NULL, NULL))1357return false;1358string_combine_path(path, temp_drive.c_str(), temp_path.c_str());1359return true;1360}13611362inline bool string_get_filename(const char *p, std::string &filename)1363{1364std::string temp_ext;1365if (!string_split_path(p, nullptr, nullptr, &filename, &temp_ext))1366return false;1367filename += temp_ext;1368return true;1369}13701371class rand1372{1373std::mt19937 m_mt;13741375public:1376rand() { }13771378rand(uint32_t s) { seed(s); }1379void seed(uint32_t s) { m_mt.seed(s); }13801381// between [l,h]1382int irand(int l, int h) { std::uniform_int_distribution<int> d(l, h); return d(m_mt); }13831384uint32_t urand32() { return static_cast<uint32_t>(irand(INT32_MIN, INT32_MAX)); }13851386bool bit() { return irand(0, 1) == 1; }13871388uint8_t byte() { return static_cast<uint8_t>(urand32()); }13891390// between [l,h)1391float frand(float l, float h) { std::uniform_real_distribution<float> d(l, h); return d(m_mt); }13921393float gaussian(float mean, float stddev) { std::normal_distribution<float> d(mean, stddev); return d(m_mt); }1394};13951396class priority_queue1397{1398public:1399priority_queue() :1400m_size(0)1401{1402}14031404void clear()1405{1406m_heap.clear();1407m_size = 0;1408}14091410void init(uint32_t max_entries, uint32_t first_index, float first_priority)1411{1412m_heap.resize(max_entries + 1);1413m_heap[1].m_index = first_index;1414m_heap[1].m_priority = first_priority;1415m_size = 1;1416}14171418inline uint32_t size() const { return m_size; }14191420inline uint32_t get_top_index() const { return m_heap[1].m_index; }1421inline float get_top_priority() const { return m_heap[1].m_priority; }14221423inline void delete_top()1424{1425assert(m_size > 0);1426m_heap[1] = m_heap[m_size];1427m_size--;1428if (m_size)1429down_heap(1);1430}14311432inline void add_heap(uint32_t index, float priority)1433{1434m_size++;14351436uint32_t k = m_size;14371438if (m_size >= m_heap.size())1439m_heap.resize(m_size + 1);14401441for (;;)1442{1443uint32_t parent_index = k >> 1;1444if ((!parent_index) || (m_heap[parent_index].m_priority > priority))1445break;1446m_heap[k] = m_heap[parent_index];1447k = parent_index;1448}14491450m_heap[k].m_index = index;1451m_heap[k].m_priority = priority;1452}14531454private:1455struct entry1456{1457uint32_t m_index;1458float m_priority;1459};14601461basisu::vector<entry> m_heap;1462uint32_t m_size;14631464// Push down entry at index1465inline void down_heap(uint32_t heap_index)1466{1467uint32_t orig_index = m_heap[heap_index].m_index;1468const float orig_priority = m_heap[heap_index].m_priority;14691470uint32_t child_index;1471while ((child_index = (heap_index << 1)) <= m_size)1472{1473if ((child_index < m_size) && (m_heap[child_index].m_priority < m_heap[child_index + 1].m_priority)) ++child_index;1474if (orig_priority > m_heap[child_index].m_priority)1475break;1476m_heap[heap_index] = m_heap[child_index];1477heap_index = child_index;1478}14791480m_heap[heap_index].m_index = orig_index;1481m_heap[heap_index].m_priority = orig_priority;1482}1483};14841485// Tree structured vector quantization (TSVQ)14861487template <typename TrainingVectorType>1488class tree_vector_quant1489{1490public:1491typedef TrainingVectorType training_vec_type;1492typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;1493typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;14941495tree_vector_quant() :1496m_next_codebook_index(0)1497{1498}14991500void clear()1501{1502clear_vector(m_training_vecs);1503clear_vector(m_nodes);1504m_next_codebook_index = 0;1505}15061507void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }15081509size_t get_total_training_vecs() const { return m_training_vecs.size(); }1510const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; }1511array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; }15121513void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const1514{1515for (uint32_t i = 0; i < m_nodes.size(); i++)1516{1517const tsvq_node &n = m_nodes[i];1518if (!n.is_leaf())1519continue;15201521codebook.resize(codebook.size() + 1);1522codebook.back() = n.m_training_vecs;1523}1524}15251526void retrieve(basisu::vector<TrainingVectorType> &codebook) const1527{1528for (uint32_t i = 0; i < m_nodes.size(); i++)1529{1530const tsvq_node &n = m_nodes[i];1531if (!n.is_leaf())1532continue;15331534codebook.resize(codebook.size() + 1);1535codebook.back() = n.m_origin;1536}1537}15381539void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const1540{1541uint_vec node_stack;1542node_stack.reserve(512);15431544codebook.resize(0);1545codebook.reserve(max_clusters);15461547uint32_t node_index = 0;15481549while (true)1550{1551const tsvq_node& cur = m_nodes[node_index];15521553if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))1554{1555codebook.resize(codebook.size() + 1);1556codebook.back() = cur.m_training_vecs;15571558if (node_stack.empty())1559break;15601561node_index = node_stack.back();1562node_stack.pop_back();1563continue;1564}15651566node_stack.push_back(cur.m_right_index);1567node_index = cur.m_left_index;1568}1569}15701571bool generate(uint32_t max_size)1572{1573if (!m_training_vecs.size())1574return false;15751576m_next_codebook_index = 0;15771578clear_vector(m_nodes);1579m_nodes.reserve(max_size * 2 + 1);15801581m_nodes.push_back(prepare_root());15821583priority_queue var_heap;1584var_heap.init(max_size, 0, m_nodes[0].m_var);15851586basisu::vector<uint32_t> l_children, r_children;15871588// Now split the worst nodes1589l_children.reserve(m_training_vecs.size() + 1);1590r_children.reserve(m_training_vecs.size() + 1);15911592uint32_t total_leaf_nodes = 1;15931594//interval_timer tm;1595//tm.start();15961597while ((var_heap.size()) && (total_leaf_nodes < max_size))1598{1599const uint32_t node_index = var_heap.get_top_index();1600const tsvq_node &node = m_nodes[node_index];16011602assert(node.m_var == var_heap.get_top_priority());1603assert(node.is_leaf());16041605var_heap.delete_top();16061607if (node.m_training_vecs.size() > 1)1608{1609if (split_node(node_index, var_heap, l_children, r_children))1610{1611// This removes one leaf node (making an internal node) and replaces it with two new leaves, so +1 total.1612total_leaf_nodes += 1;1613}1614}1615}16161617//debug_printf("tree_vector_quant::generate %u: %3.3f secs\n", TrainingVectorType::num_elements, tm.get_elapsed_secs());16181619return true;1620}16211622private:1623class tsvq_node1624{1625public:1626inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }16271628// vecs is erased1629inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }16301631inline bool is_leaf() const { return m_left_index < 0; }16321633float m_var;1634uint64_t m_weight;1635TrainingVectorType m_origin;1636int32_t m_left_index, m_right_index;1637basisu::vector<uint32_t> m_training_vecs;1638int m_codebook_index;1639};16401641typedef basisu::vector<tsvq_node> tsvq_node_vec;1642tsvq_node_vec m_nodes;16431644array_of_weighted_training_vecs m_training_vecs;16451646uint32_t m_next_codebook_index;16471648tsvq_node prepare_root() const1649{1650double ttsum = 0.0f;16511652// Prepare root node containing all training vectors1653tsvq_node root;1654root.m_training_vecs.reserve(m_training_vecs.size());16551656for (uint32_t i = 0; i < m_training_vecs.size(); i++)1657{1658const TrainingVectorType &v = m_training_vecs[i].first;1659const uint64_t weight = m_training_vecs[i].second;16601661root.m_training_vecs.push_back(i);16621663root.m_origin += (v * static_cast<float>(weight));1664root.m_weight += weight;16651666ttsum += v.dot(v) * weight;1667}16681669root.m_var = static_cast<float>(ttsum - (root.m_origin.dot(root.m_origin) / root.m_weight));16701671root.m_origin *= (1.0f / root.m_weight);16721673return root;1674}16751676bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)1677{1678TrainingVectorType l_child_org, r_child_org;1679uint64_t l_weight = 0, r_weight = 0;1680float l_var = 0.0f, r_var = 0.0f;16811682// Compute initial left/right child origins1683if (!prep_split(m_nodes[node_index], l_child_org, r_child_org))1684return false;16851686// Use k-means iterations to refine these children vectors1687if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children))1688return false;16891690// Create children1691const uint32_t l_child_index = (uint32_t)m_nodes.size(), r_child_index = (uint32_t)m_nodes.size() + 1;16921693m_nodes[node_index].m_left_index = l_child_index;1694m_nodes[node_index].m_right_index = r_child_index;16951696m_nodes[node_index].m_codebook_index = m_next_codebook_index;1697m_next_codebook_index++;16981699m_nodes.resize(m_nodes.size() + 2);17001701tsvq_node &l_child = m_nodes[l_child_index], &r_child = m_nodes[r_child_index];17021703l_child.set(l_child_org, l_weight, l_var, l_children);1704r_child.set(r_child_org, r_weight, r_var, r_children);17051706if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1))1707{1708TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first);17091710for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++)1711{1712if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first))1713{1714l_child.m_var = 1e-4f;1715break;1716}1717}1718}17191720if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1))1721{1722TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first);17231724for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++)1725{1726if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first))1727{1728r_child.m_var = 1e-4f;1729break;1730}1731}1732}17331734if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1))1735var_heap.add_heap(l_child_index, l_child.m_var);17361737if ((r_child.m_var > 0.0f) && (r_child.m_training_vecs.size() > 1))1738var_heap.add_heap(r_child_index, r_child.m_var);17391740return true;1741}17421743TrainingVectorType compute_split_axis(const tsvq_node &node) const1744{1745const uint32_t N = TrainingVectorType::num_elements;17461747matrix<N, N, float> cmatrix;17481749if ((N != 16) || (!g_cpu_supports_sse41))1750{1751cmatrix.set_zero();17521753// Compute covariance matrix from weighted input vectors1754for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1755{1756const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);1757const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);17581759for (uint32_t x = 0; x < N; x++)1760for (uint32_t y = x; y < N; y++)1761cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];1762}1763}1764else1765{1766#if BASISU_SUPPORT_SSE1767// Specialize the case with 16x16 matrices, which are quite expensive without SIMD.1768// This SSE function takes pointers to void types, so do some sanity checks.1769assert(sizeof(TrainingVectorType) == sizeof(float) * 16);1770assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>));1771update_covar_matrix_16x16_sse41(node.m_training_vecs.size_u32(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);1772#endif1773}17741775const float renorm_scale = 1.0f / node.m_weight;17761777for (uint32_t x = 0; x < N; x++)1778for (uint32_t y = x; y < N; y++)1779cmatrix[x][y] *= renorm_scale;17801781// Diagonal flip1782for (uint32_t x = 0; x < (N - 1); x++)1783for (uint32_t y = x + 1; y < N; y++)1784cmatrix[y][x] = cmatrix[x][y];17851786return compute_pca_from_covar<N, TrainingVectorType>(cmatrix);1787}17881789bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const1790{1791//const uint32_t N = TrainingVectorType::num_elements;17921793if (2 == node.m_training_vecs.size())1794{1795l_child_result = m_training_vecs[node.m_training_vecs[0]].first;1796r_child_result = m_training_vecs[node.m_training_vecs[1]].first;1797return true;1798}17991800TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f);1801double l_weight = 0.0f, r_weight = 0.0f;18021803// Compute initial left/right children1804for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1805{1806const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;18071808const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;18091810double t = (v - node.m_origin).dot(axis);1811if (t >= 0.0f)1812{1813r_child += v * weight;1814r_weight += weight;1815}1816else1817{1818l_child += v * weight;1819l_weight += weight;1820}1821}18221823if ((l_weight > 0.0f) && (r_weight > 0.0f))1824{1825l_child_result = l_child * static_cast<float>(1.0f / l_weight);1826r_child_result = r_child * static_cast<float>(1.0f / r_weight);1827}1828else1829{1830TrainingVectorType l(1e+20f);1831TrainingVectorType h(-1e+20f);1832for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1833{1834const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;18351836l = TrainingVectorType::component_min(l, v);1837h = TrainingVectorType::component_max(h, v);1838}18391840TrainingVectorType r(h - l);18411842float largest_axis_v = 0.0f;1843int largest_axis_index = -1;1844for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++)1845{1846if (r[i] > largest_axis_v)1847{1848largest_axis_v = r[i];1849largest_axis_index = i;1850}1851}18521853if (largest_axis_index < 0)1854return false;18551856basisu::vector<float> keys(node.m_training_vecs.size());1857for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1858keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];18591860uint_vec indices(node.m_training_vecs.size());1861indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]);18621863l_child.set_zero();1864l_weight = 0;18651866r_child.set_zero();1867r_weight = 0;18681869const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2;1870for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1871{1872const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;18731874const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;18751876if (i < half_index)1877{1878l_child += v * weight;1879l_weight += weight;1880}1881else1882{1883r_child += v * weight;1884r_weight += weight;1885}1886}18871888if ((l_weight > 0.0f) && (r_weight > 0.0f))1889{1890l_child_result = l_child * static_cast<float>(1.0f / l_weight);1891r_child_result = r_child * static_cast<float>(1.0f / r_weight);1892}1893else1894{1895l_child_result = l;1896r_child_result = h;1897}1898}18991900return true;1901}19021903bool refine_split(const tsvq_node &node,1904TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,1905TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const1906{1907l_children.reserve(node.m_training_vecs.size());1908r_children.reserve(node.m_training_vecs.size());19091910float prev_total_variance = 1e+10f;19111912// Refine left/right children locations using k-means iterations1913const uint32_t cMaxIters = 6;1914for (uint32_t iter = 0; iter < cMaxIters; iter++)1915{1916l_children.resize(0);1917r_children.resize(0);19181919TrainingVectorType new_l_child(cZero), new_r_child(cZero);19201921double l_ttsum = 0.0f, r_ttsum = 0.0f;19221923l_weight = 0;1924r_weight = 0;19251926for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1927{1928const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;1929const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;19301931double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v);19321933if (left_dist2 >= right_dist2)1934{1935new_r_child += (v * static_cast<float>(weight));1936r_weight += weight;19371938r_ttsum += weight * v.dot(v);1939r_children.push_back(node.m_training_vecs[i]);1940}1941else1942{1943new_l_child += (v * static_cast<float>(weight));1944l_weight += weight;19451946l_ttsum += weight * v.dot(v);1947l_children.push_back(node.m_training_vecs[i]);1948}1949}19501951// Node is unsplittable using the above algorithm - try something else to split it up.1952if ((!l_weight) || (!r_weight))1953{1954l_children.resize(0);1955new_l_child.set(0.0f);1956l_ttsum = 0.0f;1957l_weight = 0;19581959r_children.resize(0);1960new_r_child.set(0.0f);1961r_ttsum = 0.0f;1962r_weight = 0;19631964TrainingVectorType firstVec;1965for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)1966{1967const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;1968const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;19691970if ((!i) || (v == firstVec))1971{1972firstVec = v;19731974new_r_child += (v * static_cast<float>(weight));1975r_weight += weight;19761977r_ttsum += weight * v.dot(v);1978r_children.push_back(node.m_training_vecs[i]);1979}1980else1981{1982new_l_child += (v * static_cast<float>(weight));1983l_weight += weight;19841985l_ttsum += weight * v.dot(v);1986l_children.push_back(node.m_training_vecs[i]);1987}1988}19891990if ((!l_weight) || (!r_weight))1991return false;1992}19931994l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight));1995r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight));19961997new_l_child *= (1.0f / l_weight);1998new_r_child *= (1.0f / r_weight);19992000l_child = new_l_child;2001r_child = new_r_child;20022003float total_var = l_var + r_var;2004const float cGiveupVariance = .00001f;2005if (total_var < cGiveupVariance)2006break;20072008// Check to see if the variance has settled2009const float cVarianceDeltaThresh = .00125f;2010if (((prev_total_variance - total_var) / total_var) < cVarianceDeltaThresh)2011break;20122013prev_total_variance = total_var;2014}20152016return true;2017}2018};20192020struct weighted_block_group2021{2022uint64_t m_total_weight;2023uint_vec m_indices;2024};20252026template<typename Quantizer>2027bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,2028uint32_t max_codebook_size, uint32_t max_parent_codebook_size,2029basisu::vector<uint_vec>& codebook,2030basisu::vector<uint_vec>& parent_codebook,2031uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)2032{2033codebook.resize(0);2034parent_codebook.resize(0);20352036if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16))2037{2038if (!q.generate(max_codebook_size))2039return false;20402041q.retrieve(codebook);20422043if (max_parent_codebook_size)2044q.retrieve(max_parent_codebook_size, parent_codebook);20452046return true;2047}20482049const uint32_t cMaxThreads = 16;2050if (max_threads > cMaxThreads)2051max_threads = cMaxThreads;20522053if (!q.generate(max_threads))2054return false;20552056basisu::vector<uint_vec> initial_codebook;20572058q.retrieve(initial_codebook);20592060if (initial_codebook.size() < max_threads)2061{2062codebook = initial_codebook;20632064if (max_parent_codebook_size)2065q.retrieve(max_parent_codebook_size, parent_codebook);20662067return true;2068}20692070Quantizer quantizers[cMaxThreads];20712072bool success_flags[cMaxThreads];2073clear_obj(success_flags);20742075basisu::vector<uint_vec> local_clusters[cMaxThreads];2076basisu::vector<uint_vec> local_parent_clusters[cMaxThreads];20772078for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)2079{2080pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {20812082Quantizer& lq = quantizers[thread_iter];2083uint_vec& cluster_indices = initial_codebook[thread_iter];20842085uint_vec local_to_global(cluster_indices.size());20862087for (uint32_t i = 0; i < cluster_indices.size(); i++)2088{2089const uint32_t global_training_vec_index = cluster_indices[i];2090local_to_global[i] = global_training_vec_index;20912092lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second);2093}20942095const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs();20962097success_flags[thread_iter] = lq.generate(max_clusters);20982099if (success_flags[thread_iter])2100{2101lq.retrieve(local_clusters[thread_iter]);21022103for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++)2104{2105for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++)2106local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]];2107}21082109if (max_parent_codebook_size)2110{2111lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]);21122113for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++)2114{2115for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++)2116local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]];2117}2118}2119}21202121} );21222123} // thread_iter21242125pJob_pool->wait_for_all();21262127uint32_t total_clusters = 0, total_parent_clusters = 0;21282129for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++)2130{2131if (!success_flags[thread_iter])2132return false;2133total_clusters += (uint32_t)local_clusters[thread_iter].size();2134total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size();2135}21362137codebook.reserve(total_clusters);2138parent_codebook.reserve(total_parent_clusters);21392140for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)2141{2142for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++)2143{2144codebook.resize(codebook.size() + 1);2145codebook.back().swap(local_clusters[thread_iter][j]);2146}21472148for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++)2149{2150parent_codebook.resize(parent_codebook.size() + 1);2151parent_codebook.back().swap(local_parent_clusters[thread_iter][j]);2152}2153}21542155return true;2156}21572158template<typename Quantizer>2159bool generate_hierarchical_codebook_threaded(Quantizer& q,2160uint32_t max_codebook_size, uint32_t max_parent_codebook_size,2161basisu::vector<uint_vec>& codebook,2162basisu::vector<uint_vec>& parent_codebook,2163uint32_t max_threads, job_pool *pJob_pool,2164bool even_odd_input_pairs_equal)2165{2166// rg 6/24/2025 - Cross platform determinism2167#if 02168typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;2169typedef std::unordered_map < typename Quantizer::training_vec_type, weighted_block_group,2170training_vec_bit_hasher> group_hash;2171#else2172typedef std::map< typename Quantizer::training_vec_type, weighted_block_group > group_hash;2173#endif21742175//interval_timer tm;2176//tm.start();21772178group_hash unique_vecs;21792180// rg 6/24/2025 - Cross platform determinism2181#if 02182unique_vecs.reserve(20000);2183#endif21842185weighted_block_group g;21862187if (even_odd_input_pairs_equal)2188{2189g.m_indices.resize(2);21902191assert(q.get_training_vecs().size() >= 2 && (q.get_training_vecs().size() & 1) == 0);21922193for (uint32_t i = 0; i < q.get_training_vecs().size(); i += 2)2194{2195assert(q.get_training_vecs()[i].first == q.get_training_vecs()[i + 1].first);21962197g.m_total_weight = q.get_training_vecs()[i].second + q.get_training_vecs()[i + 1].second;2198g.m_indices[0] = i;2199g.m_indices[1] = i + 1;22002201auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));22022203if (!ins_res.second)2204{2205(ins_res.first)->second.m_total_weight += g.m_total_weight;2206(ins_res.first)->second.m_indices.push_back(i);2207(ins_res.first)->second.m_indices.push_back(i + 1);2208}2209}2210}2211else2212{2213g.m_indices.resize(1);22142215for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)2216{2217g.m_total_weight = q.get_training_vecs()[i].second;2218g.m_indices[0] = i;22192220auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));22212222if (!ins_res.second)2223{2224(ins_res.first)->second.m_total_weight += g.m_total_weight;2225(ins_res.first)->second.m_indices.push_back(i);2226}2227}2228}22292230//debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors, %3.3f secs\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size(), tm.get_elapsed_secs());2231debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size());22322233Quantizer group_quant;2234typedef typename group_hash::const_iterator group_hash_const_iter;2235basisu::vector<group_hash_const_iter> unique_vec_iters;2236unique_vec_iters.reserve(unique_vecs.size());22372238for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)2239{2240group_quant.add_training_vec(iter->first, iter->second.m_total_weight);2241unique_vec_iters.push_back(iter);2242}22432244bool limit_clusterizers = true;2245if (unique_vecs.size() <= max_codebook_size)2246limit_clusterizers = false;22472248debug_printf("Limit clusterizers: %u\n", limit_clusterizers);22492250basisu::vector<uint_vec> group_codebook, group_parent_codebook;2251bool status = generate_hierarchical_codebook_threaded_internal(group_quant,2252max_codebook_size, max_parent_codebook_size,2253group_codebook,2254group_parent_codebook,2255(unique_vecs.size() < 65536*4) ? 1 : max_threads, limit_clusterizers, pJob_pool);22562257if (!status)2258return false;22592260codebook.resize(0);2261for (uint32_t i = 0; i < group_codebook.size(); i++)2262{2263codebook.resize(codebook.size() + 1);22642265for (uint32_t j = 0; j < group_codebook[i].size(); j++)2266{2267const uint32_t group_index = group_codebook[i][j];22682269typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];2270const uint_vec& training_vec_indices = group_iter->second.m_indices;22712272append_vector(codebook.back(), training_vec_indices);2273}2274}22752276parent_codebook.resize(0);2277for (uint32_t i = 0; i < group_parent_codebook.size(); i++)2278{2279parent_codebook.resize(parent_codebook.size() + 1);22802281for (uint32_t j = 0; j < group_parent_codebook[i].size(); j++)2282{2283const uint32_t group_index = group_parent_codebook[i][j];22842285typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];2286const uint_vec& training_vec_indices = group_iter->second.m_indices;22872288append_vector(parent_codebook.back(), training_vec_indices);2289}2290}22912292return true;2293}22942295// Canonical Huffman coding22962297class histogram2298{2299basisu::vector<uint32_t> m_hist;23002301public:2302histogram(uint32_t size = 0) { init(size); }23032304void clear()2305{2306clear_vector(m_hist);2307}23082309void init(uint32_t size)2310{2311m_hist.resize(0);2312m_hist.resize(size);2313}23142315inline uint32_t size() const { return static_cast<uint32_t>(m_hist.size()); }23162317inline const uint32_t &operator[] (uint32_t index) const2318{2319return m_hist[index];2320}23212322inline uint32_t &operator[] (uint32_t index)2323{2324return m_hist[index];2325}23262327inline void inc(uint32_t index)2328{2329m_hist[index]++;2330}23312332uint64_t get_total() const2333{2334uint64_t total = 0;2335for (uint32_t i = 0; i < m_hist.size(); ++i)2336total += m_hist[i];2337return total;2338}23392340double get_entropy() const2341{2342double total = static_cast<double>(get_total());2343if (total == 0.0f)2344return 0.0f;23452346const double inv_total = 1.0f / total;2347const double neg_inv_log2 = -1.0f / log(2.0f);23482349double e = 0.0f;2350for (uint32_t i = 0; i < m_hist.size(); i++)2351if (m_hist[i])2352e += log(m_hist[i] * inv_total) * neg_inv_log2 * static_cast<double>(m_hist[i]);23532354return e;2355}2356};23572358struct sym_freq2359{2360uint32_t m_key;2361uint16_t m_sym_index;2362};23632364sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1);2365void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms);2366void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size);23672368class huffman_encoding_table2369{2370public:2371huffman_encoding_table()2372{2373}23742375void clear()2376{2377clear_vector(m_codes);2378clear_vector(m_code_sizes);2379}23802381bool init(const histogram &h, uint32_t max_code_size = cHuffmanMaxSupportedCodeSize)2382{2383return init(h.size(), &h[0], max_code_size);2384}23852386bool init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size);2387bool init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size);23882389inline const uint16_vec &get_codes() const { return m_codes; }2390inline const uint8_vec &get_code_sizes() const { return m_code_sizes; }23912392uint32_t get_total_used_codes() const2393{2394for (int i = static_cast<int>(m_code_sizes.size()) - 1; i >= 0; i--)2395if (m_code_sizes[i])2396return i + 1;2397return 0;2398}23992400private:2401uint16_vec m_codes;2402uint8_vec m_code_sizes;2403};24042405class bitwise_coder2406{2407public:2408bitwise_coder() :2409m_bit_buffer(0),2410m_bit_buffer_size(0),2411m_total_bits(0)2412{2413}24142415bitwise_coder(const bitwise_coder& other) :2416m_bytes(other.m_bytes),2417m_bit_buffer(other.m_bit_buffer),2418m_bit_buffer_size(other.m_bit_buffer_size),2419m_total_bits(other.m_total_bits)2420{2421}24222423bitwise_coder(bitwise_coder&& other) :2424m_bytes(std::move(other.m_bytes)),2425m_bit_buffer(other.m_bit_buffer),2426m_bit_buffer_size(other.m_bit_buffer_size),2427m_total_bits(other.m_total_bits)2428{2429}24302431bitwise_coder& operator= (const bitwise_coder& rhs)2432{2433if (this == &rhs)2434return *this;24352436m_bytes = rhs.m_bytes;2437m_bit_buffer = rhs.m_bit_buffer;2438m_bit_buffer_size = rhs.m_bit_buffer_size;2439m_total_bits = rhs.m_total_bits;24402441return *this;2442}24432444bitwise_coder& operator= (bitwise_coder&& rhs)2445{2446if (this == &rhs)2447return *this;24482449m_bytes = std::move(rhs.m_bytes);2450m_bit_buffer = rhs.m_bit_buffer;2451m_bit_buffer_size = rhs.m_bit_buffer_size;2452m_total_bits = rhs.m_total_bits;24532454return *this;2455}24562457inline void clear()2458{2459clear_vector(m_bytes);2460m_bit_buffer = 0;2461m_bit_buffer_size = 0;2462m_total_bits = 0;2463}24642465inline void restart()2466{2467m_bytes.resize(0);2468m_bit_buffer = 0;2469m_bit_buffer_size = 0;2470m_total_bits = 0;2471}24722473inline const uint8_vec &get_bytes() const { return m_bytes; }2474inline uint8_vec& get_bytes() { return m_bytes; }24752476inline void reserve(uint32_t size) { m_bytes.reserve(size); }24772478inline uint64_t get_total_bits() const { return m_total_bits; }2479inline uint32_t get_total_bits_u32() const { assert(m_total_bits <= UINT32_MAX); return static_cast<uint32_t>(m_total_bits); }2480inline void clear_total_bits() { m_total_bits = 0; }24812482inline void init(uint32_t reserve_size = 1024)2483{2484m_bytes.reserve(reserve_size);2485m_bytes.resize(0);24862487m_bit_buffer = 0;2488m_bit_buffer_size = 0;2489m_total_bits = 0;2490}24912492inline uint32_t flush()2493{2494if (m_bit_buffer_size)2495{2496m_total_bits += 8 - (m_bit_buffer_size & 7);2497append_byte(static_cast<uint8_t>(m_bit_buffer));24982499m_bit_buffer = 0;2500m_bit_buffer_size = 0;25012502return 8;2503}25042505return 0;2506}25072508inline uint32_t put_bits(uint32_t bits, uint32_t num_bits)2509{2510assert(num_bits <= 32);2511assert(bits < (1ULL << num_bits));25122513if (!num_bits)2514return 0;25152516m_total_bits += num_bits;25172518uint64_t v = (static_cast<uint64_t>(bits) << m_bit_buffer_size) | m_bit_buffer;2519m_bit_buffer_size += num_bits;25202521while (m_bit_buffer_size >= 8)2522{2523append_byte(static_cast<uint8_t>(v));2524v >>= 8;2525m_bit_buffer_size -= 8;2526}25272528m_bit_buffer = static_cast<uint8_t>(v);2529return num_bits;2530}25312532inline uint32_t put_code(uint32_t sym, const huffman_encoding_table &tab)2533{2534uint32_t code = tab.get_codes()[sym];2535uint32_t code_size = tab.get_code_sizes()[sym];2536assert(code_size >= 1);2537put_bits(code, code_size);2538return code_size;2539}25402541inline uint32_t put_truncated_binary(uint32_t v, uint32_t n)2542{2543assert((n >= 2) && (v < n));25442545uint32_t k = floor_log2i(n);2546uint32_t u = (1 << (k + 1)) - n;25472548if (v < u)2549return put_bits(v, k);25502551uint32_t x = v + u;2552assert((x >> 1) >= u);25532554put_bits(x >> 1, k);2555put_bits(x & 1, 1);2556return k + 1;2557}25582559inline uint32_t put_rice(uint32_t v, uint32_t m)2560{2561assert(m);25622563const uint64_t start_bits = m_total_bits;25642565uint32_t q = v >> m, r = v & ((1 << m) - 1);25662567// rice coding sanity check2568assert(q <= 64);25692570for (; q > 16; q -= 16)2571put_bits(0xFFFF, 16);25722573put_bits((1 << q) - 1, q);2574put_bits(r << 1, m + 1);25752576return (uint32_t)(m_total_bits - start_bits);2577}25782579inline uint32_t put_vlc(uint32_t v, uint32_t chunk_bits)2580{2581assert(chunk_bits);25822583const uint32_t chunk_size = 1 << chunk_bits;2584const uint32_t chunk_mask = chunk_size - 1;25852586uint32_t total_bits = 0;25872588for ( ; ; )2589{2590uint32_t next_v = v >> chunk_bits;25912592total_bits += put_bits((v & chunk_mask) | (next_v ? chunk_size : 0), chunk_bits + 1);2593if (!next_v)2594break;25952596v = next_v;2597}25982599return total_bits;2600}26012602uint32_t emit_huffman_table(const huffman_encoding_table &tab);26032604void append(const bitwise_coder& other)2605{2606for (uint32_t i = 0; i < other.m_bytes.size(); i++)2607put_bits(other.m_bytes[i], 8);26082609if (other.m_bit_buffer_size)2610put_bits(other.m_bit_buffer, other.m_bit_buffer_size);2611}26122613private:2614uint8_vec m_bytes;2615uint32_t m_bit_buffer, m_bit_buffer_size;2616uint64_t m_total_bits;26172618inline void append_byte(uint8_t c)2619{2620//m_bytes.resize(m_bytes.size() + 1);2621//m_bytes.back() = c;26222623m_bytes.push_back(c);2624}26252626static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len);2627static void end_zero_run(uint16_vec &syms, uint32_t &run_size);2628};26292630class huff2D2631{2632public:2633huff2D() { }2634huff2D(uint32_t bits_per_sym, uint32_t total_syms_per_group) { init(bits_per_sym, total_syms_per_group); }26352636inline const histogram &get_histogram() const { return m_histogram; }2637inline const huffman_encoding_table &get_encoding_table() const { return m_encoding_table; }26382639inline void init(uint32_t bits_per_sym, uint32_t total_syms_per_group)2640{2641assert((bits_per_sym * total_syms_per_group) <= 16 && total_syms_per_group >= 1 && bits_per_sym >= 1);26422643m_bits_per_sym = bits_per_sym;2644m_total_syms_per_group = total_syms_per_group;2645m_cur_sym_bits = 0;2646m_cur_num_syms = 0;2647m_decode_syms_remaining = 0;2648m_next_decoder_group_index = 0;26492650m_histogram.init(1 << (bits_per_sym * total_syms_per_group));2651}26522653inline void clear()2654{2655m_group_bits.clear();26562657m_cur_sym_bits = 0;2658m_cur_num_syms = 0;2659m_decode_syms_remaining = 0;2660m_next_decoder_group_index = 0;2661}26622663inline void emit(uint32_t sym)2664{2665m_cur_sym_bits |= (sym << (m_cur_num_syms * m_bits_per_sym));2666m_cur_num_syms++;26672668if (m_cur_num_syms == m_total_syms_per_group)2669flush();2670}26712672inline void flush()2673{2674if (m_cur_num_syms)2675{2676m_group_bits.push_back(m_cur_sym_bits);2677m_histogram.inc(m_cur_sym_bits);26782679m_cur_sym_bits = 0;2680m_cur_num_syms = 0;2681}2682}26832684inline bool start_encoding(uint32_t code_size_limit = 16)2685{2686flush();26872688if (!m_encoding_table.init(m_histogram, code_size_limit))2689return false;26902691m_decode_syms_remaining = 0;2692m_next_decoder_group_index = 0;26932694return true;2695}26962697inline uint32_t emit_next_sym(bitwise_coder &c)2698{2699uint32_t bits = 0;27002701if (!m_decode_syms_remaining)2702{2703bits = c.put_code(m_group_bits[m_next_decoder_group_index++], m_encoding_table);2704m_decode_syms_remaining = m_total_syms_per_group;2705}27062707m_decode_syms_remaining--;2708return bits;2709}27102711inline void emit_flush()2712{2713m_decode_syms_remaining = 0;2714}27152716private:2717uint_vec m_group_bits;2718huffman_encoding_table m_encoding_table;2719histogram m_histogram;2720uint32_t m_bits_per_sym, m_total_syms_per_group, m_cur_sym_bits, m_cur_num_syms, m_next_decoder_group_index, m_decode_syms_remaining;2721};27222723bool huffman_test(int rand_seed);27242725// VQ index reordering27262727class palette_index_reorderer2728{2729public:2730palette_index_reorderer()2731{2732}27332734void clear()2735{2736clear_vector(m_hist);2737clear_vector(m_total_count_to_picked);2738clear_vector(m_entries_picked);2739clear_vector(m_entries_to_do);2740clear_vector(m_remap_table);2741}27422743// returns [0,1] distance of entry i to entry j2744typedef float(*pEntry_dist_func)(uint32_t i, uint32_t j, void *pCtx);27452746void init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);27472748// Table remaps old to new symbol indices2749inline const uint_vec &get_remap_table() const { return m_remap_table; }27502751private:2752uint_vec m_hist, m_total_count_to_picked, m_entries_picked, m_entries_to_do, m_remap_table;27532754inline uint32_t get_hist(int i, int j, int n) const { return (i > j) ? m_hist[j * n + i] : m_hist[i * n + j]; }2755inline void inc_hist(int i, int j, int n) { if ((i != j) && (i < j) && (i != -1) && (j != -1)) { assert(((uint32_t)i < (uint32_t)n) && ((uint32_t)j < (uint32_t)n)); m_hist[i * n + j]++; } }27562757void prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices);2758void find_initial(uint32_t num_syms);2759void find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);2760float pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);2761};27622763// Simple 32-bit 2D image class27642765class image2766{2767public:2768image() :2769m_width(0), m_height(0), m_pitch(0)2770{2771}27722773image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) :2774m_width(0), m_height(0), m_pitch(0)2775{2776resize(w, h, p);2777}27782779image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) :2780m_width(0), m_height(0), m_pitch(0)2781{2782init(pImage, width, height, comps);2783}27842785image(const image &other) :2786m_width(0), m_height(0), m_pitch(0)2787{2788*this = other;2789}27902791image(image&& other) :2792m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),2793m_pixels(std::move(other.m_pixels))2794{2795other.m_width = 0;2796other.m_height = 0;2797other.m_pitch = 0;2798}27992800image& operator= (image&& rhs)2801{2802if (this != &rhs)2803{2804m_width = rhs.m_width;2805m_height = rhs.m_height;2806m_pitch = rhs.m_pitch;2807m_pixels = std::move(rhs.m_pixels);28082809rhs.m_width = 0;2810rhs.m_height = 0;2811rhs.m_pitch = 0;2812}2813return *this;2814}28152816image &swap(image &other)2817{2818std::swap(m_width, other.m_width);2819std::swap(m_height, other.m_height);2820std::swap(m_pitch, other.m_pitch);2821m_pixels.swap(other.m_pixels);2822return *this;2823}28242825image &operator= (const image &rhs)2826{2827if (this != &rhs)2828{2829m_width = rhs.m_width;2830m_height = rhs.m_height;2831m_pitch = rhs.m_pitch;2832m_pixels = rhs.m_pixels;2833}2834return *this;2835}28362837image &clear()2838{2839m_width = 0;2840m_height = 0;2841m_pitch = 0;2842clear_vector(m_pixels);2843return *this;2844}28452846image& match_dimensions(const image& other)2847{2848resize(other.get_width(), other.get_height());2849return *this;2850}28512852image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color)2853{2854return crop(w, h, p, background);2855}28562857image &set_all(const color_rgba &c)2858{2859for (uint32_t i = 0; i < m_pixels.size(); i++)2860m_pixels[i] = c;2861return *this;2862}28632864void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps)2865{2866assert(comps >= 1 && comps <= 4);28672868resize(width, height);28692870for (uint32_t y = 0; y < height; y++)2871{2872for (uint32_t x = 0; x < width; x++)2873{2874const uint8_t *pSrc = &pImage[(x + y * width) * comps];2875color_rgba &dst = (*this)(x, y);28762877if (comps == 1)2878{2879dst.r = pSrc[0];2880dst.g = pSrc[0];2881dst.b = pSrc[0];2882dst.a = 255;2883}2884else if (comps == 2)2885{2886dst.r = pSrc[0];2887dst.g = pSrc[0];2888dst.b = pSrc[0];2889dst.a = pSrc[1];2890}2891else2892{2893dst.r = pSrc[0];2894dst.g = pSrc[1];2895dst.b = pSrc[2];2896if (comps == 4)2897dst.a = pSrc[3];2898else2899dst.a = 255;2900}2901}2902}2903}29042905image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c)2906{2907for (uint32_t iy = 0; iy < h; iy++)2908for (uint32_t ix = 0; ix < w; ix++)2909set_clipped(x + ix, y + iy, c);2910return *this;2911}29122913image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c)2914{2915for (uint32_t iy = 0; iy < h; iy++)2916for (uint32_t ix = 0; ix < w; ix++)2917set_clipped_alpha(x + ix, y + iy, c);2918return *this;2919}29202921image &crop_dup_borders(uint32_t w, uint32_t h)2922{2923const uint32_t orig_w = m_width, orig_h = m_height;29242925crop(w, h);29262927if (orig_w && orig_h)2928{2929if (m_width > orig_w)2930{2931for (uint32_t x = orig_w; x < m_width; x++)2932for (uint32_t y = 0; y < m_height; y++)2933set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));2934}29352936if (m_height > orig_h)2937{2938for (uint32_t y = orig_h; y < m_height; y++)2939for (uint32_t x = 0; x < m_width; x++)2940set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));2941}2942}2943return *this;2944}29452946// pPixels MUST have been allocated using malloc() (basisu::vector will eventually use free() on the pointer).2947image& grant_ownership(color_rgba* pPixels, uint32_t w, uint32_t h, uint32_t p = UINT32_MAX)2948{2949if (p == UINT32_MAX)2950p = w;29512952clear();29532954if ((!p) || (!w) || (!h))2955return *this;29562957m_pixels.grant_ownership(pPixels, p * h, p * h);29582959m_width = w;2960m_height = h;2961m_pitch = p;29622963return *this;2964}29652966image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color, bool init_image = true)2967{2968if (p == UINT32_MAX)2969p = w;29702971if ((w == m_width) && (m_height == h) && (m_pitch == p))2972return *this;29732974if ((!w) || (!h) || (!p))2975{2976clear();2977return *this;2978}29792980color_rgba_vec cur_state;2981cur_state.swap(m_pixels);29822983m_pixels.resize(p * h);29842985if (init_image)2986{2987if (m_width || m_height)2988{2989for (uint32_t y = 0; y < h; y++)2990{2991for (uint32_t x = 0; x < w; x++)2992{2993if ((x < m_width) && (y < m_height))2994m_pixels[x + y * p] = cur_state[x + y * m_pitch];2995else2996m_pixels[x + y * p] = background;2997}2998}2999}3000else3001{3002m_pixels.set_all(background);3003}3004}30053006m_width = w;3007m_height = h;3008m_pitch = p;30093010return *this;3011}30123013inline const color_rgba &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }3014inline color_rgba &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }30153016inline const color_rgba &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }3017inline color_rgba &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }30183019inline const color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const3020{3021x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);3022y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);3023return m_pixels[x + y * m_pitch];3024}30253026inline color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)3027{3028x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);3029y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);3030return m_pixels[x + y * m_pitch];3031}30323033inline image &set_clipped(int x, int y, const color_rgba &c)3034{3035if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))3036(*this)(x, y) = c;3037return *this;3038}30393040inline image& set_clipped_alpha(int x, int y, const color_rgba& c)3041{3042if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))3043(*this)(x, y).m_comps[3] = c.m_comps[3];3044return *this;3045}30463047// Very straightforward blit with full clipping. Not fast, but it works.3048image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)3049{3050for (int y = 0; y < src_h; y++)3051{3052const int sy = src_y + y;3053if (sy < 0)3054continue;3055else if (sy >= (int)src.get_height())3056break;30573058for (int x = 0; x < src_w; x++)3059{3060const int sx = src_x + x;3061if (sx < 0)3062continue;3063else if (sx >= (int)src.get_width())3064break;30653066set_clipped(dst_x + x, dst_y + y, src(sx, sy));3067}3068}30693070return *this;3071}30723073const image &extract_block_clamped(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const3074{3075if (((src_x + w) > m_width) || ((src_y + h) > m_height))3076{3077// Slower clamping case3078for (uint32_t y = 0; y < h; y++)3079for (uint32_t x = 0; x < w; x++)3080*pDst++ = get_clamped(src_x + x, src_y + y);3081}3082else3083{3084const color_rgba* pSrc = &m_pixels[src_x + src_y * m_pitch];30853086for (uint32_t y = 0; y < h; y++)3087{3088memcpy(pDst, pSrc, w * sizeof(color_rgba));3089pSrc += m_pitch;3090pDst += w;3091}3092}30933094return *this;3095}30963097image &set_block_clipped(const color_rgba *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)3098{3099for (uint32_t y = 0; y < h; y++)3100for (uint32_t x = 0; x < w; x++)3101set_clipped(dst_x + x, dst_y + y, *pSrc++);3102return *this;3103}31043105inline bool is_valid() const { return m_width > 0; }31063107inline uint32_t get_width() const { return m_width; }3108inline uint32_t get_height() const { return m_height; }3109inline uint32_t get_pitch() const { return m_pitch; }3110inline uint32_t get_total_pixels() const { return m_width * m_height; }31113112inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }3113inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }3114inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }31153116inline const color_rgba_vec &get_pixels() const { return m_pixels; }3117inline color_rgba_vec &get_pixels() { return m_pixels; }31183119inline const color_rgba *get_ptr() const { return &m_pixels[0]; }3120inline color_rgba *get_ptr() { return &m_pixels[0]; }31213122bool has_alpha(uint32_t channel = 3) const3123{3124for (uint32_t y = 0; y < m_height; ++y)3125for (uint32_t x = 0; x < m_width; ++x)3126if ((*this)(x, y)[channel] < 255)3127return true;31283129return false;3130}31313132image &set_alpha(uint8_t a)3133{3134for (uint32_t y = 0; y < m_height; ++y)3135for (uint32_t x = 0; x < m_width; ++x)3136(*this)(x, y).a = a;3137return *this;3138}31393140image &flip_y()3141{3142for (uint32_t y = 0; y < m_height / 2; ++y)3143for (uint32_t x = 0; x < m_width; ++x)3144std::swap((*this)(x, y), (*this)(x, m_height - 1 - y));3145return *this;3146}31473148// TODO: There are many ways to do this, not sure this is the best way.3149image &renormalize_normal_map()3150{3151for (uint32_t y = 0; y < m_height; y++)3152{3153for (uint32_t x = 0; x < m_width; x++)3154{3155color_rgba &c = (*this)(x, y);3156if ((c.r == 128) && (c.g == 128) && (c.b == 128))3157continue;31583159vec3F v(c.r, c.g, c.b);3160v = (v * (2.0f / 255.0f)) - vec3F(1.0f);3161v.clamp(-1.0f, 1.0f);31623163float length = v.length();3164const float cValidThresh = .077f;3165if (length < cValidThresh)3166{3167c.set(128, 128, 128, c.a);3168}3169else if (fabs(length - 1.0f) > cValidThresh)3170{3171if (length)3172v /= length;31733174for (uint32_t i = 0; i < 3; i++)3175c[i] = static_cast<uint8_t>(clamp<float>(floor((v[i] + 1.0f) * 255.0f * .5f + .5f), 0.0f, 255.0f));31763177if ((c.g == 128) && (c.r == 128))3178{3179if (c.b < 128)3180c.b = 0;3181else3182c.b = 255;3183}3184}3185}3186}3187return *this;3188}31893190void swap_rb()3191{3192for (auto& v : m_pixels)3193std::swap(v.r, v.b);3194}31953196void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);31973198vec4F get_filtered_vec4F(float x, float y) const3199{3200x -= .5f;3201y -= .5f;32023203int ix = (int)floorf(x);3204int iy = (int)floorf(y);3205float wx = x - ix;3206float wy = y - iy;32073208color_rgba a(get_clamped(ix, iy));3209color_rgba b(get_clamped(ix + 1, iy));3210color_rgba c(get_clamped(ix, iy + 1));3211color_rgba d(get_clamped(ix + 1, iy + 1));32123213vec4F result;32143215for (uint32_t i = 0; i < 4; i++)3216{3217const float top = lerp<float>((float)a[i], (float)b[i], wx);3218const float bot = lerp<float>((float)c[i], (float)d[i], wx);3219const float m = lerp<float>((float)top, (float)bot, wy);32203221result[i] = m;3222}32233224return result;3225}32263227// (x,y) - Continuous coordinates, where pixel centers are at (.5,.5), valid image coords are [0,width] and [0,height]. Clamp addressing.3228color_rgba get_filtered(float x, float y) const3229{3230const vec4F fresult(get_filtered_vec4F(x, y));32313232color_rgba result;32333234for (uint32_t i = 0; i < 4; i++)3235result[i] = (uint8_t)clamp<int>((int)(fresult[i] + .5f), 0, 255);32363237return result;3238}32393240private:3241uint32_t m_width, m_height, m_pitch; // all in pixels3242color_rgba_vec m_pixels;3243};32443245// Float images32463247typedef basisu::vector<vec4F> vec4F_vec;32483249class imagef3250{3251public:3252imagef() :3253m_width(0), m_height(0), m_pitch(0)3254{3255}32563257imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) :3258m_width(0), m_height(0), m_pitch(0)3259{3260resize(w, h, p);3261}32623263imagef(const imagef &other) :3264m_width(0), m_height(0), m_pitch(0)3265{3266*this = other;3267}32683269imagef(imagef&& other) :3270m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),3271m_pixels(std::move(other.m_pixels))3272{3273other.m_width = 0;3274other.m_height = 0;3275other.m_pitch = 0;3276}32773278imagef& operator= (imagef&& rhs)3279{3280if (this != &rhs)3281{3282m_width = rhs.m_width;3283m_height = rhs.m_height;3284m_pitch = rhs.m_pitch;3285m_pixels = std::move(rhs.m_pixels);32863287rhs.m_width = 0;3288rhs.m_height = 0;3289rhs.m_pitch = 0;3290}3291return *this;3292}32933294imagef &swap(imagef &other)3295{3296std::swap(m_width, other.m_width);3297std::swap(m_height, other.m_height);3298std::swap(m_pitch, other.m_pitch);3299m_pixels.swap(other.m_pixels);3300return *this;3301}33023303imagef &operator= (const imagef &rhs)3304{3305if (this != &rhs)3306{3307m_width = rhs.m_width;3308m_height = rhs.m_height;3309m_pitch = rhs.m_pitch;3310m_pixels = rhs.m_pixels;3311}3312return *this;3313}33143315imagef &clear()3316{3317m_width = 0;3318m_height = 0;3319m_pitch = 0;3320clear_vector(m_pixels);3321return *this;3322}33233324imagef &set(const image &src, const vec4F &scale = vec4F(1), const vec4F &bias = vec4F(0))3325{3326const uint32_t width = src.get_width();3327const uint32_t height = src.get_height();33283329resize(width, height);33303331for (int y = 0; y < (int)height; y++)3332{3333for (uint32_t x = 0; x < width; x++)3334{3335const color_rgba &src_pixel = src(x, y);3336(*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]);3337}3338}33393340return *this;3341}33423343imagef& match_dimensions(const imagef& other)3344{3345resize(other.get_width(), other.get_height());3346return *this;3347}33483349imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))3350{3351return resize(other.get_width(), other.get_height(), p, background);3352}33533354imagef &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))3355{3356return crop(w, h, p, background);3357}33583359imagef &set_all(const vec4F &c)3360{3361for (uint32_t i = 0; i < m_pixels.size(); i++)3362m_pixels[i] = c;3363return *this;3364}33653366imagef &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F &c)3367{3368for (uint32_t iy = 0; iy < h; iy++)3369for (uint32_t ix = 0; ix < w; ix++)3370set_clipped(x + ix, y + iy, c);3371return *this;3372}33733374imagef &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F &background = vec4F(0,0,0,1))3375{3376if (p == UINT32_MAX)3377p = w;33783379if ((w == m_width) && (m_height == h) && (m_pitch == p))3380return *this;33813382if ((!w) || (!h) || (!p))3383{3384clear();3385return *this;3386}33873388vec4F_vec cur_state;3389cur_state.swap(m_pixels);33903391m_pixels.resize(p * h);33923393for (uint32_t y = 0; y < h; y++)3394{3395for (uint32_t x = 0; x < w; x++)3396{3397if ((x < m_width) && (y < m_height))3398m_pixels[x + y * p] = cur_state[x + y * m_pitch];3399else3400m_pixels[x + y * p] = background;3401}3402}34033404m_width = w;3405m_height = h;3406m_pitch = p;34073408return *this;3409}34103411imagef& crop_dup_borders(uint32_t w, uint32_t h)3412{3413const uint32_t orig_w = m_width, orig_h = m_height;34143415crop(w, h);34163417if (orig_w && orig_h)3418{3419if (m_width > orig_w)3420{3421for (uint32_t x = orig_w; x < m_width; x++)3422for (uint32_t y = 0; y < m_height; y++)3423set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));3424}34253426if (m_height > orig_h)3427{3428for (uint32_t y = orig_h; y < m_height; y++)3429for (uint32_t x = 0; x < m_width; x++)3430set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));3431}3432}3433return *this;3434}34353436inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }3437inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }34383439inline const vec4F &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }3440inline vec4F &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }34413442inline const vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const3443{3444x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);3445y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);3446return m_pixels[x + y * m_pitch];3447}34483449inline vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)3450{3451x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);3452y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);3453return m_pixels[x + y * m_pitch];3454}34553456inline imagef &set_clipped(int x, int y, const vec4F &c)3457{3458if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))3459(*this)(x, y) = c;3460return *this;3461}34623463// Very straightforward blit with full clipping. Not fast, but it works.3464imagef &blit(const imagef &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)3465{3466for (int y = 0; y < src_h; y++)3467{3468const int sy = src_y + y;3469if (sy < 0)3470continue;3471else if (sy >= (int)src.get_height())3472break;34733474for (int x = 0; x < src_w; x++)3475{3476const int sx = src_x + x;3477if (sx < 0)3478continue;3479else if (sx >= (int)src.get_width())3480break;34813482set_clipped(dst_x + x, dst_y + y, src(sx, sy));3483}3484}34853486return *this;3487}34883489const imagef &extract_block_clamped(vec4F *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const3490{3491for (uint32_t y = 0; y < h; y++)3492for (uint32_t x = 0; x < w; x++)3493*pDst++ = get_clamped(src_x + x, src_y + y);3494return *this;3495}34963497imagef &set_block_clipped(const vec4F *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)3498{3499for (uint32_t y = 0; y < h; y++)3500for (uint32_t x = 0; x < w; x++)3501set_clipped(dst_x + x, dst_y + y, *pSrc++);3502return *this;3503}35043505inline bool is_valid() const { return m_width > 0; }35063507inline uint32_t get_width() const { return m_width; }3508inline uint32_t get_height() const { return m_height; }3509inline uint32_t get_pitch() const { return m_pitch; }3510inline uint64_t get_total_pixels() const { return (uint64_t)m_width * m_height; }35113512inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }3513inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }3514inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }35153516inline const vec4F_vec &get_pixels() const { return m_pixels; }3517inline vec4F_vec &get_pixels() { return m_pixels; }35183519inline const vec4F *get_ptr() const { return &m_pixels[0]; }3520inline vec4F *get_ptr() { return &m_pixels[0]; }35213522bool clean_astc_hdr_pixels(float highest_mag)3523{3524bool status = true;3525bool nan_msg = false;3526bool inf_msg = false;3527bool neg_zero_msg = false;3528bool neg_msg = false;3529bool clamp_msg = false;35303531for (uint32_t iy = 0; iy < m_height; iy++)3532{3533for (uint32_t ix = 0; ix < m_width; ix++)3534{3535vec4F& c = (*this)(ix, iy);35363537for (uint32_t s = 0; s < 4; s++)3538{3539float &p = c[s];3540union { float f; uint32_t u; } x; x.f = p;35413542if ((std::isnan(p)) || (std::isinf(p)) || (x.u == 0x80000000))3543{3544if (std::isnan(p))3545{3546if (!nan_msg)3547{3548fprintf(stderr, "One or more input pixels was NaN, setting to 0.\n");3549nan_msg = true;3550}3551}35523553if (std::isinf(p))3554{3555if (!inf_msg)3556{3557fprintf(stderr, "One or more input pixels was INF, setting to 0.\n");3558inf_msg = true;3559}3560}35613562if (x.u == 0x80000000)3563{3564if (!neg_zero_msg)3565{3566fprintf(stderr, "One or more input pixels was -0, setting them to 0.\n");3567neg_zero_msg = true;3568}3569}35703571p = 0.0f;3572status = false;3573}3574else3575{3576//const float o = p;3577if (p < 0.0f)3578{3579p = 0.0f;35803581if (!neg_msg)3582{3583fprintf(stderr, "One or more input pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n");3584neg_msg = true;3585}35863587status = false;3588}35893590if (p > highest_mag)3591{3592p = highest_mag;35933594if (!clamp_msg)3595{3596fprintf(stderr, "One or more input pixels had to be clamped to %f.\n", highest_mag);3597clamp_msg = true;3598}35993600status = false;3601}3602}3603}3604}3605}36063607return status;3608}36093610imagef& flip_y()3611{3612for (uint32_t y = 0; y < m_height / 2; ++y)3613for (uint32_t x = 0; x < m_width; ++x)3614std::swap((*this)(x, y), (*this)(x, m_height - 1 - y));36153616return *this;3617}36183619bool has_alpha(uint32_t channel = 3) const3620{3621for (uint32_t y = 0; y < m_height; ++y)3622for (uint32_t x = 0; x < m_width; ++x)3623if ((*this)(x, y)[channel] != 1.0f)3624return true;36253626return false;3627}36283629vec4F get_filtered_vec4F(float x, float y) const3630{3631x -= .5f;3632y -= .5f;36333634int ix = (int)floorf(x);3635int iy = (int)floorf(y);3636float wx = x - ix;3637float wy = y - iy;36383639vec4F a(get_clamped(ix, iy));3640vec4F b(get_clamped(ix + 1, iy));3641vec4F c(get_clamped(ix, iy + 1));3642vec4F d(get_clamped(ix + 1, iy + 1));36433644vec4F result;36453646for (uint32_t i = 0; i < 4; i++)3647{3648const float top = lerp<float>((float)a[i], (float)b[i], wx);3649const float bot = lerp<float>((float)c[i], (float)d[i], wx);3650const float m = lerp<float>((float)top, (float)bot, wy);36513652result[i] = m;3653}36543655return result;3656}36573658private:3659uint32_t m_width, m_height, m_pitch; // all in pixels3660vec4F_vec m_pixels;3661};36623663// REC 709 coefficients3664const float REC_709_R = 0.212656f, REC_709_G = 0.715158f, REC_709_B = 0.072186f;36653666inline float get_luminance(const vec4F &c)3667{3668return c[0] * REC_709_R + c[1] * REC_709_G + c[2] * REC_709_B;3669}36703671float linear_to_srgb(float l);3672float srgb_to_linear(float s);36733674class fast_linear_to_srgb3675{3676public:3677fast_linear_to_srgb()3678{3679init();3680}36813682void init()3683{3684for (int i = 0; i < LINEAR_TO_SRGB_TABLE_SIZE; ++i)3685{3686float l = (float)i * (1.0f / (LINEAR_TO_SRGB_TABLE_SIZE - 1));3687m_linear_to_srgb_table[i] = (uint8_t)basisu::fast_floorf_int(255.0f * basisu::linear_to_srgb(l));3688}36893690float srgb_to_linear[256];3691for (int i = 0; i < 256; i++)3692srgb_to_linear[i] = basisu::srgb_to_linear((float)i / 255.0f);36933694for (int i = 0; i < 256; i++)3695m_srgb_to_linear_thresh[i] = (srgb_to_linear[i] + srgb_to_linear[basisu::minimum<int>(i + 1, 255)]) * .5f;3696}36973698inline uint8_t convert(float l) const3699{3700assert((l >= 0.0f) && (l <= 1.0f));3701int j = basisu::fast_roundf_int((LINEAR_TO_SRGB_TABLE_SIZE - 1) * l);37023703assert((j >= 0) && (j < LINEAR_TO_SRGB_TABLE_SIZE));3704int b = m_linear_to_srgb_table[j];37053706b += (l > m_srgb_to_linear_thresh[b]);37073708return (uint8_t)b;3709}37103711private:3712static constexpr int LINEAR_TO_SRGB_TABLE_SIZE = 2048;3713uint8_t m_linear_to_srgb_table[LINEAR_TO_SRGB_TABLE_SIZE];37143715float m_srgb_to_linear_thresh[256];3716};37173718extern fast_linear_to_srgb g_fast_linear_to_srgb;37193720// Image metrics37213722class image_metrics3723{3724public:3725// TODO: Add ssim3726double m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim;3727bool m_has_neg, m_hf_mag_overflow, m_any_abnormal;37283729image_metrics()3730{3731clear();3732}37333734void clear()3735{3736m_max = 0;3737m_mean = 0;3738m_mean_squared = 0;3739m_rms = 0;3740m_psnr = 0;3741m_ssim = 0;3742m_has_neg = false;3743m_hf_mag_overflow = false;3744m_any_abnormal = false;3745}37463747void print(const char *pPrefix = nullptr) { printf("%sMax: %3.3f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr); }3748void print_hp(const char* pPrefix = nullptr) { printf("%sMax: %3.6f Mean: %3.6f RMS: %3.6f PSNR: %2.6f dB, Any Neg: %u, Half float overflow: %u, Any NaN/Inf: %u\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr, m_has_neg, m_hf_mag_overflow, m_any_abnormal); }37493750void calc(const imagef& a, const imagef& b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool log = false);3751void calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);3752void calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);3753void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false);3754};37553756void print_image_metrics(const image& a, const image& b);37573758// Image saving/loading/resampling37593760bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);3761bool load_png(const char* pFilename, image& img);3762inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }37633764bool load_tga(const char* pFilename, image& img);3765inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }37663767bool load_qoi(const char* pFilename, image& img);37683769bool load_jpg(const char *pFilename, image& img);3770bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img);3771inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }37723773// Currently loads .PNG, .TGA, or .JPG3774bool load_image(const char* pFilename, image& img);3775inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }37763777bool is_image_filename_hdr(const char* pFilename);37783779// Supports .HDR and most (but not all) .EXR's (see TinyEXR).3780bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);37813782inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f)3783{3784return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias);3785}37863787enum class hdr_image_type3788{3789cHITRGBAHalfFloat = 0,3790cHITRGBAFloat = 1,3791cHITPNGImage = 2,3792cHITEXRImage = 3,3793cHITHDRImage = 4,3794cHITJPGImage = 53795};37963797bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);37983799uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);3800uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);38013802struct rgbe_header_info3803{3804std::string m_program;38053806// Note no validation is done, either gamma or exposure may be 0.3807double m_gamma;3808bool m_has_gamma;38093810double m_exposure; // watts/steradian/m^2.3811bool m_has_exposure;38123813void clear()3814{3815m_program.clear();3816m_gamma = 1.0f;3817m_has_gamma = false;3818m_exposure = 1.0f;3819m_has_exposure = false;3820}3821};38223823bool read_rgbe(const uint8_vec& filedata, imagef& img, rgbe_header_info& hdr_info);3824bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info &hdr_info);38253826bool write_rgbe(uint8_vec& file_data, imagef& img, rgbe_header_info& hdr_info);3827bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info);38283829bool read_exr(const char* pFilename, imagef& img, int& n_chans);3830bool read_exr(const void* pMem, size_t mem_size, imagef& img);38313832enum3833{3834WRITE_EXR_LINEAR_HINT = 1, // hint for lossy comp. methods: exr_perceptual_treatment_t, logarithmic or linear, defaults to logarithmic3835WRITE_EXR_STORE_FLOATS = 2, // use 32-bit floats, otherwise it uses half floats3836WRITE_EXR_NO_COMPRESSION = 4 // no compression, otherwise it uses ZIP compression (16 scanlines per block)3837};38383839// Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images.3840bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags);38413842enum3843{3844cImageSaveGrayscale = 1,3845cImageSaveIgnoreAlpha = 23846};38473848bool save_png(const char* pFilename, const image& img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0);3849inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); }38503851bool read_file_to_vec(const char* pFilename, uint8_vec& data);3852bool read_file_to_data(const char* pFilename, void *pData, size_t len);38533854bool write_data_to_file(const char* pFilename, const void* pData, size_t len);38553856inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) { return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "", 0); }38573858bool image_resample(const image &src, image &dst, bool srgb = false,3859const char *pFilter = "lanczos4", float filter_scale = 1.0f,3860bool wrapping = false,3861uint32_t first_comp = 0, uint32_t num_comps = 4);38623863bool image_resample(const imagef& src, imagef& dst,3864const char* pFilter = "lanczos4", float filter_scale = 1.0f,3865bool wrapping = false,3866uint32_t first_comp = 0, uint32_t num_comps = 4);38673868// Timing38693870typedef uint64_t timer_ticks;38713872class interval_timer3873{3874public:3875interval_timer();38763877void start();3878void stop();38793880double get_elapsed_secs() const;3881inline double get_elapsed_ms() const { return 1000.0f* get_elapsed_secs(); }38823883static void init();3884static inline timer_ticks get_ticks_per_sec() { return g_freq; }3885static timer_ticks get_ticks();3886static double ticks_to_secs(timer_ticks ticks);3887static inline double ticks_to_ms(timer_ticks ticks) { return ticks_to_secs(ticks) * 1000.0f; }38883889private:3890static timer_ticks g_init_ticks, g_freq;3891static double g_timer_freq;38923893timer_ticks m_start_time, m_stop_time;38943895bool m_started, m_stopped;3896};38973898inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); }38993900inline FILE *fopen_safe(const char *pFilename, const char *pMode)3901{3902#ifdef _WIN323903FILE *pFile = nullptr;3904fopen_s(&pFile, pFilename, pMode);3905return pFile;3906#else3907return fopen(pFilename, pMode);3908#endif3909}39103911void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);39123913const uint32_t cPixelBlockWidth = 4;3914const uint32_t cPixelBlockHeight = 4;3915const uint32_t cPixelBlockTotalPixels = cPixelBlockWidth * cPixelBlockHeight;39163917struct pixel_block3918{3919color_rgba m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]39203921inline const color_rgba& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }3922inline color_rgba& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }39233924inline const color_rgba* get_ptr() const { return &m_pixels[0][0]; }3925inline color_rgba* get_ptr() { return &m_pixels[0][0]; }39263927inline void clear() { clear_obj(*this); }39283929inline bool operator== (const pixel_block& rhs) const3930{3931return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;3932}3933};3934typedef basisu::vector<pixel_block> pixel_block_vec;39353936struct pixel_block_hdr3937{3938vec4F m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]39393940inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }3941inline vec4F& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }39423943inline const vec4F* get_ptr() const { return &m_pixels[0][0]; }3944inline vec4F* get_ptr() { return &m_pixels[0][0]; }39453946inline void clear() { clear_obj(*this); }39473948inline bool operator== (const pixel_block& rhs) const3949{3950return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;3951}3952};3953typedef basisu::vector<pixel_block_hdr> pixel_block_hdr_vec;39543955void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure, bool add_noise = false, bool per_component = true, bool luma_scaling = false);3956bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img);3957bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img);39583959// Intersection3960enum eClear { cClear = 0 };3961enum eInitExpand { cInitExpand = 0 };3962enum eIdentity { cIdentity = 0 };39633964template<typename vector_type>3965class ray3966{3967public:3968typedef vector_type vector_t;3969typedef typename vector_type::scalar_type scalar_type;39703971inline ray() { }3972inline ray(eClear) { clear(); }3973inline ray(const vector_type& origin, const vector_type& direction) : m_origin(origin), m_direction(direction) { }39743975inline void clear()3976{3977m_origin.clear();3978m_direction.clear();3979}39803981inline const vector_type& get_origin(void) const { return m_origin; }3982inline void set_origin(const vector_type& origin) { m_origin = origin; }39833984inline const vector_type& get_direction(void) const { return m_direction; }3985inline void set_direction(const vector_type& direction) { m_direction = direction; }39863987inline void set_endpoints(const vector_type& start, const vector_type& end)3988{3989m_origin = start;39903991m_direction = end - start;3992m_direction.normalize_in_place();3993}39943995inline vector_type eval(scalar_type t) const3996{3997return m_origin + m_direction * t;3998}39994000private:4001vector_type m_origin;4002vector_type m_direction;4003};40044005typedef ray<vec2F> ray2F;4006typedef ray<vec3F> ray3F;40074008template<typename T>4009class vec_interval4010{4011public:4012enum { N = T::num_elements };4013typedef typename T::scalar_type scalar_type;40144015inline vec_interval(const T& v) { m_bounds[0] = v; m_bounds[1] = v; }4016inline vec_interval(const T& low, const T& high) { m_bounds[0] = low; m_bounds[1] = high; }40174018inline vec_interval() { }4019inline vec_interval(eClear) { clear(); }4020inline vec_interval(eInitExpand) { init_expand(); }40214022inline void clear() { m_bounds[0].clear(); m_bounds[1].clear(); }40234024inline void init_expand()4025{4026m_bounds[0].set(1e+30f, 1e+30f, 1e+30f);4027m_bounds[1].set(-1e+30f, -1e+30f, -1e+30f);4028}40294030inline vec_interval expand(const T& p)4031{4032for (uint32_t c = 0; c < N; c++)4033{4034if (p[c] < m_bounds[0][c])4035m_bounds[0][c] = p[c];40364037if (p[c] > m_bounds[1][c])4038m_bounds[1][c] = p[c];4039}40404041return *this;4042}40434044inline const T& operator[] (uint32_t i) const { assert(i < 2); return m_bounds[i]; }4045inline T& operator[] (uint32_t i) { assert(i < 2); return m_bounds[i]; }40464047const T& get_low() const { return m_bounds[0]; }4048T& get_low() { return m_bounds[0]; }40494050const T& get_high() const { return m_bounds[1]; }4051T& get_high() { return m_bounds[1]; }40524053scalar_type get_dim(uint32_t axis) const { return m_bounds[1][axis] - m_bounds[0][axis]; }40544055bool contains(const T& p) const4056{4057const T& low = get_low(), high = get_high();40584059for (uint32_t i = 0; i < N; i++)4060{4061if (p[i] < low[i])4062return false;40634064if (p[i] > high[i])4065return false;4066}4067return true;4068}40694070private:4071T m_bounds[2];4072};40734074typedef vec_interval<vec1F> vec_interval1F;4075typedef vec_interval<vec2F> vec_interval2F;4076typedef vec_interval<vec3F> vec_interval3F;4077typedef vec_interval<vec4F> vec_interval4F;40784079typedef vec_interval1F aabb1F;4080typedef vec_interval2F aabb2F;4081typedef vec_interval3F aabb3F;40824083namespace intersection4084{4085enum result4086{4087cBackfacing = -1,4088cFailure = 0,4089cSuccess,4090cParallel,4091cInside,4092};40934094// Returns cInside, cSuccess, or cFailure.4095// Algorithm: Graphics Gems 14096template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>4097result ray_aabb(vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box)4098{4099enum4100{4101cNumDim = vector_type::num_elements,4102cRight = 0,4103cLeft = 1,4104cMiddle = 24105};41064107bool inside = true;4108int quadrant[cNumDim];4109scalar_type candidate_plane[cNumDim];41104111for (int i = 0; i < cNumDim; i++)4112{4113if (ray.get_origin()[i] < box[0][i])4114{4115quadrant[i] = cLeft;4116candidate_plane[i] = box[0][i];4117inside = false;4118}4119else if (ray.get_origin()[i] > box[1][i])4120{4121quadrant[i] = cRight;4122candidate_plane[i] = box[1][i];4123inside = false;4124}4125else4126{4127quadrant[i] = cMiddle;4128}4129}41304131if (inside)4132{4133coord = ray.get_origin();4134t = 0.0f;4135return cInside;4136}41374138scalar_type max_t[cNumDim];4139for (int i = 0; i < cNumDim; i++)4140{4141if ((quadrant[i] != cMiddle) && (ray.get_direction()[i] != 0.0f))4142max_t[i] = (candidate_plane[i] - ray.get_origin()[i]) / ray.get_direction()[i];4143else4144max_t[i] = -1.0f;4145}41464147int which_plane = 0;4148for (int i = 1; i < cNumDim; i++)4149if (max_t[which_plane] < max_t[i])4150which_plane = i;41514152if (max_t[which_plane] < 0.0f)4153return cFailure;41544155for (int i = 0; i < cNumDim; i++)4156{4157if (i != which_plane)4158{4159coord[i] = ray.get_origin()[i] + max_t[which_plane] * ray.get_direction()[i];41604161if ((coord[i] < box[0][i]) || (coord[i] > box[1][i]))4162return cFailure;4163}4164else4165{4166coord[i] = candidate_plane[i];4167}41684169assert(coord[i] >= box[0][i] && coord[i] <= box[1][i]);4170}41714172t = max_t[which_plane];4173return cSuccess;4174}41754176template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>4177result ray_aabb(bool& started_within, vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box)4178{4179if (!box.contains(ray.get_origin()))4180{4181started_within = false;4182return ray_aabb(coord, t, ray, box);4183}41844185started_within = true;41864187typename vector_type::T diag_dist = box.diagonal_length() * 1.5f;4188ray_type outside_ray(ray.eval(diag_dist), -ray.get_direction());41894190result res(ray_aabb(coord, t, outside_ray, box));4191if (res != cSuccess)4192return res;41934194t = basisu::maximum(0.0f, diag_dist - t);4195return cSuccess;4196}41974198} // intersect41994200// This float->half conversion matches how "F32TO16" works on Intel GPU's.4201// Input cannot be negative, Inf or Nan.4202inline basist::half_float float_to_half_non_neg_no_nan_inf(float val)4203{4204union { float f; int32_t i; uint32_t u; } fi = { val };4205const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF;4206int e = 0, m = 0;42074208assert(((fi.i >> 31) == 0) && (flt_e != 0xFF));42094210// not zero or denormal4211if (flt_e != 0)4212{4213int new_exp = flt_e - 127;4214if (new_exp > 15)4215e = 31;4216else if (new_exp < -14)4217m = lrintf((1 << 24) * fabsf(fi.f));4218else4219{4220e = new_exp + 15;4221m = lrintf(flt_m * (1.0f / ((float)(1 << 13))));4222}4223}42244225assert((0 <= m) && (m <= 1024));4226if (m == 1024)4227{4228e++;4229m = 0;4230}42314232assert((e >= 0) && (e <= 31));4233assert((m >= 0) && (m <= 1023));42344235basist::half_float result = (basist::half_float)((e << 10) | m);4236return result;4237}42384239union fu324240{4241uint32_t u;4242float f;4243};42444245// Supports positive and denormals only. No NaN or Inf.4246BASISU_FORCE_INLINE float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h)4247{4248assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h));42494250// add 112 to the exponent (112+half float's exp bias of 15=float32's bias of 127)4251static const fu32 K = { 0x77800000 };42524253fu32 o;4254o.u = h << 13;4255o.f *= K.f;42564257return o.f;4258}42594260// Positive, negative, or denormals. No NaN or Inf. Clamped to MAX_HALF_FLOAT.4261inline basist::half_float fast_float_to_half_trunc_no_nan_or_inf(float f)4262{4263assert(!isnan(f) && !isinf(f));42644265// Sutract 112 from the exponent, to change the bias from 127 to 15.4266static const fu32 g_f_to_h{ 0x7800000 };42674268fu32 fu;42694270fu.f = minimum<float>((float)basist::MAX_HALF_FLOAT, fabsf(f)) * g_f_to_h.f;42714272return (basist::half_float)(((fu.u >> (23 - 10)) & 0x7FFF) | ((f < 0.0f) ? 0x8000 : 0));4273}42744275inline basist::half_float fast_float_to_half_trunc_no_clamp_neg_nan_or_inf(float f)4276{4277assert(!isnan(f) && !isinf(f));4278assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));42794280// Sutract 112 from the exponent, to change the bias from 127 to 15.4281static const fu32 g_f_to_h{ 0x7800000 };42824283fu32 fu;42844285fu.f = f * g_f_to_h.f;42864287return (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);4288}42894290inline basist::half_float fast_float_to_half_no_clamp_neg_nan_or_inf(float f)4291{4292assert(!isnan(f) && !isinf(f));4293assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));42944295// Sutract 112 from the exponent, to change the bias from 127 to 15.4296static const fu32 g_f_to_h{ 0x7800000 };42974298fu32 fu;42994300fu.f = f * g_f_to_h.f;43014302uint32_t h = (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);43034304// round to even or nearest4305uint32_t mant = fu.u & 8191; // examine lowest 13 bits4306uint32_t inc = (mant > 4096) | ((mant == 4096) & (h & 1));4307h += inc;43084309if (h > basist::MAX_HALF_FLOAT_AS_INT_BITS)4310h = basist::MAX_HALF_FLOAT_AS_INT_BITS;43114312return (basist::half_float)h;4313}43144315} // namespace basisu43164317#include "basisu_math.h"431843194320