Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/basisu_enc.h
21461 views
1
// basisu_enc.h
2
// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
//
8
// http://www.apache.org/licenses/LICENSE-2.0
9
//
10
// Unless required by applicable law or agreed to in writing, software
11
// distributed under the License is distributed on an "AS IS" BASIS,
12
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
// See the License for the specific language governing permissions and
14
// limitations under the License.
15
#pragma once
16
#include "../transcoder/basisu.h"
17
#include "../transcoder/basisu_transcoder_internal.h"
18
19
#include <mutex>
20
#include <atomic>
21
#include <condition_variable>
22
#include <functional>
23
#include <thread>
24
#include <unordered_map>
25
#include <map>
26
#include <ostream>
27
28
#if !defined(_WIN32) || defined(__MINGW32__)
29
#include <libgen.h>
30
#endif
31
32
// This module is really just a huge grab bag of classes and helper functions needed by the encoder.
33
34
// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.
35
#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)
36
37
#if BASISU_SUPPORT_SSE
38
// Declared in basisu_kernels_imp.h, but we can't include that here otherwise it would lead to circular type errors.
39
extern void update_covar_matrix_16x16_sse41(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16);
40
#endif
41
42
namespace basisu
43
{
44
extern uint8_t g_hamming_dist[256];
45
extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8];
46
47
// true if basisu_encoder_init() has been called and returned.
48
extern bool g_library_initialized;
49
50
// Encoder library initialization.
51
// This function MUST be called before encoding anything!
52
// Returns false if library initialization fails.
53
bool basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false);
54
void basisu_encoder_deinit();
55
56
// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
57
extern void detect_sse41();
58
59
#if BASISU_SUPPORT_SSE
60
extern bool g_cpu_supports_sse41;
61
#else
62
const bool g_cpu_supports_sse41 = false;
63
#endif
64
65
void error_vprintf(const char* pFmt, va_list args);
66
void error_printf(const char *pFmt, ...);
67
68
template <typename... Args>
69
inline void fmt_error_printf(const char* pFmt, Args&&... args)
70
{
71
std::string res;
72
if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward<Args>(args))... }))
73
return;
74
error_printf("%s", res.c_str());
75
}
76
77
void platform_sleep(uint32_t ms);
78
79
// Helpers
80
81
// Clamps a signed 32-bit value to [0, 255].
// If any bit above the low 8 is set the value is out of range: negative
// values (sign bit set) map to 0, values > 255 map to 255 via ~(i >> 31).
inline uint8_t clamp255(int32_t i)
{
	if ((i & 0xFFFFFF00U) == 0)
		return (uint8_t)i;
	return (uint8_t)(~(i >> 31));
}
85
86
// Left-shifts a signed 32-bit value without invoking signed-overflow UB,
// by performing the shift in the unsigned domain and casting back.
inline int left_shift32(int val, int shift)
{
	assert((shift >= 0) && (shift < 32));
	const uint32_t widened = static_cast<uint32_t>(val);
	return static_cast<int>(widened << shift);
}
91
92
// Unsigned counterpart of left_shift32(); shift amount must be in [0, 32).
inline uint32_t left_shift32(uint32_t val, int shift)
{
	assert((shift >= 0) && (shift < 32));
	const uint32_t result = val << shift;
	return result;
}
97
98
// Clamps value to the closed interval [low, high].
inline int32_t clampi(int32_t value, int32_t low, int32_t high)
{
	if (value < low)
		return low;
	if (value > high)
		return high;
	return value;
}
106
107
// Computes (v * a) / 255 with correct rounding using the classic
// "divide-by-255 via add-high-byte" trick; inputs expected in [0, 255].
inline uint8_t mul_8(uint32_t v, uint32_t a)
{
	const uint32_t prod = v * a + 128;
	return (uint8_t)((prod + (prod >> 8)) >> 8);
}
112
113
// Rounds half away from zero (e.g. 2.5f -> 3, -2.5f -> -3); result must fit in int.
inline int fast_roundf_int(float x)
{
	const float biased = (x >= 0.0f) ? (x + 0.5f) : (x - 0.5f);
	return (int)biased;
}
117
118
// Fast floor() to int; input's truncation must fit in an int.
inline int fast_floorf_int(float x)
{
	const int trunc = (int)x; // rounds toward zero
	if (x >= 0.0f)
		return trunc;
	// Negative non-integers truncated toward zero land one too high.
	return ((float)trunc == x) ? trunc : (trunc - 1);
}
123
124
// Reads 'codesize' bits (up to 64) from the byte buffer pBuf starting at
// absolute bit position bit_offset. Bits are consumed LSB-first within each
// byte and assembled into the result starting at bit 0. On return,
// bit_offset has been advanced by codesize bits.
inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
{
	assert(codesize <= 64);
	uint64_t bits = 0;
	uint32_t total_bits = 0;

	while (total_bits < codesize)
	{
		// Bit position within the current byte, and how many bits we can pull
		// from it (limited by both the byte boundary and the bits remaining).
		uint32_t byte_bit_offset = bit_offset & 7;
		uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset);

		// Extract and mask this byte's contribution.
		uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset;
		byte_bits &= ((1 << bits_to_read) - 1);

		// Merge it into the output at the next free bit position.
		bits |= ((uint64_t)(byte_bits) << total_bits);

		total_bits += bits_to_read;
		bit_offset += bits_to_read;
	}

	return bits;
}
146
147
// 32-bit variant of read_bits(): reads 'codesize' bits (up to 32) from pBuf
// starting at bit position bit_offset, LSB-first within each byte, and
// advances bit_offset by codesize bits.
inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
{
	assert(codesize <= 32);
	uint32_t bits = 0;
	uint32_t total_bits = 0;

	while (total_bits < codesize)
	{
		// Bits available in the current byte vs. bits still needed.
		uint32_t byte_bit_offset = bit_offset & 7;
		uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset);

		uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset;
		byte_bits &= ((1 << bits_to_read) - 1);

		// Place this chunk at the next free bit of the result.
		bits |= (byte_bits << total_bits);

		total_bits += bits_to_read;
		bit_offset += bits_to_read;
	}

	return bits;
}
169
170
// Open interval
// Debug-only range checks: assert l <= v < h, then pass v through untouched.
inline int bounds_check(int v, int l, int h) { assert((v >= l) && (v < h)); (void)v; (void)l; (void)h; return v; }
inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { assert((v >= l) && (v < h)); (void)v; (void)l; (void)h; return v; }
173
174
// Closed interval
// Debug-only range checks: assert l <= v <= h, then pass v through untouched.
inline int bounds_check_incl(int v, int l, int h) { assert((v >= l) && (v <= h)); (void)v; (void)l; (void)h; return v; }
inline uint32_t bounds_check_incl(uint32_t v, uint32_t l, uint32_t h) { assert((v >= l) && (v <= h)); (void)v; (void)l; (void)h; return v; }
177
178
// Counts leading zero bits of a 32-bit value; clz(0) is defined as 32.
inline uint32_t clz(uint32_t x)
{
	if (!x)
		return 32;

	uint32_t count = 0;
	for (uint32_t probe = x; (probe & 0x80000000) == 0; probe <<= 1u)
		count++;

	return count;
}
192
193
bool string_begins_with(const std::string& str, const char* pPhrase);
194
195
// Case sensitive, returns -1 if can't find
inline int string_find_first(const std::string& str, const char* pPhrase)
{
	const size_t pos = str.find(pPhrase, 0);
	return (pos == std::string::npos) ? -1 : (int)pos;
}
203
204
// Hashing
205
206
// 32-bit integer mixing function (add/xor/shift rounds with fixed magic
// constants). Deterministic bit scrambler for hashing; not cryptographic.
inline uint32_t bitmix32c(uint32_t v)
{
	v = (v + 0x7ed55d16) + (v << 12);
	v = (v ^ 0xc761c23c) ^ (v >> 19);
	v = (v + 0x165667b1) + (v << 5);
	v = (v + 0xd3a2646c) ^ (v << 9);
	v = (v + 0xfd7046c5) + (v << 3);
	v = (v ^ 0xb55a4f09) ^ (v >> 16);
	return v;
}
216
217
// 7-round shift/xor/subtract 32-bit integer mix (no magic constants).
// Cheaper alternative to bitmix32c(); not cryptographic.
inline uint32_t bitmix32(uint32_t v)
{
	v -= (v << 6);
	v ^= (v >> 17);
	v -= (v << 9);
	v ^= (v << 4);
	v -= (v << 3);
	v ^= (v << 10);
	v ^= (v >> 15);
	return v;
}
228
229
// Thomas Wang-style 32-bit integer finalizer hash: xor-shift and multiply
// rounds that diffuse every input bit across the result.
inline uint32_t wang_hash(uint32_t seed)
{
	seed = (seed ^ 61u) ^ (seed >> 16);
	seed = seed * 9u;
	seed = seed ^ (seed >> 4);
	seed = seed * 0x27d4eb2du;
	seed = seed ^ (seed >> 15);
	return seed;
}
238
239
uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);
240
241
// Hash functor that hashes the raw object bytes of the key via hash_hsieh().
// NOTE(review): only meaningful for trivially-copyable keys; padding bytes
// inside Key would make equal keys hash differently -- confirm at call sites.
template <typename Key>
struct bit_hasher
{
	inline std::size_t operator()(const Key& k) const
	{
		return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k));
	}
};
249
250
// Hash functor for std::string: hashes the character data with hash_hsieh().
// The empty string hashes to 0.
struct string_hasher
{
	inline std::size_t operator()(const std::string& k) const
	{
		size_t l = k.size();
		if (!l)
			return 0;
		return hash_hsieh(reinterpret_cast<const uint8_t*>(k.c_str()), l);
	}
};
260
261
// Online (streaming) statistics accumulator: tracks count, total, min, max,
// mean and variance of a sample stream one push() at a time, using Welford's
// numerically stable recurrence (Knuth TAOCP Vol. 2, 3rd ed., p. 232).
class running_stat
{
public:
	running_stat() { clear(); }

	// Resets all accumulated state.
	void clear()
	{
		m_n = 0;
		m_total = 0;
		m_old_m = 0;
		m_new_m = 0;
		m_old_s = 0;
		m_new_s = 0;
		m_min = 0;
		m_max = 0;
	}

	// Accumulates a single sample.
	void push(double x)
	{
		m_n++;
		m_total += x;
		if (m_n == 1)
		{
			// First sample seeds the running mean and min/max directly.
			m_old_m = m_new_m = x;
			m_old_s = 0.0;
			m_min = x;
			m_max = x;
		}
		else
		{
			// See Knuth TAOCP vol 2, 3rd edition, page 232
			m_new_m = m_old_m + (x - m_old_m) / m_n;
			m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m);
			m_old_m = m_new_m;
			m_old_s = m_new_s;
			m_min = basisu::minimum(x, m_min);
			m_max = basisu::maximum(x, m_max);
		}
	}

	// Number of samples pushed so far.
	uint32_t get_num() const
	{
		return m_n;
	}

	// Sum of all samples.
	double get_total() const
	{
		return m_total;
	}

	// Arithmetic mean, or 0 if no samples were pushed.
	double get_mean() const
	{
		return (m_n > 0) ? m_new_m : 0.0;
	}

	// Returns sample variance (n - 1 denominator); 0 with fewer than 2 samples.
	double get_variance() const
	{
		return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0);
	}

	// Sample standard deviation.
	double get_std_dev() const
	{
		return sqrt(get_variance());
	}

	// Smallest sample seen (0 if none were pushed).
	double get_min() const
	{
		return m_min;
	}

	// Largest sample seen (0 if none were pushed).
	double get_max() const
	{
		return m_max;
	}

private:
	uint32_t m_n;	// sample count
	double m_total, m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;
};
341
342
// Linear algebra
343
344
template <uint32_t N, typename T>
345
class vec
346
{
347
protected:
348
T m_v[N];
349
350
public:
351
enum { num_elements = N };
352
typedef T scalar_type;
353
354
inline vec() { }
355
inline vec(eZero) { set_zero(); }
356
357
explicit inline vec(T val) { set(val); }
358
inline vec(T v0, T v1) { set(v0, v1); }
359
inline vec(T v0, T v1, T v2) { set(v0, v1, v2); }
360
inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); }
361
inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; }
362
template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); }
363
364
inline const T& operator[](uint32_t i) const { assert(i < N); return m_v[i]; }
365
inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; }
366
367
inline T getX() const { return m_v[0]; }
368
inline T getY() const { static_assert(N >= 2, "N too small"); return m_v[1]; }
369
inline T getZ() const { static_assert(N >= 3, "N too small"); return m_v[2]; }
370
inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; }
371
372
inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false; return true; }
373
inline bool operator!=(const vec& rhs) const { return !(*this == rhs); }
374
inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }
375
376
inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }
377
inline void clear() { set_zero(); }
378
379
template <uint32_t OtherN, typename OtherT>
380
inline vec &set(const vec<OtherN, OtherT> &other)
381
{
382
uint32_t i;
383
if ((const void *)(&other) == (const void *)(this))
384
return *this;
385
const uint32_t m = minimum(OtherN, N);
386
for (i = 0; i < m; i++)
387
m_v[i] = static_cast<T>(other[i]);
388
for (; i < N; i++)
389
m_v[i] = 0;
390
return *this;
391
}
392
393
inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; }
394
inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; }
395
inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; }
396
397
inline vec &set(T v0, T v1)
398
{
399
m_v[0] = v0;
400
if (N >= 2)
401
{
402
m_v[1] = v1;
403
clear_elements(2, N);
404
}
405
return *this;
406
}
407
408
inline vec &set(T v0, T v1, T v2)
409
{
410
m_v[0] = v0;
411
if (N >= 2)
412
{
413
m_v[1] = v1;
414
if (N >= 3)
415
{
416
m_v[2] = v2;
417
clear_elements(3, N);
418
}
419
}
420
return *this;
421
}
422
423
inline vec &set(T v0, T v1, T v2, T v3)
424
{
425
m_v[0] = v0;
426
if (N >= 2)
427
{
428
m_v[1] = v1;
429
if (N >= 3)
430
{
431
m_v[2] = v2;
432
433
if (N >= 4)
434
{
435
m_v[3] = v3;
436
clear_elements(5, N);
437
}
438
}
439
}
440
return *this;
441
}
442
443
inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; }
444
template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; }
445
446
inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); }
447
inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); }
448
449
inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; }
450
inline vec operator+ () const { return *this; }
451
inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; }
452
inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; }
453
inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; }
454
inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; }
455
inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; }
456
inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; }
457
458
friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; }
459
friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; }
460
friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; }
461
friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; }
462
friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; }
463
friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; }
464
465
static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; }
466
467
inline T dot(const vec &rhs) const { return dot_product(*this, rhs); }
468
469
inline T norm() const { return dot_product(*this, *this); }
470
inline T length() const { return sqrt(norm()); }
471
472
inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; }
473
inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; }
474
475
inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }
476
inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }
477
478
inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; }
479
480
inline vec get_normalized() const { vec res(*this); res.normalize_in_place(); return res; }
481
482
inline vec &clamp(T l, T h)
483
{
484
for (uint32_t i = 0; i < N; i++)
485
m_v[i] = basisu::clamp(m_v[i], l, h);
486
return *this;
487
}
488
489
static vec component_mul(const vec& a, const vec& b)
490
{
491
vec res;
492
for (uint32_t i = 0; i < N; i++)
493
res[i] = a[i] * b[i];
494
return res;
495
}
496
497
static vec component_min(const vec& a, const vec& b)
498
{
499
vec res;
500
for (uint32_t i = 0; i < N; i++)
501
res[i] = minimum(a[i], b[i]);
502
return res;
503
}
504
505
static vec component_max(const vec& a, const vec& b)
506
{
507
vec res;
508
for (uint32_t i = 0; i < N; i++)
509
res[i] = maximum(a[i], b[i]);
510
return res;
511
}
512
513
static vec lerp(const vec& a, const vec& b, float s)
514
{
515
vec res;
516
for (uint32_t i = 0; i < N; i++)
517
res[i] = basisu::lerp(a[i], b[i], s);
518
return res;
519
}
520
};
521
522
// Common double-precision vector instantiations.
typedef vec<4, double> vec4D;
typedef vec<3, double> vec3D;
typedef vec<2, double> vec2D;
typedef vec<1, double> vec1D;

// Common single-precision vector instantiations.
typedef vec<6, float> vec6F;
typedef vec<5, float> vec5F;
typedef vec<4, float> vec4F;
typedef vec<3, float> vec3F;
typedef vec<2, float> vec2F;
typedef vec<1, float> vec1F;

typedef vec<16, float> vec16F;

// Mark all vec instantiations as safe to bitwise copy/move (used by
// basisu::vector's relocation optimizations).
template<uint32_t N, typename T> struct bitwise_copyable< vec<N, T> > { enum { cFlag = true }; };
template<uint32_t N, typename T> struct bitwise_movable< vec<N, T> > { enum { cFlag = true }; };
538
539
// Simple Rows x Cols matrix stored as an array of row vectors.
template <uint32_t Rows, uint32_t Cols, typename T>
class matrix
{
public:
	typedef vec<Rows, T> col_vec;
	typedef vec<Cols, T> row_vec;

	typedef T scalar_type;

	enum { rows = Rows, cols = Cols };

protected:
	row_vec m_r[Rows];

public:
	// Default construction leaves the elements uninitialized.
	inline matrix() {}
	inline matrix(eZero) { set_zero(); }
	inline matrix(const matrix &other) { for (uint32_t i = 0; i < Rows; i++) m_r[i] = other.m_r[i]; }
	inline matrix &operator=(const matrix &rhs) { if (this != &rhs) for (uint32_t i = 0; i < Rows; i++) m_r[i] = rhs.m_r[i]; return *this; }

	// Element access: (row, col).
	inline T operator()(uint32_t r, uint32_t c) const { assert((r < Rows) && (c < Cols)); return m_r[r][c]; }
	inline T &operator()(uint32_t r, uint32_t c) { assert((r < Rows) && (c < Cols)); return m_r[r][c]; }

	// Whole-row access.
	inline const row_vec &operator[](uint32_t r) const { assert(r < Rows); return m_r[r]; }
	inline row_vec &operator[](uint32_t r) { assert(r < Rows); return m_r[r]; }

	inline matrix &set_zero()
	{
		for (uint32_t i = 0; i < Rows; i++)
			m_r[i].set_zero();
		return *this;
	}

	// Sets ones on the main diagonal, zeros elsewhere (also valid for
	// non-square matrices, hence the i < Cols guard).
	inline matrix &set_identity()
	{
		for (uint32_t i = 0; i < Rows; i++)
		{
			m_r[i].set_zero();
			if (i < Cols)
				m_r[i][i] = 1.0f;
		}
		return *this;
	}
};

// Mark all matrix instantiations as safe to bitwise copy/move.
template<uint32_t R, uint32_t C, typename T> struct bitwise_copyable< matrix<R, C, T> > { enum { cFlag = true }; };
template<uint32_t R, uint32_t C, typename T> struct bitwise_movable< matrix<R, C, T> > { enum { cFlag = true }; };
586
587
// Estimates the principal axis (dominant eigenvector) of an NxN covariance
// matrix using up to 8 power iterations, and returns it normalized to unit
// length. VectorType is expected to be an N-element float vec.
template<uint32_t N, typename VectorType>
inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix)
{
	VectorType axis;
	if (N == 1)
		axis.set(1.0f);
	else
	{
		// Initial guess: a mild ramp across the components, so the starting
		// vector is unlikely to be orthogonal to the dominant eigenvector.
		for (uint32_t i = 0; i < N; i++)
			axis[i] = lerp(.75f, 1.25f, i * (1.0f / maximum<int>(N - 1, 1)));
	}

	VectorType prev_axis(axis);

	// Power iterations
	for (uint32_t power_iter = 0; power_iter < 8; power_iter++)
	{
		VectorType trial_axis;
		double max_sum = 0;

		// trial_axis = cmatrix * axis, tracking the largest-magnitude component.
		for (uint32_t i = 0; i < N; i++)
		{
			double sum = 0;
			for (uint32_t j = 0; j < N; j++)
				sum += cmatrix[i][j] * axis[j];

			trial_axis[i] = static_cast<float>(sum);

			max_sum = maximum(fabs(sum), max_sum);
		}

		// Rescale by the max component to keep the iterate well-conditioned.
		if (max_sum != 0.0f)
			trial_axis *= static_cast<float>(1.0f / max_sum);

		VectorType delta_axis(prev_axis - trial_axis);

		prev_axis = axis;
		axis = trial_axis;

		// Early out once the iteration has (approximately) converged.
		if (delta_axis.norm() < .0024f)
			break;
	}

	return axis.normalize_in_place();
}
632
633
// Fills pIndices with [0, num_indices) and sorts the indices so that
// pKeys[pIndices[0]] <= pKeys[pIndices[1]] <= ...; the keys are not moved.
template<typename T> inline void indirect_sort(uint32_t num_indices, uint32_t* pIndices, const T* pKeys)
{
	for (uint32_t idx = 0; idx < num_indices; idx++)
		pIndices[idx] = idx;

	const auto by_key = [pKeys](uint32_t lhs, uint32_t rhs) { return pKeys[lhs] < pKeys[rhs]; };
	std::sort(pIndices, pIndices + num_indices, by_key);
}
644
645
// 1-4 byte direct Radix sort.
// Sorts num_vals items of POD type T by an unsigned little-endian key that is
// key_size bytes long and located key_ofs bytes into each item. pBuf0 holds
// the input, pBuf1 is scratch of equal size. Returns a pointer to whichever
// buffer holds the sorted result (pBuf0 for an even number of passes, pBuf1
// for odd). The sort is stable within each pass, so it is stable overall.
template <typename T>
T* radix_sort(uint32_t num_vals, T* pBuf0, T* pBuf1, uint32_t key_ofs, uint32_t key_size)
{
	assert(key_ofs < sizeof(T));
	assert((key_size >= 1) && (key_size <= 4));

	// One 256-entry histogram per key byte/pass; only key_size of the 4 are used.
	uint32_t hist[256 * 4];

	memset(hist, 0, sizeof(hist[0]) * 256 * key_size);

// Reads 32 bits at key_ofs; the low key_size bytes are the sort key.
// NOTE(review): this may read up to 3 bytes past the key inside T -- confirm
// key_ofs leaves enough room in T on callers with key_size < 4.
#define BASISU_GET_KEY(p) (*(uint32_t *)((uint8_t *)(p) + key_ofs))

	// Histogram phase: count byte frequencies for every pass in one sweep,
	// specialized per key_size to avoid per-element branching.
	if (key_size == 4)
	{
		T* p = pBuf0;
		T* q = pBuf0 + num_vals;
		for (; p != q; p++)
		{
			const uint32_t key = BASISU_GET_KEY(p);

			hist[key & 0xFF]++;
			hist[256 + ((key >> 8) & 0xFF)]++;
			hist[512 + ((key >> 16) & 0xFF)]++;
			hist[768 + ((key >> 24) & 0xFF)]++;
		}
	}
	else if (key_size == 3)
	{
		T* p = pBuf0;
		T* q = pBuf0 + num_vals;
		for (; p != q; p++)
		{
			const uint32_t key = BASISU_GET_KEY(p);

			hist[key & 0xFF]++;
			hist[256 + ((key >> 8) & 0xFF)]++;
			hist[512 + ((key >> 16) & 0xFF)]++;
		}
	}
	else if (key_size == 2)
	{
		// Processes two elements per iteration; the odd trailing element (if
		// any) is handled after the loop.
		T* p = pBuf0;
		T* q = pBuf0 + (num_vals >> 1) * 2;

		for (; p != q; p += 2)
		{
			const uint32_t key0 = BASISU_GET_KEY(p);
			const uint32_t key1 = BASISU_GET_KEY(p + 1);

			hist[key0 & 0xFF]++;
			hist[256 + ((key0 >> 8) & 0xFF)]++;

			hist[key1 & 0xFF]++;
			hist[256 + ((key1 >> 8) & 0xFF)]++;
		}

		if (num_vals & 1)
		{
			const uint32_t key = BASISU_GET_KEY(p);

			hist[key & 0xFF]++;
			hist[256 + ((key >> 8) & 0xFF)]++;
		}
	}
	else
	{
		assert(key_size == 1);
		if (key_size != 1)
			return NULL;

		T* p = pBuf0;
		T* q = pBuf0 + (num_vals >> 1) * 2;

		for (; p != q; p += 2)
		{
			const uint32_t key0 = BASISU_GET_KEY(p);
			const uint32_t key1 = BASISU_GET_KEY(p + 1);

			hist[key0 & 0xFF]++;
			hist[key1 & 0xFF]++;
		}

		if (num_vals & 1)
		{
			const uint32_t key = BASISU_GET_KEY(p);
			hist[key & 0xFF]++;
		}
	}

	// Scatter phase: one stable counting-sort pass per key byte, ping-ponging
	// between the two buffers.
	T* pCur = pBuf0;
	T* pNew = pBuf1;

	for (uint32_t pass = 0; pass < key_size; pass++)
	{
		const uint32_t* pHist = &hist[pass << 8];

		// Exclusive prefix sum of this pass's histogram -> starting offsets.
		uint32_t offsets[256];

		uint32_t cur_ofs = 0;
		for (uint32_t i = 0; i < 256; i += 2)
		{
			offsets[i] = cur_ofs;
			cur_ofs += pHist[i];

			offsets[i + 1] = cur_ofs;
			cur_ofs += pHist[i + 1];
		}

		const uint32_t pass_shift = pass << 3;

		// Scatter two elements per iteration into their destination slots.
		T* p = pCur;
		T* q = pCur + (num_vals >> 1) * 2;

		for (; p != q; p += 2)
		{
			uint32_t c0 = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;
			uint32_t c1 = (BASISU_GET_KEY(p + 1) >> pass_shift) & 0xFF;

			if (c0 == c1)
			{
				// Same bucket: write both with a single offset update,
				// preserving their relative order (stability).
				uint32_t dst_offset0 = offsets[c0];

				offsets[c0] = dst_offset0 + 2;

				pNew[dst_offset0] = p[0];
				pNew[dst_offset0 + 1] = p[1];
			}
			else
			{
				uint32_t dst_offset0 = offsets[c0]++;
				uint32_t dst_offset1 = offsets[c1]++;

				pNew[dst_offset0] = p[0];
				pNew[dst_offset1] = p[1];
			}
		}

		// Odd trailing element.
		if (num_vals & 1)
		{
			uint32_t c = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;

			uint32_t dst_offset = offsets[c];
			offsets[c] = dst_offset + 1;

			pNew[dst_offset] = *p;
		}

		// Swap source/destination buffers for the next pass.
		T* t = pCur;
		pCur = pNew;
		pNew = t;
	}

	return pCur;
}

#undef BASISU_GET_KEY
802
803
// Very simple job pool with no dependencies.
// Declaration only -- the implementation lives in basisu_enc.cpp. Jobs are
// queued with add_job(); wait_for_all() blocks until the pool drains.
class job_pool
{
	BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);

public:
	// num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc.
	job_pool(uint32_t num_threads);
	~job_pool();

	// Queues a job for asynchronous execution.
	void add_job(const std::function<void()>& job);
	void add_job(std::function<void()>&& job);

	// Blocks until all queued jobs have completed.
	void wait_for_all();

	// Total threads = spawned workers + the calling thread.
	size_t get_total_threads() const { return 1 + m_threads.size(); }

private:
	std::vector<std::thread> m_threads;				// spawned worker threads (total - 1)
	std::vector<std::function<void()> > m_queue;	// pending jobs

	std::mutex m_mutex;								// guards m_queue / m_num_active_jobs
	std::condition_variable m_has_work;				// signaled when work is queued
	std::condition_variable m_no_more_jobs;			// signaled when the pool drains

	uint32_t m_num_active_jobs;

	std::atomic<bool> m_kill_flag;					// tells workers to exit

	std::atomic<int> m_num_active_workers;

	// Worker thread entry point.
	void job_thread(uint32_t index);
};
836
837
// Simple 64-bit color class

// RGBA color with signed 16-bit components, accessible either as the
// m_comps array or as named r/g/b/a fields (union aliasing, matching the
// layout asserted in the default constructor).
class color_rgba_i16
{
public:
	union
	{
		int16_t m_comps[4];

		struct
		{
			int16_t r;
			int16_t g;
			int16_t b;
			int16_t a;
		};
	};

	// Components are intentionally left uninitialized.
	inline color_rgba_i16()
	{
		static_assert(sizeof(*this) == sizeof(int16_t)*4, "sizeof(*this) == sizeof(int16_t)*4");
	}

	inline color_rgba_i16(int sr, int sg, int sb, int sa)
	{
		set(sr, sg, sb, sa);
	}

	// Sets all components, clamping each to [INT16_MIN, INT16_MAX].
	inline color_rgba_i16 &set(int sr, int sg, int sb, int sa)
	{
		m_comps[0] = (int16_t)clamp<int>(sr, INT16_MIN, INT16_MAX);
		m_comps[1] = (int16_t)clamp<int>(sg, INT16_MIN, INT16_MAX);
		m_comps[2] = (int16_t)clamp<int>(sb, INT16_MIN, INT16_MAX);
		m_comps[3] = (int16_t)clamp<int>(sa, INT16_MIN, INT16_MAX);
		return *this;
	}
};
874
875
class color_rgba
876
{
877
public:
878
union
879
{
880
uint8_t m_comps[4];
881
882
struct
883
{
884
uint8_t r;
885
uint8_t g;
886
uint8_t b;
887
uint8_t a;
888
};
889
};
890
891
inline color_rgba()
892
{
893
static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");
894
static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)");
895
}
896
897
// Not too hot about this idea.
898
inline color_rgba(const basist::color32& other) :
899
r(other.r),
900
g(other.g),
901
b(other.b),
902
a(other.a)
903
{
904
}
905
906
color_rgba& operator= (const basist::color32& rhs)
907
{
908
r = rhs.r;
909
g = rhs.g;
910
b = rhs.b;
911
a = rhs.a;
912
return *this;
913
}
914
915
inline color_rgba(int y)
916
{
917
set(y);
918
}
919
920
inline color_rgba(int y, int na)
921
{
922
set(y, na);
923
}
924
925
inline color_rgba(int sr, int sg, int sb, int sa)
926
{
927
set(sr, sg, sb, sa);
928
}
929
930
inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)
931
{
932
set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);
933
}
934
935
inline color_rgba& set_noclamp_y(int y)
936
{
937
m_comps[0] = (uint8_t)y;
938
m_comps[1] = (uint8_t)y;
939
m_comps[2] = (uint8_t)y;
940
m_comps[3] = (uint8_t)255;
941
return *this;
942
}
943
944
inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa)
945
{
946
m_comps[0] = (uint8_t)sr;
947
m_comps[1] = (uint8_t)sg;
948
m_comps[2] = (uint8_t)sb;
949
m_comps[3] = (uint8_t)sa;
950
return *this;
951
}
952
953
inline color_rgba &set(int y)
954
{
955
m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
956
m_comps[1] = m_comps[0];
957
m_comps[2] = m_comps[0];
958
m_comps[3] = 255;
959
return *this;
960
}
961
962
inline color_rgba &set(int y, int na)
963
{
964
m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
965
m_comps[1] = m_comps[0];
966
m_comps[2] = m_comps[0];
967
m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255));
968
return *this;
969
}
970
971
inline color_rgba &set(int sr, int sg, int sb, int sa)
972
{
973
m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
974
m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
975
m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
976
m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255));
977
return *this;
978
}
979
980
inline color_rgba &set_rgb(int sr, int sg, int sb)
981
{
982
m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
983
m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
984
m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
985
return *this;
986
}
987
988
inline color_rgba &set_rgb(const color_rgba &other)
989
{
990
r = other.r;
991
g = other.g;
992
b = other.b;
993
return *this;
994
}
995
996
inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; }
997
inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }
998
999
inline void clear()
1000
{
1001
m_comps[0] = 0;
1002
m_comps[1] = 0;
1003
m_comps[2] = 0;
1004
m_comps[3] = 0;
1005
}
1006
1007
inline bool operator== (const color_rgba &rhs) const
1008
{
1009
if (m_comps[0] != rhs.m_comps[0]) return false;
1010
if (m_comps[1] != rhs.m_comps[1]) return false;
1011
if (m_comps[2] != rhs.m_comps[2]) return false;
1012
if (m_comps[3] != rhs.m_comps[3]) return false;
1013
return true;
1014
}
1015
1016
inline bool operator!= (const color_rgba &rhs) const
1017
{
1018
return !(*this == rhs);
1019
}
1020
1021
inline bool operator<(const color_rgba &rhs) const
1022
{
1023
for (int i = 0; i < 4; i++)
1024
{
1025
if (m_comps[i] < rhs.m_comps[i])
1026
return true;
1027
else if (m_comps[i] != rhs.m_comps[i])
1028
return false;
1029
}
1030
return false;
1031
}
1032
1033
inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; }
1034
inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; }
1035
inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }
1036
1037
inline uint32_t get_bgra_uint32() const { return b | (g << 8) | (r << 16) | (a << 24); }
1038
inline uint32_t get_rgba_uint32() const { return r | (g << 8) | (b << 16) | (a << 24); }
1039
1040
inline basist::color32 get_color32() const
1041
{
1042
return basist::color32(r, g, b, a);
1043
}
1044
1045
static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); }
1046
static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); }
1047
};
1048
1049
typedef basisu::vector<color_rgba> color_rgba_vec;

// Common color constants: opaque black, transparent black, opaque white.
// NOTE(review): non-inline const globals in a header -- every including TU
// gets its own copy; harmless, but C++17 `inline` would deduplicate them.
const color_rgba g_black_color(0, 0, 0, 255);
const color_rgba g_black_trans_color(0, 0, 0, 0);
const color_rgba g_white_color(255, 255, 255, 255);
1054
1055
// Squared Euclidean distance between two RGB triplets.
inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1)
{
	const int dr = r0 - r1;
	const int dg = g0 - g1;
	const int db = b0 - b1;
	return (dr * dr) + (dg * dg) + (db * db);
}
1060
1061
// Squared Euclidean distance between two RGBA quadruplets.
inline int color_distance(int r0, int g0, int b0, int a0, int r1, int g1, int b1, int a1)
{
	const int dr = r0 - r1;
	const int dg = g0 - g1;
	const int db = b0 - b1;
	const int da = a0 - a1;
	return (dr * dr) + (dg * dg) + (db * db) + (da * da);
}
1066
1067
inline int color_distance(const color_rgba &c0, const color_rgba &c1, bool alpha)
1068
{
1069
if (alpha)
1070
return color_distance(c0.r, c0.g, c0.b, c0.a, c1.r, c1.g, c1.b, c1.a);
1071
else
1072
return color_distance(c0.r, c0.g, c0.b, c1.r, c1.g, c1.b);
1073
}
1074
1075
// TODO: Allow user to control channel weightings.
// Distance metric between two colors. When perceptual is true, uses a scaled
// luma/chroma-style metric with luma weighted far more heavily than chroma;
// otherwise falls back to plain squared RGB(A) distance. Several variants
// are selected via preprocessor flags; only one is compiled in.
inline uint32_t color_distance(bool perceptual, const color_rgba &e1, const color_rgba &e2, bool alpha)
{
	if (perceptual)
	{
#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE
		// High precision float path (slower): 709-weighted luma, chroma taken
		// as R-Y and B-Y, each difference squared and weighted.
		const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;
		const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;

		const float cr1 = e1.r - l1;
		const float cr2 = e2.r - l2;

		const float cb1 = e1.b - l1;
		const float cb2 = e2.b - l2;

		const float dl = l1 - l2;
		const float dcr = cr1 - cr2;
		const float dcb = cb1 - cb2;

		uint32_t d = static_cast<uint32_t>(32.0f*4.0f*dl*dl + 32.0f*2.0f*(.5f / (1.0f - .2126f))*(.5f / (1.0f - .2126f))*dcr*dcr + 32.0f*.25f*(.5f / (1.0f - .0722f))*(.5f / (1.0f - .0722f))*dcb*dcb);

		if (alpha)
		{
			int da = static_cast<int>(e1.a) - static_cast<int>(e2.a);
			d += static_cast<uint32_t>(128.0f*da*da);
		}

		return d;
#elif 1
		// Default integer path: fixed-point luma/chroma deltas (7 fractional
		// bits; luma weights 27/92/9 sum to 128).
		int dr = e1.r - e2.r;
		int dg = e1.g - e2.g;
		int db = e1.b - e2.b;

#if 0
		// Disabled 32-bit variant, kept for reference.
		int delta_l = dr * 27 + dg * 92 + db * 9;
		int delta_cr = dr * 128 - delta_l;
		int delta_cb = db * 128 - delta_l;

		uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) +
			((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) +
			((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U);
#else
		// 64-bit intermediates so squaring the deltas can't overflow.
		int64_t delta_l = dr * 27 + dg * 92 + db * 9;
		int64_t delta_cr = dr * 128 - delta_l;
		int64_t delta_cb = db * 128 - delta_l;

		uint32_t id = ((uint32_t)((delta_l * delta_l) >> 7U)) +
			((((uint32_t)((delta_cr * delta_cr) >> 7U)) * 26U) >> 7U) +
			((((uint32_t)((delta_cb * delta_cb) >> 7U)) * 3U) >> 7U);
#endif

		if (alpha)
		{
			int da = (e1.a - e2.a) << 7;
			// This shouldn't overflow if da is 255 or -255: 29.99 bits after squaring.
			id += ((uint32_t)(da * da) >> 7U);
		}

		return id;
#else
		// Alternate 64-bit scaled variant (currently compiled out).
		int dr = e1.r - e2.r;
		int dg = e1.g - e2.g;
		int db = e1.b - e2.b;

		int64_t delta_l = dr * 27 + dg * 92 + db * 9;
		int64_t delta_cr = dr * 128 - delta_l;
		int64_t delta_cb = db * 128 - delta_l;

		int64_t id = ((delta_l * delta_l) * 128) +
			((delta_cr * delta_cr) * 26) +
			((delta_cb * delta_cb) * 3);

		if (alpha)
		{
			int64_t da = (e1.a - e2.a);
			id += (da * da) * 128;
		}

		int d = (id + 8192) >> 14;

		return d;
#endif
	}
	else
		return color_distance(e1, e2, alpha);
}
1161
1162
// Squared distance over the .r and .a channels only (luma/alpha style pairs).
static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b)
{
	const int delta_l = a.r - b.r;
	const int delta_a = a.a - b.a;
	return (uint32_t)((delta_l * delta_l) + (delta_a * delta_a));
}
1168
1169
// String helpers
1170
1171
// Index of the last occurrence of c in filename, or -1 when absent.
inline int string_find_right(const std::string& filename, char c)
{
	const size_t pos = filename.find_last_of(c);
	if (pos == std::string::npos)
		return -1;
	return (int)pos;
}
1176
1177
inline std::string string_get_extension(const std::string &filename)
1178
{
1179
int sep = -1;
1180
#ifdef _WIN32
1181
sep = string_find_right(filename, '\\');
1182
#endif
1183
if (sep < 0)
1184
sep = string_find_right(filename, '/');
1185
1186
int dot = string_find_right(filename, '.');
1187
if (dot <= sep)
1188
return "";
1189
1190
std::string result(filename);
1191
result.erase(0, dot + 1);
1192
1193
return result;
1194
}
1195
1196
inline bool string_remove_extension(std::string &filename)
1197
{
1198
int sep = -1;
1199
#ifdef _WIN32
1200
sep = string_find_right(filename, '\\');
1201
#endif
1202
if (sep < 0)
1203
sep = string_find_right(filename, '/');
1204
1205
int dot = string_find_right(filename, '.');
1206
if ((dot < sep) || (dot < 0))
1207
return false;
1208
1209
filename.resize(dot);
1210
1211
return true;
1212
}
1213
1214
// Returns a lowercased copy of s; the input is not modified.
inline std::string string_tolower(const std::string& s)
{
	std::string result(s);
	for (char& ch : result)
	{
		// Cast through uint8_t so high-bit chars aren't passed as negative ints.
		ch = (char)tolower((uint8_t)ch);
	}
	return result;
}
1223
1224
// Bounded strcpy: copies pSrc into pDst (capacity dst_len), truncating if
// necessary. The destination is always NUL-terminated when dst_len > 0.
// Returns pDst.
inline char *strcpy_safe(char *pDst, size_t dst_len, const char *pSrc)
{
	assert(pDst && pSrc && dst_len);
	if (!dst_len)
		return pDst;

	const size_t src_len = strlen(pSrc);
	if (src_len < dst_len)
	{
		// Whole string plus terminator fits.
		memcpy(pDst, pSrc, src_len + 1);
	}
	else
	{
		// Truncate: copy what fits, then terminate manually.
		if (dst_len > 1)
			memcpy(pDst, pSrc, dst_len - 1);
		pDst[dst_len - 1] = '\0';
	}

	return pDst;
}
1244
1245
// True when s is non-empty and its final character is c.
inline bool string_ends_with(const std::string& s, char c)
{
	if (s.empty())
		return false;
	return s.back() == c;
}
1249
1250
// Splits path p into drive, directory, filename (sans extension) and extension
// components. Any output pointer may be NULL to skip that component.
// Returns false on failure.
inline bool string_split_path(const char *p, std::string *pDrive, std::string *pDir, std::string *pFilename, std::string *pExt)
{
#ifdef _MSC_VER
	char drive_buf[_MAX_DRIVE] = { 0 };
	char dir_buf[_MAX_DIR] = { 0 };
	char fname_buf[_MAX_FNAME] = { 0 };
	char ext_buf[_MAX_EXT] = { 0 };

	// _splitpath_s only fills the buffers we pass; unused ones get NULL/0.
	errno_t error = _splitpath_s(p,
		pDrive ? drive_buf : NULL, pDrive ? _MAX_DRIVE : 0,
		pDir ? dir_buf : NULL, pDir ? _MAX_DIR : 0,
		pFilename ? fname_buf : NULL, pFilename ? _MAX_FNAME : 0,
		pExt ? ext_buf : NULL, pExt ? _MAX_EXT : 0);
	if (error != 0)
		return false;

	if (pDrive) *pDrive = drive_buf;
	if (pDir) *pDir = dir_buf;
	if (pFilename) *pFilename = fname_buf;
	if (pExt) *pExt = ext_buf;
	return true;
#else
	// POSIX dirname()/basename() may modify their argument, so operate on
	// local copies (paths longer than 1023 chars are truncated by strcpy_safe).
	char dirtmp[1024], nametmp[1024];
	strcpy_safe(dirtmp, sizeof(dirtmp), p);
	strcpy_safe(nametmp, sizeof(nametmp), p);

	// POSIX paths have no drive component.
	if (pDrive)
		pDrive->resize(0);

	const char *pDirName = dirname(dirtmp);
	const char* pBaseName = basename(nametmp);
	if ((!pDirName) || (!pBaseName))
		return false;

	if (pDir)
	{
		*pDir = pDirName;
		// dirname() omits the trailing separator; add it back for callers.
		if ((pDir->size()) && (pDir->back() != '/'))
			*pDir += "/";
	}

	if (pFilename)
	{
		*pFilename = pBaseName;
		string_remove_extension(*pFilename);
	}

	if (pExt)
	{
		*pExt = pBaseName;
		*pExt = string_get_extension(*pExt);
		// Return the extension with a leading '.', matching _splitpath_s.
		if (pExt->size())
			*pExt = "." + *pExt;
	}

	return true;
#endif
}
1308
1309
// True when c separates path components ('/' everywhere; also '\\' on Windows).
inline bool is_path_separator(char c)
{
#ifdef _WIN32
	return (c == '/') || (c == '\\');
#else
	return c == '/';
#endif
}
1317
1318
// True when c is a drive separator (the ':' in "C:"); always false on
// non-Windows platforms, which have no drive letters.
inline bool is_drive_separator(char c)
{
#ifdef _WIN32
	return c == ':';
#else
	(void)c;
	return false;
#endif
}
1327
1328
inline void string_combine_path(std::string &dst, const char *p, const char *q)
1329
{
1330
std::string temp(p);
1331
if (temp.size() && !is_path_separator(q[0]))
1332
{
1333
if (!is_path_separator(temp.back()))
1334
temp.append(1, BASISU_PATH_SEPERATOR_CHAR);
1335
}
1336
temp += q;
1337
dst.swap(temp);
1338
}
1339
1340
inline void string_combine_path(std::string &dst, const char *p, const char *q, const char *r)
1341
{
1342
string_combine_path(dst, p, q);
1343
string_combine_path(dst, dst.c_str(), r);
1344
}
1345
1346
inline void string_combine_path_and_extension(std::string &dst, const char *p, const char *q, const char *r, const char *pExt)
1347
{
1348
string_combine_path(dst, p, q, r);
1349
if ((!string_ends_with(dst, '.')) && (pExt[0]) && (pExt[0] != '.'))
1350
dst.append(1, '.');
1351
dst.append(pExt);
1352
}
1353
1354
inline bool string_get_pathname(const char *p, std::string &path)
1355
{
1356
std::string temp_drive, temp_path;
1357
if (!string_split_path(p, &temp_drive, &temp_path, NULL, NULL))
1358
return false;
1359
string_combine_path(path, temp_drive.c_str(), temp_path.c_str());
1360
return true;
1361
}
1362
1363
inline bool string_get_filename(const char *p, std::string &filename)
1364
{
1365
std::string temp_ext;
1366
if (!string_split_path(p, nullptr, nullptr, &filename, &temp_ext))
1367
return false;
1368
filename += temp_ext;
1369
return true;
1370
}
1371
1372
// Small deterministic PRNG wrapper around std::mt19937.
class rand
{
	std::mt19937 m_rng;

public:
	rand() { }

	rand(uint32_t s) { seed(s); }

	void seed(uint32_t s) { m_rng.seed(s); }

	// Uniform integer in the closed interval [l, h].
	int irand(int l, int h)
	{
		std::uniform_int_distribution<int> dist(l, h);
		return dist(m_rng);
	}

	uint32_t urand32() { return static_cast<uint32_t>(irand(INT32_MIN, INT32_MAX)); }

	bool bit() { return irand(0, 1) == 1; }

	uint8_t byte() { return static_cast<uint8_t>(urand32()); }

	// Uniform float in the half-open interval [l, h).
	float frand(float l, float h)
	{
		std::uniform_real_distribution<float> dist(l, h);
		return dist(m_rng);
	}

	// Normally distributed float with the given mean/stddev.
	float gaussian(float mean, float stddev)
	{
		std::normal_distribution<float> dist(mean, stddev);
		return dist(m_rng);
	}
};
1396
1397
// Max-heap priority queue of (index, priority) pairs. The heap is stored
// 1-based in m_heap (slot 0 is unused); get_top_*() return the entry with the
// highest priority. Used below by tree_vector_quant to always split the node
// with the largest variance next.
class priority_queue
{
public:
	priority_queue() :
		m_size(0)
	{
	}

	void clear()
	{
		m_heap.clear();
		m_size = 0;
	}

	// Reserves room for max_entries and seeds the heap with a single entry.
	void init(uint32_t max_entries, uint32_t first_index, float first_priority)
	{
		m_heap.resize(max_entries + 1);
		m_heap[1].m_index = first_index;
		m_heap[1].m_priority = first_priority;
		m_size = 1;
	}

	inline uint32_t size() const { return m_size; }

	// Root of the heap: the entry with the highest priority.
	inline uint32_t get_top_index() const { return m_heap[1].m_index; }
	inline float get_top_priority() const { return m_heap[1].m_priority; }

	inline void delete_top()
	{
		assert(m_size > 0);
		// Move the last entry into the root, then restore the heap property.
		m_heap[1] = m_heap[m_size];
		m_size--;
		if (m_size)
			down_heap(1);
	}

	inline void add_heap(uint32_t index, float priority)
	{
		m_size++;

		uint32_t k = m_size;

		// Grow on demand if init()'s reservation is exceeded.
		if (m_size >= m_heap.size())
			m_heap.resize(m_size + 1);

		// Sift the new entry up until its parent has a strictly higher priority.
		for (;;)
		{
			uint32_t parent_index = k >> 1;
			if ((!parent_index) || (m_heap[parent_index].m_priority > priority))
				break;
			m_heap[k] = m_heap[parent_index];
			k = parent_index;
		}

		m_heap[k].m_index = index;
		m_heap[k].m_priority = priority;
	}

private:
	struct entry
	{
		uint32_t m_index;
		float m_priority;
	};

	basisu::vector<entry> m_heap;
	uint32_t m_size;

	// Push down entry at index
	inline void down_heap(uint32_t heap_index)
	{
		uint32_t orig_index = m_heap[heap_index].m_index;
		const float orig_priority = m_heap[heap_index].m_priority;

		// Descend, promoting the larger child, until the saved entry dominates.
		uint32_t child_index;
		while ((child_index = (heap_index << 1)) <= m_size)
		{
			if ((child_index < m_size) && (m_heap[child_index].m_priority < m_heap[child_index + 1].m_priority)) ++child_index;
			if (orig_priority > m_heap[child_index].m_priority)
				break;
			m_heap[heap_index] = m_heap[child_index];
			heap_index = child_index;
		}

		m_heap[heap_index].m_index = orig_index;
		m_heap[heap_index].m_priority = orig_priority;
	}
};
1485
1486
// Tree structured vector quantization (TSVQ)
1487
1488
// Tree structured vector quantizer. Weighted training vectors are added, then
// generate() repeatedly splits the leaf node with the highest variance (via a
// PCA-derived axis + k-means refinement) until the requested codebook size is
// reached. The retrieve() overloads read back the resulting clusters.
template <typename TrainingVectorType>
class tree_vector_quant
{
public:
	typedef TrainingVectorType training_vec_type;
	typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
	typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;

	tree_vector_quant() :
		m_next_codebook_index(0)
	{
	}

	void clear()
	{
		clear_vector(m_training_vecs);
		clear_vector(m_nodes);
		m_next_codebook_index = 0;
	}

	// Adds one training vector with the given importance weight.
	void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }

	size_t get_total_training_vecs() const { return m_training_vecs.size(); }
	const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; }
	array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; }

	// Appends each leaf's training-vector index list to the codebook.
	void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
	{
		for (uint32_t i = 0; i < m_nodes.size(); i++)
		{
			const tsvq_node &n = m_nodes[i];
			if (!n.is_leaf())
				continue;

			codebook.resize(codebook.size() + 1);
			codebook.back() = n.m_training_vecs;
		}
	}

	// Appends each leaf's centroid (m_origin) to the codebook.
	void retrieve(basisu::vector<TrainingVectorType> &codebook) const
	{
		for (uint32_t i = 0; i < m_nodes.size(); i++)
		{
			const tsvq_node &n = m_nodes[i];
			if (!n.is_leaf())
				continue;

			codebook.resize(codebook.size() + 1);
			codebook.back() = n.m_origin;
		}
	}

	// Retrieves at most max_clusters clusters by walking the split tree
	// depth-first; internal nodes split "too late" (codebook index beyond the
	// limit) are emitted whole instead of being descended into.
	void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
	{
		uint_vec node_stack;
		node_stack.reserve(512);

		codebook.resize(0);
		codebook.reserve(max_clusters);

		uint32_t node_index = 0;

		while (true)
		{
			const tsvq_node& cur = m_nodes[node_index];

			// Emit this subtree as a single cluster when it's a leaf, or when
			// descending would exceed the cluster budget.
			if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))
			{
				codebook.resize(codebook.size() + 1);
				codebook.back() = cur.m_training_vecs;

				if (node_stack.empty())
					break;

				node_index = node_stack.back();
				node_stack.pop_back();
				continue;
			}

			// Visit left child now; remember right child for later.
			node_stack.push_back(cur.m_right_index);
			node_index = cur.m_left_index;
		}
	}

	// Builds the split tree with up to max_size leaves.
	// Returns false when there are no training vectors.
	bool generate(uint32_t max_size)
	{
		if (!m_training_vecs.size())
			return false;

		m_next_codebook_index = 0;

		clear_vector(m_nodes);
		m_nodes.reserve(max_size * 2 + 1);

		m_nodes.push_back(prepare_root());

		// Heap keyed on node variance: always split the worst node next.
		priority_queue var_heap;
		var_heap.init(max_size, 0, m_nodes[0].m_var);

		basisu::vector<uint32_t> l_children, r_children;

		// Now split the worst nodes
		l_children.reserve(m_training_vecs.size() + 1);
		r_children.reserve(m_training_vecs.size() + 1);

		uint32_t total_leaf_nodes = 1;

		//interval_timer tm;
		//tm.start();

		while ((var_heap.size()) && (total_leaf_nodes < max_size))
		{
			const uint32_t node_index = var_heap.get_top_index();
			const tsvq_node &node = m_nodes[node_index];

			assert(node.m_var == var_heap.get_top_priority());
			assert(node.is_leaf());

			var_heap.delete_top();

			if (node.m_training_vecs.size() > 1)
			{
				if (split_node(node_index, var_heap, l_children, r_children))
				{
					// This removes one leaf node (making an internal node) and replaces it with two new leaves, so +1 total.
					total_leaf_nodes += 1;
				}
			}
		}

		//debug_printf("tree_vector_quant::generate %u: %3.3f secs\n", TrainingVectorType::num_elements, tm.get_elapsed_secs());

		return true;
	}

private:
	// One node of the split tree. Leaves (m_left_index < 0) own a list of
	// training-vector indices; internal nodes record their split order in
	// m_codebook_index.
	class tsvq_node
	{
	public:
		// Note: m_var is not initialized here; it's always set by set() or by
		// prepare_root() before use.
		inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }

		// vecs is erased
		inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }

		inline bool is_leaf() const { return m_left_index < 0; }

		float m_var;
		uint64_t m_weight;
		TrainingVectorType m_origin;
		int32_t m_left_index, m_right_index;
		basisu::vector<uint32_t> m_training_vecs;
		int m_codebook_index;
	};

	typedef basisu::vector<tsvq_node> tsvq_node_vec;
	tsvq_node_vec m_nodes;

	array_of_weighted_training_vecs m_training_vecs;

	uint32_t m_next_codebook_index;

	// Builds the root node: weighted centroid and total variance over all
	// training vectors.
	tsvq_node prepare_root() const
	{
		double ttsum = 0.0f;

		// Prepare root node containing all training vectors
		tsvq_node root;
		root.m_training_vecs.reserve(m_training_vecs.size());

		for (uint32_t i = 0; i < m_training_vecs.size(); i++)
		{
			const TrainingVectorType &v = m_training_vecs[i].first;
			const uint64_t weight = m_training_vecs[i].second;

			root.m_training_vecs.push_back(i);

			root.m_origin += (v * static_cast<float>(weight));
			root.m_weight += weight;

			ttsum += v.dot(v) * weight;
		}

		// var = E[v.v] - mean.mean (scaled by total weight).
		root.m_var = static_cast<float>(ttsum - (root.m_origin.dot(root.m_origin) / root.m_weight));

		root.m_origin *= (1.0f / root.m_weight);

		return root;
	}

	// Splits node_index into two children and pushes splittable children onto
	// var_heap. Returns false when no usable split was found.
	bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)
	{
		TrainingVectorType l_child_org, r_child_org;
		uint64_t l_weight = 0, r_weight = 0;
		float l_var = 0.0f, r_var = 0.0f;

		// Compute initial left/right child origins
		if (!prep_split(m_nodes[node_index], l_child_org, r_child_org))
			return false;

		// Use k-means iterations to refine these children vectors
		if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children))
			return false;

		// Create children
		const uint32_t l_child_index = (uint32_t)m_nodes.size(), r_child_index = (uint32_t)m_nodes.size() + 1;

		m_nodes[node_index].m_left_index = l_child_index;
		m_nodes[node_index].m_right_index = r_child_index;

		// Record split order; retrieve(max_clusters, ...) uses this to honor
		// the cluster budget.
		m_nodes[node_index].m_codebook_index = m_next_codebook_index;
		m_next_codebook_index++;

		m_nodes.resize(m_nodes.size() + 2);

		tsvq_node &l_child = m_nodes[l_child_index], &r_child = m_nodes[r_child_index];

		l_child.set(l_child_org, l_weight, l_var, l_children);
		r_child.set(r_child_org, r_weight, r_var, r_children);

		// A zero/negative variance with multiple distinct vectors means the
		// variance underflowed numerically; nudge it so the child can still be
		// split later.
		if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1))
		{
			TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first);

			for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++)
			{
				if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first))
				{
					l_child.m_var = 1e-4f;
					break;
				}
			}
		}

		if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1))
		{
			TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first);

			for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++)
			{
				if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first))
				{
					r_child.m_var = 1e-4f;
					break;
				}
			}
		}

		// Only children that can still be subdivided go back on the heap.
		if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1))
			var_heap.add_heap(l_child_index, l_child.m_var);

		if ((r_child.m_var > 0.0f) && (r_child.m_training_vecs.size() > 1))
			var_heap.add_heap(r_child_index, r_child.m_var);

		return true;
	}

	// Returns the principal axis (largest-eigenvalue eigenvector) of the
	// node's weighted covariance matrix.
	TrainingVectorType compute_split_axis(const tsvq_node &node) const
	{
		const uint32_t N = TrainingVectorType::num_elements;

		matrix<N, N, float> cmatrix;

		if ((N != 16) || (!g_cpu_supports_sse41))
		{
			cmatrix.set_zero();

			// Compute covariance matrix from weighted input vectors
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);
				const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);

				// Only the upper triangle is accumulated; mirrored below.
				for (uint32_t x = 0; x < N; x++)
					for (uint32_t y = x; y < N; y++)
						cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];
			}
		}
		else
		{
#if BASISU_SUPPORT_SSE
			// Specialize the case with 16x16 matrices, which are quite expensive without SIMD.
			// This SSE function takes pointers to void types, so do some sanity checks.
			assert(sizeof(TrainingVectorType) == sizeof(float) * 16);
			assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>));
			update_covar_matrix_16x16_sse41(node.m_training_vecs.size_u32(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);
#endif
		}

		const float renorm_scale = 1.0f / node.m_weight;

		for (uint32_t x = 0; x < N; x++)
			for (uint32_t y = x; y < N; y++)
				cmatrix[x][y] *= renorm_scale;

		// Diagonal flip
		for (uint32_t x = 0; x < (N - 1); x++)
			for (uint32_t y = x + 1; y < N; y++)
				cmatrix[y][x] = cmatrix[x][y];

		return compute_pca_from_covar<N, TrainingVectorType>(cmatrix);
	}

	// Computes initial left/right child centroids by partitioning the node's
	// vectors on the sign of their projection onto the principal axis, with a
	// median-split fallback when that puts everything on one side.
	bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
	{
		//const uint32_t N = TrainingVectorType::num_elements;

		// Trivial case: two vectors become the two children directly.
		if (2 == node.m_training_vecs.size())
		{
			l_child_result = m_training_vecs[node.m_training_vecs[0]].first;
			r_child_result = m_training_vecs[node.m_training_vecs[1]].first;
			return true;
		}

		TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f);
		double l_weight = 0.0f, r_weight = 0.0f;

		// Compute initial left/right children
		for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
		{
			const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;

			const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;

			double t = (v - node.m_origin).dot(axis);
			if (t >= 0.0f)
			{
				r_child += v * weight;
				r_weight += weight;
			}
			else
			{
				l_child += v * weight;
				l_weight += weight;
			}
		}

		if ((l_weight > 0.0f) && (r_weight > 0.0f))
		{
			l_child_result = l_child * static_cast<float>(1.0f / l_weight);
			r_child_result = r_child * static_cast<float>(1.0f / r_weight);
		}
		else
		{
			// PCA split degenerated: fall back to splitting on the component
			// with the largest value range.
			TrainingVectorType l(1e+20f);
			TrainingVectorType h(-1e+20f);
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;

				l = TrainingVectorType::component_min(l, v);
				h = TrainingVectorType::component_max(h, v);
			}

			TrainingVectorType r(h - l);

			float largest_axis_v = 0.0f;
			int largest_axis_index = -1;
			for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++)
			{
				if (r[i] > largest_axis_v)
				{
					largest_axis_v = r[i];
					largest_axis_index = i;
				}
			}

			// All vectors identical on every axis: unsplittable here.
			if (largest_axis_index < 0)
				return false;

			basisu::vector<float> keys(node.m_training_vecs.size());
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];

			uint_vec indices(node.m_training_vecs.size());
			indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]);

			// NOTE(review): `indices` produced by indirect_sort is never read
			// below — the halves are taken in the node's original vector order,
			// not sorted order. Possibly the loop was meant to index through
			// indices[i]; verify against upstream basis_universal.
			l_child.set_zero();
			l_weight = 0;

			r_child.set_zero();
			r_weight = 0;

			const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2;
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;

				const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;

				if (i < half_index)
				{
					l_child += v * weight;
					l_weight += weight;
				}
				else
				{
					r_child += v * weight;
					r_weight += weight;
				}
			}

			if ((l_weight > 0.0f) && (r_weight > 0.0f))
			{
				l_child_result = l_child * static_cast<float>(1.0f / l_weight);
				r_child_result = r_child * static_cast<float>(1.0f / r_weight);
			}
			else
			{
				// Last resort: use the component-wise min/max as the children.
				l_child_result = l;
				r_child_result = h;
			}
		}

		return true;
	}

	// Refines the two child centroids with up to 6 k-means iterations,
	// assigning each vector to its nearer child. Outputs the final centroids,
	// weights, variances and membership lists. Returns false when the node
	// can't be split into two non-empty children.
	bool refine_split(const tsvq_node &node,
		TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,
		TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const
	{
		l_children.reserve(node.m_training_vecs.size());
		r_children.reserve(node.m_training_vecs.size());

		float prev_total_variance = 1e+10f;

		// Refine left/right children locations using k-means iterations
		const uint32_t cMaxIters = 6;
		for (uint32_t iter = 0; iter < cMaxIters; iter++)
		{
			l_children.resize(0);
			r_children.resize(0);

			TrainingVectorType new_l_child(cZero), new_r_child(cZero);

			double l_ttsum = 0.0f, r_ttsum = 0.0f;

			l_weight = 0;
			r_weight = 0;

			// Assignment step: each vector goes to its nearer centroid.
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;
				const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;

				double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v);

				if (left_dist2 >= right_dist2)
				{
					new_r_child += (v * static_cast<float>(weight));
					r_weight += weight;

					r_ttsum += weight * v.dot(v);
					r_children.push_back(node.m_training_vecs[i]);
				}
				else
				{
					new_l_child += (v * static_cast<float>(weight));
					l_weight += weight;

					l_ttsum += weight * v.dot(v);
					l_children.push_back(node.m_training_vecs[i]);
				}
			}

			// Node is unsplittable using the above algorithm - try something else to split it up.
			if ((!l_weight) || (!r_weight))
			{
				l_children.resize(0);
				new_l_child.set(0.0f);
				l_ttsum = 0.0f;
				l_weight = 0;

				r_children.resize(0);
				new_r_child.set(0.0f);
				r_ttsum = 0.0f;
				r_weight = 0;

				// Fallback partition: vectors equal to the first vector go
				// right, everything else goes left.
				TrainingVectorType firstVec;
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				{
					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
					const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;

					if ((!i) || (v == firstVec))
					{
						firstVec = v;

						new_r_child += (v * static_cast<float>(weight));
						r_weight += weight;

						r_ttsum += weight * v.dot(v);
						r_children.push_back(node.m_training_vecs[i]);
					}
					else
					{
						new_l_child += (v * static_cast<float>(weight));
						l_weight += weight;

						l_ttsum += weight * v.dot(v);
						l_children.push_back(node.m_training_vecs[i]);
					}
				}

				// All vectors identical: genuinely unsplittable.
				if ((!l_weight) || (!r_weight))
					return false;
			}

			l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight));
			r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight));

			new_l_child *= (1.0f / l_weight);
			new_r_child *= (1.0f / r_weight);

			l_child = new_l_child;
			r_child = new_r_child;

			float total_var = l_var + r_var;
			const float cGiveupVariance = .00001f;
			if (total_var < cGiveupVariance)
				break;

			// Check to see if the variance has settled
			const float cVarianceDeltaThresh = .00125f;
			if (((prev_total_variance - total_var) / total_var) < cVarianceDeltaThresh)
				break;

			prev_total_variance = total_var;
		}

		return true;
	}
};
2020
2021
// A group of identical training vectors collapsed into one entry:
// the summed weight plus the original training-vector indices it covers.
struct weighted_block_group
{
	uint64_t m_total_weight;
	uint_vec m_indices;
};
2026
2027
// Runs hierarchical codebook generation, optionally splitting the work across
// up to 16 threads via pJob_pool. For small inputs (or when threading can't
// pay off) it quantizes serially. Otherwise it first clusters the training
// vectors into one top-level cluster per thread, then runs an independent
// quantizer on each cluster in parallel and concatenates the results.
// Returns false if any quantizer fails.
template<typename Quantizer>
bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,
	uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
	basisu::vector<uint_vec>& codebook,
	basisu::vector<uint_vec>& parent_codebook,
	uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)
{
	codebook.resize(0);
	parent_codebook.resize(0);

	// Serial path: too little work, or too few clusters per thread to be worth it.
	if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16))
	{
		if (!q.generate(max_codebook_size))
			return false;

		q.retrieve(codebook);

		if (max_parent_codebook_size)
			q.retrieve(max_parent_codebook_size, parent_codebook);

		return true;
	}

	const uint32_t cMaxThreads = 16;
	if (max_threads > cMaxThreads)
		max_threads = cMaxThreads;

	// First level: split the input into one coarse cluster per thread.
	if (!q.generate(max_threads))
		return false;

	basisu::vector<uint_vec> initial_codebook;

	q.retrieve(initial_codebook);

	// Not enough top-level clusters to parallelize; the coarse result is final.
	if (initial_codebook.size() < max_threads)
	{
		codebook = initial_codebook;

		if (max_parent_codebook_size)
			q.retrieve(max_parent_codebook_size, parent_codebook);

		return true;
	}

	Quantizer quantizers[cMaxThreads];

	bool success_flags[cMaxThreads];
	clear_obj(success_flags);

	basisu::vector<uint_vec> local_clusters[cMaxThreads];
	basisu::vector<uint_vec> local_parent_clusters[cMaxThreads];

	for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
	{
		// Each job works only on its own thread_iter slot of the shared
		// arrays, so no synchronization is needed beyond wait_for_all().
		pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {

			Quantizer& lq = quantizers[thread_iter];
			uint_vec& cluster_indices = initial_codebook[thread_iter];

			// Local quantizer indices -> global training vector indices.
			uint_vec local_to_global(cluster_indices.size());

			for (uint32_t i = 0; i < cluster_indices.size(); i++)
			{
				const uint32_t global_training_vec_index = cluster_indices[i];
				local_to_global[i] = global_training_vec_index;

				lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second);
			}

			// Each thread gets an equal share of the cluster budget, or is
			// unbounded when limit_clusterizers is false.
			const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs();

			success_flags[thread_iter] = lq.generate(max_clusters);

			if (success_flags[thread_iter])
			{
				lq.retrieve(local_clusters[thread_iter]);

				// Remap local indices back to global training vector indices.
				for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++)
				{
					for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++)
						local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]];
				}

				if (max_parent_codebook_size)
				{
					lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]);

					for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++)
					{
						for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++)
							local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]];
					}
				}
			}

		} );

	} // thread_iter

	pJob_pool->wait_for_all();

	uint32_t total_clusters = 0, total_parent_clusters = 0;

	for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++)
	{
		if (!success_flags[thread_iter])
			return false;
		total_clusters += (uint32_t)local_clusters[thread_iter].size();
		total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size();
	}

	codebook.reserve(total_clusters);
	parent_codebook.reserve(total_parent_clusters);

	// Concatenate the per-thread results (swap avoids copying each cluster).
	for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
	{
		for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++)
		{
			codebook.resize(codebook.size() + 1);
			codebook.back().swap(local_clusters[thread_iter][j]);
		}

		for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++)
		{
			parent_codebook.resize(parent_codebook.size() + 1);
			parent_codebook.back().swap(local_parent_clusters[thread_iter][j]);
		}
	}

	return true;
}
2158
2159
// Builds a hierarchical codebook (and optional parent codebook) from q's training vectors,
// using pJob_pool with up to max_threads worker threads.
//
// The quantizer's training vectors are first deduplicated: identical vectors are merged into a
// single weighted group (summing their weights and remembering every original training-vector
// index). Clusterization then runs on the much smaller set of unique vectors, and the resulting
// group indices are expanded back to original training-vector indices on output.
//
// If even_odd_input_pairs_equal is true, the caller guarantees the training vectors come in
// identical even/odd pairs (vec[2i] == vec[2i+1]), which lets the dedup loop process two
// entries at a time.
//
// Returns true on success, false if the internal clusterization fails.
template<typename Quantizer>
bool generate_hierarchical_codebook_threaded(Quantizer& q,
	uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
	basisu::vector<uint_vec>& codebook,
	basisu::vector<uint_vec>& parent_codebook,
	uint32_t max_threads, job_pool *pJob_pool,
	bool even_odd_input_pairs_equal)
{
	// rg 6/24/2025 - Cross platform determinism
	// A hash map iterates in an unspecified (platform/implementation dependent) order, which made
	// the output differ across platforms. std::map's deterministic ordering is used instead.
#if 0
	typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;
	typedef std::unordered_map < typename Quantizer::training_vec_type, weighted_block_group,
		training_vec_bit_hasher> group_hash;
#else
	typedef std::map< typename Quantizer::training_vec_type, weighted_block_group > group_hash;
#endif

	//interval_timer tm;
	//tm.start();

	// Maps each unique training vector to its merged weight and the list of original indices.
	group_hash unique_vecs;

	// rg 6/24/2025 - Cross platform determinism
#if 0
	unique_vecs.reserve(20000);
#endif

	weighted_block_group g;

	if (even_odd_input_pairs_equal)
	{
		// Inputs come in identical even/odd pairs, so each insert covers two indices.
		g.m_indices.resize(2);

		assert(q.get_training_vecs().size() >= 2 && (q.get_training_vecs().size() & 1) == 0);

		for (uint32_t i = 0; i < q.get_training_vecs().size(); i += 2)
		{
			assert(q.get_training_vecs()[i].first == q.get_training_vecs()[i + 1].first);

			g.m_total_weight = q.get_training_vecs()[i].second + q.get_training_vecs()[i + 1].second;
			g.m_indices[0] = i;
			g.m_indices[1] = i + 1;

			auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));

			if (!ins_res.second)
			{
				// Vector already seen: fold this pair's weight and indices into the existing group.
				(ins_res.first)->second.m_total_weight += g.m_total_weight;
				(ins_res.first)->second.m_indices.push_back(i);
				(ins_res.first)->second.m_indices.push_back(i + 1);
			}
		}
	}
	else
	{
		g.m_indices.resize(1);

		for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)
		{
			g.m_total_weight = q.get_training_vecs()[i].second;
			g.m_indices[0] = i;

			auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));

			if (!ins_res.second)
			{
				// Duplicate vector: accumulate its weight and original index.
				(ins_res.first)->second.m_total_weight += g.m_total_weight;
				(ins_res.first)->second.m_indices.push_back(i);
			}
		}
	}

	//debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors, %3.3f secs\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size(), tm.get_elapsed_secs());
	debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size());

	// Build a secondary quantizer over the unique vectors only, remembering each map iterator
	// so group indices can be translated back to the merged index lists afterwards.
	Quantizer group_quant;
	typedef typename group_hash::const_iterator group_hash_const_iter;
	basisu::vector<group_hash_const_iter> unique_vec_iters;
	unique_vec_iters.reserve(unique_vecs.size());

	for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)
	{
		group_quant.add_training_vec(iter->first, iter->second.m_total_weight);
		unique_vec_iters.push_back(iter);
	}

	// Only cap the clusterizer output when there are more unique vectors than codebook entries.
	bool limit_clusterizers = true;
	if (unique_vecs.size() <= max_codebook_size)
		limit_clusterizers = false;

	debug_printf("Limit clusterizers: %u\n", limit_clusterizers);

	// Small inputs run single threaded; the threading overhead isn't worth it below this size.
	basisu::vector<uint_vec> group_codebook, group_parent_codebook;
	bool status = generate_hierarchical_codebook_threaded_internal(group_quant,
		max_codebook_size, max_parent_codebook_size,
		group_codebook,
		group_parent_codebook,
		(unique_vecs.size() < 65536*4) ? 1 : max_threads, limit_clusterizers, pJob_pool);

	if (!status)
		return false;

	// Expand each codebook entry's group indices back to the original training-vector indices.
	codebook.resize(0);
	for (uint32_t i = 0; i < group_codebook.size(); i++)
	{
		codebook.resize(codebook.size() + 1);

		for (uint32_t j = 0; j < group_codebook[i].size(); j++)
		{
			const uint32_t group_index = group_codebook[i][j];

			typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
			const uint_vec& training_vec_indices = group_iter->second.m_indices;

			append_vector(codebook.back(), training_vec_indices);
		}
	}

	// Same expansion for the parent codebook.
	parent_codebook.resize(0);
	for (uint32_t i = 0; i < group_parent_codebook.size(); i++)
	{
		parent_codebook.resize(parent_codebook.size() + 1);

		for (uint32_t j = 0; j < group_parent_codebook[i].size(); j++)
		{
			const uint32_t group_index = group_parent_codebook[i][j];

			typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
			const uint_vec& training_vec_indices = group_iter->second.m_indices;

			append_vector(parent_codebook.back(), training_vec_indices);
		}
	}

	return true;
}
2295
2296
// Canonical Huffman coding
2297
2298
class histogram
2299
{
2300
basisu::vector<uint32_t> m_hist;
2301
2302
public:
2303
histogram(uint32_t size = 0) { init(size); }
2304
2305
void clear()
2306
{
2307
clear_vector(m_hist);
2308
}
2309
2310
void init(uint32_t size)
2311
{
2312
m_hist.resize(0);
2313
m_hist.resize(size);
2314
}
2315
2316
inline uint32_t size() const { return static_cast<uint32_t>(m_hist.size()); }
2317
2318
inline const uint32_t &operator[] (uint32_t index) const
2319
{
2320
return m_hist[index];
2321
}
2322
2323
inline uint32_t &operator[] (uint32_t index)
2324
{
2325
return m_hist[index];
2326
}
2327
2328
inline void inc(uint32_t index)
2329
{
2330
m_hist[index]++;
2331
}
2332
2333
uint64_t get_total() const
2334
{
2335
uint64_t total = 0;
2336
for (uint32_t i = 0; i < m_hist.size(); ++i)
2337
total += m_hist[i];
2338
return total;
2339
}
2340
2341
double get_entropy() const
2342
{
2343
double total = static_cast<double>(get_total());
2344
if (total == 0.0f)
2345
return 0.0f;
2346
2347
const double inv_total = 1.0f / total;
2348
const double neg_inv_log2 = -1.0f / log(2.0f);
2349
2350
double e = 0.0f;
2351
for (uint32_t i = 0; i < m_hist.size(); i++)
2352
if (m_hist[i])
2353
e += log(m_hist[i] * inv_total) * neg_inv_log2 * static_cast<double>(m_hist[i]);
2354
2355
return e;
2356
}
2357
};
2358
2359
// Symbol/value pair used while building canonical Huffman codes.
struct sym_freq
{
	uint32_t m_key;       // sort key — the symbol's frequency during the radix-sort pass
	uint16_t m_sym_index; // index of the original symbol this entry refers to
};
2364
2365
// Radix sorts num_syms sym_freq entries by m_key using pSyms0/pSyms1 as ping-pong buffers;
// returns a pointer to whichever buffer holds the sorted result.
sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1);
// Computes minimum-redundancy (Huffman) code lengths in place over the frequency-sorted array A.
void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms);
// Adjusts the code-length counts in pNum_codes so no code exceeds max_code_size bits.
void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size);
2368
2369
class huffman_encoding_table
2370
{
2371
public:
2372
huffman_encoding_table()
2373
{
2374
}
2375
2376
void clear()
2377
{
2378
clear_vector(m_codes);
2379
clear_vector(m_code_sizes);
2380
}
2381
2382
bool init(const histogram &h, uint32_t max_code_size = cHuffmanMaxSupportedCodeSize)
2383
{
2384
return init(h.size(), &h[0], max_code_size);
2385
}
2386
2387
bool init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size);
2388
bool init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size);
2389
2390
inline const uint16_vec &get_codes() const { return m_codes; }
2391
inline const uint8_vec &get_code_sizes() const { return m_code_sizes; }
2392
2393
uint32_t get_total_used_codes() const
2394
{
2395
for (int i = static_cast<int>(m_code_sizes.size()) - 1; i >= 0; i--)
2396
if (m_code_sizes[i])
2397
return i + 1;
2398
return 0;
2399
}
2400
2401
private:
2402
uint16_vec m_codes;
2403
uint8_vec m_code_sizes;
2404
};
2405
2406
// LSB-first bit stream writer. Bits accumulate in a small buffer and are appended to
// m_bytes one byte at a time; m_total_bits tracks everything written (including flush padding).
// Also provides several higher-level entropy-coding primitives (Huffman, truncated binary,
// Rice, chunked VLC).
class bitwise_coder
{
public:
	bitwise_coder() :
		m_bit_buffer(0),
		m_bit_buffer_size(0),
		m_total_bits(0)
	{
	}

	bitwise_coder(const bitwise_coder& other) :
		m_bytes(other.m_bytes),
		m_bit_buffer(other.m_bit_buffer),
		m_bit_buffer_size(other.m_bit_buffer_size),
		m_total_bits(other.m_total_bits)
	{
	}

	bitwise_coder(bitwise_coder&& other) :
		m_bytes(std::move(other.m_bytes)),
		m_bit_buffer(other.m_bit_buffer),
		m_bit_buffer_size(other.m_bit_buffer_size),
		m_total_bits(other.m_total_bits)
	{
	}

	bitwise_coder& operator= (const bitwise_coder& rhs)
	{
		if (this == &rhs)
			return *this;

		m_bytes = rhs.m_bytes;
		m_bit_buffer = rhs.m_bit_buffer;
		m_bit_buffer_size = rhs.m_bit_buffer_size;
		m_total_bits = rhs.m_total_bits;

		return *this;
	}

	bitwise_coder& operator= (bitwise_coder&& rhs)
	{
		if (this == &rhs)
			return *this;

		m_bytes = std::move(rhs.m_bytes);
		m_bit_buffer = rhs.m_bit_buffer;
		m_bit_buffer_size = rhs.m_bit_buffer_size;
		m_total_bits = rhs.m_total_bits;

		return *this;
	}

	// Resets the coder and frees the byte buffer's storage.
	inline void clear()
	{
		clear_vector(m_bytes);
		m_bit_buffer = 0;
		m_bit_buffer_size = 0;
		m_total_bits = 0;
	}

	// Resets the coder but keeps the byte buffer's capacity for reuse.
	inline void restart()
	{
		m_bytes.resize(0);
		m_bit_buffer = 0;
		m_bit_buffer_size = 0;
		m_total_bits = 0;
	}

	inline const uint8_vec &get_bytes() const { return m_bytes; }
	inline uint8_vec& get_bytes() { return m_bytes; }

	inline void reserve(uint32_t size) { m_bytes.reserve(size); }

	inline uint64_t get_total_bits() const { return m_total_bits; }
	inline uint32_t get_total_bits_u32() const { assert(m_total_bits <= UINT32_MAX); return static_cast<uint32_t>(m_total_bits); }
	inline void clear_total_bits() { m_total_bits = 0; }

	// Resets the coder, pre-reserving reserve_size bytes of output storage.
	inline void init(uint32_t reserve_size = 1024)
	{
		m_bytes.reserve(reserve_size);
		m_bytes.resize(0);

		m_bit_buffer = 0;
		m_bit_buffer_size = 0;
		m_total_bits = 0;
	}

	// Pads the stream with zero bits to the next byte boundary and emits the final partial byte.
	// Returns 8 if a byte was emitted, 0 if the stream was already byte aligned.
	inline uint32_t flush()
	{
		if (m_bit_buffer_size)
		{
			// Account for the zero padding bits added to reach the byte boundary.
			m_total_bits += 8 - (m_bit_buffer_size & 7);
			append_byte(static_cast<uint8_t>(m_bit_buffer));

			m_bit_buffer = 0;
			m_bit_buffer_size = 0;

			return 8;
		}

		return 0;
	}

	// Writes the low num_bits bits of "bits" (LSB first), num_bits in [0,32].
	// "bits" must fit in num_bits. Returns num_bits.
	inline uint32_t put_bits(uint32_t bits, uint32_t num_bits)
	{
		assert(num_bits <= 32);
		assert(bits < (1ULL << num_bits));

		if (!num_bits)
			return 0;

		m_total_bits += num_bits;

		// Merge new bits above whatever partial bits are pending, then drain whole bytes.
		uint64_t v = (static_cast<uint64_t>(bits) << m_bit_buffer_size) | m_bit_buffer;
		m_bit_buffer_size += num_bits;

		while (m_bit_buffer_size >= 8)
		{
			append_byte(static_cast<uint8_t>(v));
			v >>= 8;
			m_bit_buffer_size -= 8;
		}

		// Fewer than 8 bits remain, so the leftover always fits in a byte.
		m_bit_buffer = static_cast<uint8_t>(v);
		return num_bits;
	}

	// Writes symbol sym's Huffman code from tab. Returns the code length in bits.
	inline uint32_t put_code(uint32_t sym, const huffman_encoding_table &tab)
	{
		uint32_t code = tab.get_codes()[sym];
		uint32_t code_size = tab.get_code_sizes()[sym];
		assert(code_size >= 1);
		put_bits(code, code_size);
		return code_size;
	}

	// Writes v in [0,n) using truncated binary (economy) coding: the first u = 2^(k+1)-n
	// values use k bits, the rest use k+1 bits. Returns the number of bits written.
	inline uint32_t put_truncated_binary(uint32_t v, uint32_t n)
	{
		assert((n >= 2) && (v < n));

		uint32_t k = floor_log2i(n);
		uint32_t u = (1 << (k + 1)) - n;

		if (v < u)
			return put_bits(v, k);

		uint32_t x = v + u;
		assert((x >> 1) >= u);

		put_bits(x >> 1, k);
		put_bits(x & 1, 1);
		return k + 1;
	}

	// Writes v using Rice coding with parameter m: a unary quotient (q ones then a 0)
	// followed by the m-bit remainder. Returns the number of bits written.
	inline uint32_t put_rice(uint32_t v, uint32_t m)
	{
		assert(m);

		const uint64_t start_bits = m_total_bits;

		uint32_t q = v >> m, r = v & ((1 << m) - 1);

		// rice coding sanity check
		assert(q <= 64);

		// Emit the unary quotient in chunks of at most 16 one-bits per put_bits() call.
		for (; q > 16; q -= 16)
			put_bits(0xFFFF, 16);

		put_bits((1 << q) - 1, q);
		// r << 1 keeps the low bit 0, which terminates the unary run, then the m remainder bits.
		put_bits(r << 1, m + 1);

		return (uint32_t)(m_total_bits - start_bits);
	}

	// Writes v as a variable-length code in chunk_bits-sized chunks, each followed by a
	// continuation bit (1 = more chunks follow). Returns the number of bits written.
	inline uint32_t put_vlc(uint32_t v, uint32_t chunk_bits)
	{
		assert(chunk_bits);

		const uint32_t chunk_size = 1 << chunk_bits;
		const uint32_t chunk_mask = chunk_size - 1;

		uint32_t total_bits = 0;

		for ( ; ; )
		{
			uint32_t next_v = v >> chunk_bits;

			// Low chunk_bits bits of v, plus the continuation flag as the top bit.
			total_bits += put_bits((v & chunk_mask) | (next_v ? chunk_size : 0), chunk_bits + 1);
			if (!next_v)
				break;

			v = next_v;
		}

		return total_bits;
	}

	// Serializes a Huffman table's code lengths into the stream. Returns the bits written.
	uint32_t emit_huffman_table(const huffman_encoding_table &tab);

	// Appends another coder's bit stream (whole bytes, then its pending partial bits).
	void append(const bitwise_coder& other)
	{
		for (uint32_t i = 0; i < other.m_bytes.size(); i++)
			put_bits(other.m_bytes[i], 8);

		if (other.m_bit_buffer_size)
			put_bits(other.m_bit_buffer, other.m_bit_buffer_size);
	}

private:
	uint8_vec m_bytes;                          // completed output bytes
	uint32_t m_bit_buffer, m_bit_buffer_size;   // pending partial byte (< 8 bits) and its bit count
	uint64_t m_total_bits;                      // total bits written, including flush padding

	inline void append_byte(uint8_t c)
	{
		//m_bytes.resize(m_bytes.size() + 1);
		//m_bytes.back() = c;

		m_bytes.push_back(c);
	}

	// Helpers for emit_huffman_table's run-length encoding of code lengths.
	static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len);
	static void end_zero_run(uint16_vec &syms, uint32_t &run_size);
};
2630
2631
// Huffman codes groups of small fixed-width symbols: total_syms_per_group symbols of
// bits_per_sym bits each are packed into one composite value, and the composite values are
// Huffman coded. Usage: emit() all symbols, start_encoding(), then replay the same symbol
// sequence with emit_next_sym() to write the codes.
class huff2D
{
public:
	huff2D() { }
	huff2D(uint32_t bits_per_sym, uint32_t total_syms_per_group) { init(bits_per_sym, total_syms_per_group); }

	inline const histogram &get_histogram() const { return m_histogram; }
	inline const huffman_encoding_table &get_encoding_table() const { return m_encoding_table; }

	// Configures the packing; the packed group value must fit in 16 bits.
	inline void init(uint32_t bits_per_sym, uint32_t total_syms_per_group)
	{
		assert((bits_per_sym * total_syms_per_group) <= 16 && total_syms_per_group >= 1 && bits_per_sym >= 1);

		m_bits_per_sym = bits_per_sym;
		m_total_syms_per_group = total_syms_per_group;
		m_cur_sym_bits = 0;
		m_cur_num_syms = 0;
		m_decode_syms_remaining = 0;
		m_next_decoder_group_index = 0;

		// One histogram slot per possible packed-group value.
		m_histogram.init(1 << (bits_per_sym * total_syms_per_group));
	}

	// Resets the accumulated groups and encode/emit state (keeps the configuration).
	inline void clear()
	{
		m_group_bits.clear();

		m_cur_sym_bits = 0;
		m_cur_num_syms = 0;
		m_decode_syms_remaining = 0;
		m_next_decoder_group_index = 0;
	}

	// Accumulates one symbol into the current group; a full group is flushed automatically.
	inline void emit(uint32_t sym)
	{
		m_cur_sym_bits |= (sym << (m_cur_num_syms * m_bits_per_sym));
		m_cur_num_syms++;

		if (m_cur_num_syms == m_total_syms_per_group)
			flush();
	}

	// Records the current (possibly partial) group and starts a new one.
	inline void flush()
	{
		if (m_cur_num_syms)
		{
			m_group_bits.push_back(m_cur_sym_bits);
			m_histogram.inc(m_cur_sym_bits);

			m_cur_sym_bits = 0;
			m_cur_num_syms = 0;
		}
	}

	// Flushes any partial group and builds the Huffman table from the group histogram.
	// Returns false if table construction fails.
	inline bool start_encoding(uint32_t code_size_limit = 16)
	{
		flush();

		if (!m_encoding_table.init(m_histogram, code_size_limit))
			return false;

		m_decode_syms_remaining = 0;
		m_next_decoder_group_index = 0;

		return true;
	}

	// Called once per original symbol, in the same order as emit(): writes the group's
	// Huffman code on the first symbol of each group, nothing on the rest.
	// Returns the number of bits written (0 except at group boundaries).
	inline uint32_t emit_next_sym(bitwise_coder &c)
	{
		uint32_t bits = 0;

		if (!m_decode_syms_remaining)
		{
			bits = c.put_code(m_group_bits[m_next_decoder_group_index++], m_encoding_table);
			m_decode_syms_remaining = m_total_syms_per_group;
		}

		m_decode_syms_remaining--;
		return bits;
	}

	// Forces the next emit_next_sym() to start a fresh group (e.g. after a partial group).
	inline void emit_flush()
	{
		m_decode_syms_remaining = 0;
	}

private:
	uint_vec m_group_bits;                     // packed group values, in emit order
	huffman_encoding_table m_encoding_table;
	histogram m_histogram;
	uint32_t m_bits_per_sym, m_total_syms_per_group, m_cur_sym_bits, m_cur_num_syms, m_next_decoder_group_index, m_decode_syms_remaining;
};
2723
2724
// Randomized self-test of the Huffman coding routines; returns true on success.
bool huffman_test(int rand_seed);
2725
2726
// VQ index reordering
2727
2728
// Reorders palette/codebook entries to improve compressibility of the index stream,
// using index co-occurrence statistics (and optionally an entry-distance function).
// After init(), get_remap_table() maps old symbol indices to new ones.
class palette_index_reorderer
{
public:
	palette_index_reorderer()
	{
	}

	// Frees all working storage and the remap table.
	void clear()
	{
		clear_vector(m_hist);
		clear_vector(m_total_count_to_picked);
		clear_vector(m_entries_picked);
		clear_vector(m_entries_to_do);
		clear_vector(m_remap_table);
	}

	// returns [0,1] distance of entry i to entry j
	typedef float(*pEntry_dist_func)(uint32_t i, uint32_t j, void *pCtx);

	// Computes the reordering from num_indices palette indices (pIndices) over num_syms symbols.
	// pDist_func may be null; dist_func_weight blends the distance term into the placement cost.
	void init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);

	// Table remaps old to new symbol indices
	inline const uint_vec &get_remap_table() const { return m_remap_table; }

private:
	uint_vec m_hist, m_total_count_to_picked, m_entries_picked, m_entries_to_do, m_remap_table;

	// Co-occurrence counts are stored in the upper triangle only (i <= j), hence the swap on read.
	inline uint32_t get_hist(int i, int j, int n) const { return (i > j) ? m_hist[j * n + i] : m_hist[i * n + j]; }
	inline void inc_hist(int i, int j, int n) { if ((i != j) && (i < j) && (i != -1) && (j != -1)) { assert(((uint32_t)i < (uint32_t)n) && ((uint32_t)j < (uint32_t)n)); m_hist[i * n + j]++; } }

	// Internal phases of init(); see the .cpp for details.
	void prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices);
	void find_initial(uint32_t num_syms);
	void find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
	float pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
};
2763
2764
// Simple 32-bit 2D image class
2765
2766
// Simple 8-bit/component RGBA 2D image. Pixels are stored row-major in a single vector;
// m_pitch (in pixels) may exceed m_width. Most mutators return *this for chaining.
class image
{
public:
	image() :
		m_width(0), m_height(0), m_pitch(0)
	{
	}

	image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) :
		m_width(0), m_height(0), m_pitch(0)
	{
		resize(w, h, p);
	}

	// Constructs from a packed 1-4 component 8-bit source image (see init()).
	image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) :
		m_width(0), m_height(0), m_pitch(0)
	{
		init(pImage, width, height, comps);
	}

	image(const image &other) :
		m_width(0), m_height(0), m_pitch(0)
	{
		*this = other;
	}

	image(image&& other) :
		m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),
		m_pixels(std::move(other.m_pixels))
	{
		other.m_width = 0;
		other.m_height = 0;
		other.m_pitch = 0;
	}

	image& operator= (image&& rhs)
	{
		if (this != &rhs)
		{
			m_width = rhs.m_width;
			m_height = rhs.m_height;
			m_pitch = rhs.m_pitch;
			m_pixels = std::move(rhs.m_pixels);

			rhs.m_width = 0;
			rhs.m_height = 0;
			rhs.m_pitch = 0;
		}
		return *this;
	}

	image &swap(image &other)
	{
		std::swap(m_width, other.m_width);
		std::swap(m_height, other.m_height);
		std::swap(m_pitch, other.m_pitch);
		m_pixels.swap(other.m_pixels);
		return *this;
	}

	image &operator= (const image &rhs)
	{
		if (this != &rhs)
		{
			m_width = rhs.m_width;
			m_height = rhs.m_height;
			m_pitch = rhs.m_pitch;
			m_pixels = rhs.m_pixels;
		}
		return *this;
	}

	// Resets to an empty 0x0 image, freeing pixel storage.
	image &clear()
	{
		m_width = 0;
		m_height = 0;
		m_pitch = 0;
		clear_vector(m_pixels);
		return *this;
	}

	// Resizes this image to match another image's dimensions.
	image& match_dimensions(const image& other)
	{
		resize(other.get_width(), other.get_height());
		return *this;
	}

	// Resize; existing pixels are preserved (top-left anchored), new areas get "background".
	image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color)
	{
		return crop(w, h, p, background);
	}

	// Sets every pixel (including any pitch padding) to c.
	image &set_all(const color_rgba &c)
	{
		for (uint32_t i = 0; i < m_pixels.size(); i++)
			m_pixels[i] = c;
		return *this;
	}

	// Initializes from a tightly packed 8-bit image with comps components per pixel:
	// 1 = grayscale (alpha=255), 2 = grayscale+alpha, 3 = RGB (alpha=255), 4 = RGBA.
	void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps)
	{
		assert(comps >= 1 && comps <= 4);

		resize(width, height);

		for (uint32_t y = 0; y < height; y++)
		{
			for (uint32_t x = 0; x < width; x++)
			{
				const uint8_t *pSrc = &pImage[(x + y * width) * comps];
				color_rgba &dst = (*this)(x, y);

				if (comps == 1)
				{
					dst.r = pSrc[0];
					dst.g = pSrc[0];
					dst.b = pSrc[0];
					dst.a = 255;
				}
				else if (comps == 2)
				{
					dst.r = pSrc[0];
					dst.g = pSrc[0];
					dst.b = pSrc[0];
					dst.a = pSrc[1];
				}
				else
				{
					dst.r = pSrc[0];
					dst.g = pSrc[1];
					dst.b = pSrc[2];
					if (comps == 4)
						dst.a = pSrc[3];
					else
						dst.a = 255;
				}
			}
		}
	}

	// Fills a w x h rectangle at (x,y) with c; pixels outside the image are skipped.
	image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c)
	{
		for (uint32_t iy = 0; iy < h; iy++)
			for (uint32_t ix = 0; ix < w; ix++)
				set_clipped(x + ix, y + iy, c);
		return *this;
	}

	// Like fill_box(), but only writes the alpha channel.
	image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c)
	{
		for (uint32_t iy = 0; iy < h; iy++)
			for (uint32_t ix = 0; ix < w; ix++)
				set_clipped_alpha(x + ix, y + iy, c);
		return *this;
	}

	// Crops/expands to w x h; any newly exposed area is filled by replicating the old edge pixels.
	image &crop_dup_borders(uint32_t w, uint32_t h)
	{
		const uint32_t orig_w = m_width, orig_h = m_height;

		crop(w, h);

		if (orig_w && orig_h)
		{
			if (m_width > orig_w)
			{
				for (uint32_t x = orig_w; x < m_width; x++)
					for (uint32_t y = 0; y < m_height; y++)
						set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
			}

			if (m_height > orig_h)
			{
				for (uint32_t y = orig_h; y < m_height; y++)
					for (uint32_t x = 0; x < m_width; x++)
						set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
			}
		}
		return *this;
	}

	// pPixels MUST have been allocated using malloc() (basisu::vector will eventually use free() on the pointer).
	// Takes ownership of an externally allocated pixel buffer without copying.
	image& grant_ownership(color_rgba* pPixels, uint32_t w, uint32_t h, uint32_t p = UINT32_MAX)
	{
		if (p == UINT32_MAX)
			p = w;

		clear();

		if ((!p) || (!w) || (!h))
			return *this;

		m_pixels.grant_ownership(pPixels, p * h, p * h);

		m_width = w;
		m_height = h;
		m_pitch = p;

		return *this;
	}

	// Resizes to w x h with pitch p (UINT32_MAX means p = w). Existing pixels are preserved
	// top-left anchored; new areas get "background" (only when init_image is true).
	// No-op if the dimensions and pitch already match.
	image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color, bool init_image = true)
	{
		if (p == UINT32_MAX)
			p = w;

		if ((w == m_width) && (m_height == h) && (m_pitch == p))
			return *this;

		if ((!w) || (!h) || (!p))
		{
			clear();
			return *this;
		}

		// Steal the old pixel buffer so the copy below reads from it while m_pixels is rebuilt.
		color_rgba_vec cur_state;
		cur_state.swap(m_pixels);

		m_pixels.resize(p * h);

		if (init_image)
		{
			if (m_width || m_height)
			{
				for (uint32_t y = 0; y < h; y++)
				{
					for (uint32_t x = 0; x < w; x++)
					{
						if ((x < m_width) && (y < m_height))
							m_pixels[x + y * p] = cur_state[x + y * m_pitch];
						else
							m_pixels[x + y * p] = background;
					}
				}
			}
			else
			{
				m_pixels.set_all(background);
			}
		}

		m_width = w;
		m_height = h;
		m_pitch = p;

		return *this;
	}

	// Unchecked (assert-only) pixel access.
	inline const color_rgba &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
	inline color_rgba &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }

	// Access with coordinates clamped to the image bounds.
	inline const color_rgba &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
	inline color_rgba &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }

	// Access with per-axis wrap (positive modulo) or clamp addressing.
	inline const color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
	{
		x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
		y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
		return m_pixels[x + y * m_pitch];
	}

	inline color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
	{
		x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
		y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
		return m_pixels[x + y * m_pitch];
	}

	// Writes c at (x,y) if inside the image; out-of-bounds writes are silently dropped.
	inline image &set_clipped(int x, int y, const color_rgba &c)
	{
		if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
			(*this)(x, y) = c;
		return *this;
	}

	// Like set_clipped(), but only writes the alpha component.
	inline image& set_clipped_alpha(int x, int y, const color_rgba& c)
	{
		if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
			(*this)(x, y).m_comps[3] = c.m_comps[3];
		return *this;
	}

	// Very straightforward blit with full clipping. Not fast, but it works.
	image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
	{
		for (int y = 0; y < src_h; y++)
		{
			const int sy = src_y + y;
			if (sy < 0)
				continue;
			else if (sy >= (int)src.get_height())
				break;

			for (int x = 0; x < src_w; x++)
			{
				const int sx = src_x + x;
				if (sx < 0)
					continue;
				else if (sx >= (int)src.get_width())
					break;

				set_clipped(dst_x + x, dst_y + y, src(sx, sy));
			}
		}

		return *this;
	}

	// Copies a w x h block at (src_x,src_y) into pDst (row-major, tightly packed),
	// clamp-addressing any samples that fall outside the image.
	const image &extract_block_clamped(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
	{
		if (((src_x + w) > m_width) || ((src_y + h) > m_height))
		{
			// Slower clamping case
			for (uint32_t y = 0; y < h; y++)
				for (uint32_t x = 0; x < w; x++)
					*pDst++ = get_clamped(src_x + x, src_y + y);
		}
		else
		{
			// Fully interior block: copy whole rows at a time.
			const color_rgba* pSrc = &m_pixels[src_x + src_y * m_pitch];

			for (uint32_t y = 0; y < h; y++)
			{
				memcpy(pDst, pSrc, w * sizeof(color_rgba));
				pSrc += m_pitch;
				pDst += w;
			}
		}

		return *this;
	}

	// Writes a w x h block from pSrc (row-major, tightly packed) at (dst_x,dst_y), clipped.
	image &set_block_clipped(const color_rgba *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
	{
		for (uint32_t y = 0; y < h; y++)
			for (uint32_t x = 0; x < w; x++)
				set_clipped(dst_x + x, dst_y + y, *pSrc++);
		return *this;
	}

	inline bool is_valid() const { return m_width > 0; }

	inline uint32_t get_width() const { return m_width; }
	inline uint32_t get_height() const { return m_height; }
	inline uint32_t get_pitch() const { return m_pitch; }
	inline uint32_t get_total_pixels() const { return m_width * m_height; }

	// Dimensions in w x h blocks, rounding up partial blocks.
	inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
	inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
	inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }

	inline const color_rgba_vec &get_pixels() const { return m_pixels; }
	inline color_rgba_vec &get_pixels() { return m_pixels; }

	inline const color_rgba *get_ptr() const { return &m_pixels[0]; }
	inline color_rgba *get_ptr() { return &m_pixels[0]; }

	// Returns true if any pixel's "channel" component is < 255 (default: alpha).
	bool has_alpha(uint32_t channel = 3) const
	{
		for (uint32_t y = 0; y < m_height; ++y)
			for (uint32_t x = 0; x < m_width; ++x)
				if ((*this)(x, y)[channel] < 255)
					return true;

		return false;
	}

	// Sets every pixel's alpha to a.
	image &set_alpha(uint8_t a)
	{
		for (uint32_t y = 0; y < m_height; ++y)
			for (uint32_t x = 0; x < m_width; ++x)
				(*this)(x, y).a = a;
		return *this;
	}

	// Flips the image vertically in place.
	image &flip_y()
	{
		for (uint32_t y = 0; y < m_height / 2; ++y)
			for (uint32_t x = 0; x < m_width; ++x)
				std::swap((*this)(x, y), (*this)(x, m_height - 1 - y));
		return *this;
	}

	// TODO: There are many ways to do this, not sure this is the best way.
	// Renormalizes pixels interpreted as signed unit normals packed into [0,255]
	// ((128,128,128) = zero vector and is left untouched). Degenerate near-zero normals
	// are reset to (128,128,128); off-unit-length normals are rescaled and repacked.
	image &renormalize_normal_map()
	{
		for (uint32_t y = 0; y < m_height; y++)
		{
			for (uint32_t x = 0; x < m_width; x++)
			{
				color_rgba &c = (*this)(x, y);
				if ((c.r == 128) && (c.g == 128) && (c.b == 128))
					continue;

				// Unpack [0,255] -> [-1,1].
				vec3F v(c.r, c.g, c.b);
				v = (v * (2.0f / 255.0f)) - vec3F(1.0f);
				v.clamp(-1.0f, 1.0f);

				float length = v.length();
				const float cValidThresh = .077f;
				if (length < cValidThresh)
				{
					// Too short to be a meaningful normal: reset to the zero vector.
					c.set(128, 128, 128, c.a);
				}
				else if (fabs(length - 1.0f) > cValidThresh)
				{
					if (length)
						v /= length;

					// Repack the renormalized vector to [0,255].
					for (uint32_t i = 0; i < 3; i++)
						c[i] = static_cast<uint8_t>(clamp<float>(floor((v[i] + 1.0f) * 255.0f * .5f + .5f), 0.0f, 255.0f));

					// Snap pure-Z normals to exactly +/-Z.
					if ((c.g == 128) && (c.r == 128))
					{
						if (c.b < 128)
							c.b = 0;
						else
							c.b = 255;
					}
				}
			}
		}
		return *this;
	}

	// Swaps the red and blue channels of every pixel (RGBA <-> BGRA).
	void swap_rb()
	{
		for (auto& v : m_pixels)
			std::swap(v.r, v.b);
	}

	// Draws printf-style text using the built-in 8x8 debug font.
	void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);

	// Bilinearly samples the image at (x,y); pixel centers are at (.5,.5), clamp addressing.
	// Returns the filtered color as floats (unrounded).
	vec4F get_filtered_vec4F(float x, float y) const
	{
		x -= .5f;
		y -= .5f;

		int ix = (int)floorf(x);
		int iy = (int)floorf(y);
		float wx = x - ix;
		float wy = y - iy;

		// Four neighboring texels, clamp addressed.
		color_rgba a(get_clamped(ix, iy));
		color_rgba b(get_clamped(ix + 1, iy));
		color_rgba c(get_clamped(ix, iy + 1));
		color_rgba d(get_clamped(ix + 1, iy + 1));

		vec4F result;

		for (uint32_t i = 0; i < 4; i++)
		{
			const float top = lerp<float>((float)a[i], (float)b[i], wx);
			const float bot = lerp<float>((float)c[i], (float)d[i], wx);
			const float m = lerp<float>((float)top, (float)bot, wy);

			result[i] = m;
		}

		return result;
	}

	// (x,y) - Continuous coordinates, where pixel centers are at (.5,.5), valid image coords are [0,width] and [0,height]. Clamp addressing.
	color_rgba get_filtered(float x, float y) const
	{
		const vec4F fresult(get_filtered_vec4F(x, y));

		color_rgba result;

		// Round each filtered component to the nearest integer and clamp to [0,255].
		for (uint32_t i = 0; i < 4; i++)
			result[i] = (uint8_t)clamp<int>((int)(fresult[i] + .5f), 0, 255);

		return result;
	}

private:
	uint32_t m_width, m_height, m_pitch; // all in pixels
	color_rgba_vec m_pixels;
};
3245
3246
// Float images

// Pixel storage type for imagef: one vec4F (RGBA, float per component) per pixel.
typedef basisu::vector<vec4F> vec4F_vec;
3250
class imagef
3251
{
3252
public:
3253
// Constructs an empty 0x0 float image.
imagef() :
	m_width(0), m_height(0), m_pitch(0)
{
}

imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) :
	m_width(0), m_height(0), m_pitch(0)
{
	resize(w, h, p);
}

imagef(const imagef &other) :
	m_width(0), m_height(0), m_pitch(0)
{
	*this = other;
}

imagef(imagef&& other) :
	m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),
	m_pixels(std::move(other.m_pixels))
{
	other.m_width = 0;
	other.m_height = 0;
	other.m_pitch = 0;
}

imagef& operator= (imagef&& rhs)
{
	if (this != &rhs)
	{
		m_width = rhs.m_width;
		m_height = rhs.m_height;
		m_pitch = rhs.m_pitch;
		m_pixels = std::move(rhs.m_pixels);

		rhs.m_width = 0;
		rhs.m_height = 0;
		rhs.m_pitch = 0;
	}
	return *this;
}

imagef &swap(imagef &other)
{
	std::swap(m_width, other.m_width);
	std::swap(m_height, other.m_height);
	std::swap(m_pitch, other.m_pitch);
	m_pixels.swap(other.m_pixels);
	return *this;
}

imagef &operator= (const imagef &rhs)
{
	if (this != &rhs)
	{
		m_width = rhs.m_width;
		m_height = rhs.m_height;
		m_pitch = rhs.m_pitch;
		m_pixels = rhs.m_pixels;
	}
	return *this;
}

// Resets to an empty 0x0 image, freeing pixel storage.
imagef &clear()
{
	m_width = 0;
	m_height = 0;
	m_pitch = 0;
	clear_vector(m_pixels);
	return *this;
}
3324
3325
// Initializes from an 8-bit image: each component becomes (value * scale + bias) per channel.
imagef &set(const image &src, const vec4F &scale = vec4F(1), const vec4F &bias = vec4F(0))
{
	const uint32_t width = src.get_width();
	const uint32_t height = src.get_height();

	resize(width, height);

	for (int y = 0; y < (int)height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			const color_rgba &src_pixel = src(x, y);
			(*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]);
		}
	}

	return *this;
}

// Resizes this image to match another float image's dimensions.
imagef& match_dimensions(const imagef& other)
{
	resize(other.get_width(), other.get_height());
	return *this;
}

// Resize to another image's dimensions; new areas are filled with "background".
imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
{
	return resize(other.get_width(), other.get_height(), p, background);
}

// Resize; existing pixels are preserved (top-left anchored), new areas get "background".
imagef &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
{
	return crop(w, h, p, background);
}

// Sets every pixel (including any pitch padding) to c.
imagef &set_all(const vec4F &c)
{
	for (uint32_t i = 0; i < m_pixels.size(); i++)
		m_pixels[i] = c;
	return *this;
}

// Fills a w x h rectangle at (x,y) with c; pixels outside the image are skipped.
imagef &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F &c)
{
	for (uint32_t iy = 0; iy < h; iy++)
		for (uint32_t ix = 0; ix < w; ix++)
			set_clipped(x + ix, y + iy, c);
	return *this;
}
3374
3375
// Resizes to w x h with pitch p (UINT32_MAX means p = w). Existing pixels are preserved
// top-left anchored; newly exposed areas get "background". No-op when dimensions already match.
imagef &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F &background = vec4F(0,0,0,1))
{
	if (p == UINT32_MAX)
		p = w;

	if ((w == m_width) && (m_height == h) && (m_pitch == p))
		return *this;

	if ((!w) || (!h) || (!p))
	{
		clear();
		return *this;
	}

	// Steal the old pixel buffer so the copy below reads from it while m_pixels is rebuilt.
	vec4F_vec cur_state;
	cur_state.swap(m_pixels);

	m_pixels.resize(p * h);

	for (uint32_t y = 0; y < h; y++)
	{
		for (uint32_t x = 0; x < w; x++)
		{
			if ((x < m_width) && (y < m_height))
				m_pixels[x + y * p] = cur_state[x + y * m_pitch];
			else
				m_pixels[x + y * p] = background;
		}
	}

	m_width = w;
	m_height = h;
	m_pitch = p;

	return *this;
}

// Crops/expands to w x h; any newly exposed area is filled by replicating the old edge pixels.
imagef& crop_dup_borders(uint32_t w, uint32_t h)
{
	const uint32_t orig_w = m_width, orig_h = m_height;

	crop(w, h);

	if (orig_w && orig_h)
	{
		if (m_width > orig_w)
		{
			for (uint32_t x = orig_w; x < m_width; x++)
				for (uint32_t y = 0; y < m_height; y++)
					set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
		}

		if (m_height > orig_h)
		{
			for (uint32_t y = orig_h; y < m_height; y++)
				for (uint32_t x = 0; x < m_width; x++)
					set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
		}
	}
	return *this;
}
3436
3437
// Direct pixel accessors. Coordinates must be in bounds (asserted); pixels
// are stored row-major with a row stride of m_pitch.
inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }

// Accessors that clamp out-of-range coordinates to the image edges.
inline const vec4F &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
inline vec4F &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }

// Accessors that, per axis, either wrap (posmod) or clamp the coordinate.
inline const vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
{
	x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
	y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
	return m_pixels[x + y * m_pitch];
}

inline vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
{
	x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
	y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
	return m_pixels[x + y * m_pitch];
}

// Writes pixel (x, y) only if it's inside the image; out-of-range writes are
// silently ignored. (The unsigned casts make negative x/y fail the bounds test.)
inline imagef &set_clipped(int x, int y, const vec4F &c)
{
	if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
		(*this)(x, y) = c;
	return *this;
}
3463
3464
// Very straightforward blit with full clipping. Not fast, but it works.
// Copies the src_w x src_h region of src starting at (src_x, src_y) to this
// image at (dst_x, dst_y). Source reads are clipped against src's bounds
// (continue while still below 0, break once past the far edge); destination
// writes are clipped by set_clipped().
imagef &blit(const imagef &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
{
	for (int y = 0; y < src_h; y++)
	{
		const int sy = src_y + y;
		if (sy < 0)
			continue;
		else if (sy >= (int)src.get_height())
			break;

		for (int x = 0; x < src_w; x++)
		{
			const int sx = src_x + x;
			if (sx < 0)
				continue;
			else if (sx >= (int)src.get_width())
				break;

			set_clipped(dst_x + x, dst_y + y, src(sx, sy));
		}
	}

	return *this;
}
3489
3490
// Copies a w x h block of pixels starting at (src_x, src_y) into pDst in
// row-major order, clamping reads at the image edges.
const imagef &extract_block_clamped(vec4F *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
{
	for (uint32_t row = 0; row < h; row++)
	{
		for (uint32_t col = 0; col < w; col++)
			*pDst++ = get_clamped(src_x + col, src_y + row);
	}
	return *this;
}
3497
3498
// Writes a w x h block of pixels from pSrc (row-major) into this image at
// (dst_x, dst_y); writes outside the image are silently clipped.
imagef &set_block_clipped(const vec4F *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
{
	for (uint32_t row = 0; row < h; row++)
	{
		for (uint32_t col = 0; col < w; col++)
			set_clipped(dst_x + col, dst_y + row, *pSrc++);
	}
	return *this;
}
3505
3506
// An image is considered valid once it has a non-zero width.
inline bool is_valid() const { return m_width > 0; }

inline uint32_t get_width() const { return m_width; }
inline uint32_t get_height() const { return m_height; }
inline uint32_t get_pitch() const { return m_pitch; }
inline uint64_t get_total_pixels() const { return (uint64_t)m_width * m_height; }

// Number of w x h (or h-tall) blocks needed to cover the image, rounding up.
inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }

// Raw access to the backing pixel storage (size is pitch * height).
inline const vec4F_vec &get_pixels() const { return m_pixels; }
inline vec4F_vec &get_pixels() { return m_pixels; }

inline const vec4F *get_ptr() const { return &m_pixels[0]; }
inline vec4F *get_ptr() { return &m_pixels[0]; }
3522
3523
// Scrubs every pixel component so it's a value ASTC HDR can encode: finite,
// non-negative, not -0, and <= highest_mag. NaN/Inf/-0 components become 0,
// negatives are clamped to 0, and over-large values are clamped to highest_mag.
// Each problem category prints a single warning to stderr (one-shot flags).
// Returns true only if no component needed fixing.
bool clean_astc_hdr_pixels(float highest_mag)
{
	bool status = true;
	// One-shot flags so each diagnostic is emitted at most once.
	bool nan_msg = false;
	bool inf_msg = false;
	bool neg_zero_msg = false;
	bool neg_msg = false;
	bool clamp_msg = false;

	for (uint32_t iy = 0; iy < m_height; iy++)
	{
		for (uint32_t ix = 0; ix < m_width; ix++)
		{
			vec4F& c = (*this)(ix, iy);

			for (uint32_t s = 0; s < 4; s++)
			{
				float &p = c[s];
				// Type-pun to the raw bits to detect negative zero (0x80000000).
				union { float f; uint32_t u; } x; x.f = p;

				if ((std::isnan(p)) || (std::isinf(p)) || (x.u == 0x80000000))
				{
					if (std::isnan(p))
					{
						if (!nan_msg)
						{
							fprintf(stderr, "One or more input pixels was NaN, setting to 0.\n");
							nan_msg = true;
						}
					}

					if (std::isinf(p))
					{
						if (!inf_msg)
						{
							fprintf(stderr, "One or more input pixels was INF, setting to 0.\n");
							inf_msg = true;
						}
					}

					if (x.u == 0x80000000)
					{
						if (!neg_zero_msg)
						{
							fprintf(stderr, "One or more input pixels was -0, setting them to 0.\n");
							neg_zero_msg = true;
						}
					}

					// All abnormal values collapse to 0.
					p = 0.0f;
					status = false;
				}
				else
				{
					//const float o = p;
					// ASTC HDR can't represent signed values, so clamp negatives to 0.
					if (p < 0.0f)
					{
						p = 0.0f;

						if (!neg_msg)
						{
							fprintf(stderr, "One or more input pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n");
							neg_msg = true;
						}

						status = false;
					}

					// Clamp to the caller-specified maximum encodable magnitude.
					if (p > highest_mag)
					{
						p = highest_mag;

						if (!clamp_msg)
						{
							fprintf(stderr, "One or more input pixels had to be clamped to %f.\n", highest_mag);
							clamp_msg = true;
						}

						status = false;
					}
				}
			}
		}
	}

	return status;
}
3610
3611
// Mirrors the image vertically in place by exchanging row y with row
// (height - 1 - y); only the top half is iterated.
imagef& flip_y()
{
	const uint32_t half = m_height / 2;
	for (uint32_t row = 0; row < half; ++row)
	{
		const uint32_t mirror = m_height - 1 - row;
		for (uint32_t col = 0; col < m_width; ++col)
			std::swap((*this)(col, row), (*this)(col, mirror));
	}

	return *this;
}
3619
3620
bool has_alpha(uint32_t channel = 3) const
3621
{
3622
for (uint32_t y = 0; y < m_height; ++y)
3623
for (uint32_t x = 0; x < m_width; ++x)
3624
if ((*this)(x, y)[channel] != 1.0f)
3625
return true;
3626
3627
return false;
3628
}
3629
3630
// Bilinearly samples the image at continuous coordinates (x, y), with reads
// clamped at the edges. (x, y) are in pixel units; the .5 offset moves them
// to texel-center coordinates before interpolating.
vec4F get_filtered_vec4F(float x, float y) const
{
	x -= .5f;
	y -= .5f;

	const int x0 = (int)floorf(x);
	const int y0 = (int)floorf(y);
	const float fx = x - x0;
	const float fy = y - y0;

	// Fetch the 2x2 neighborhood (edge-clamped).
	const vec4F p00(get_clamped(x0, y0));
	const vec4F p10(get_clamped(x0 + 1, y0));
	const vec4F p01(get_clamped(x0, y0 + 1));
	const vec4F p11(get_clamped(x0 + 1, y0 + 1));

	vec4F result;

	// Interpolate horizontally on both rows, then vertically, per component.
	for (uint32_t c = 0; c < 4; c++)
	{
		const float top = lerp<float>(p00[c], p10[c], fx);
		const float bot = lerp<float>(p01[c], p11[c], fx);
		result[c] = lerp<float>(top, bot, fy);
	}

	return result;
}
3658
3659
private:
3660
uint32_t m_width, m_height, m_pitch; // all in pixels
3661
vec4F_vec m_pixels;
3662
};
3663
3664
// REC 709 coefficients
const float REC_709_R = 0.212656f, REC_709_G = 0.715158f, REC_709_B = 0.072186f;

// Returns the Rec. 709 luminance of c (weighted sum of R, G, B; alpha ignored).
inline float get_luminance(const vec4F &c)
{
	return c[0] * REC_709_R + c[1] * REC_709_G + c[2] * REC_709_B;
}

// Scalar sRGB transfer-function conversions (defined in the .cpp).
float linear_to_srgb(float l);
float srgb_to_linear(float s);
3674
3675
// Fast linear->sRGB (8-bit) conversion using a quantized lookup table plus a
// per-code threshold table to correct rounding at bucket boundaries.
class fast_linear_to_srgb
{
public:
	fast_linear_to_srgb()
	{
		init();
	}

	void init()
	{
		// Maps quantized linear values in [0,1] to 8-bit sRGB codes.
		for (int i = 0; i < LINEAR_TO_SRGB_TABLE_SIZE; ++i)
		{
			float l = (float)i * (1.0f / (LINEAR_TO_SRGB_TABLE_SIZE - 1));
			m_linear_to_srgb_table[i] = (uint8_t)basisu::fast_floorf_int(255.0f * basisu::linear_to_srgb(l));
		}

		float srgb_to_linear[256];
		for (int i = 0; i < 256; i++)
			srgb_to_linear[i] = basisu::srgb_to_linear((float)i / 255.0f);

		// Threshold for code i: the linear-space midpoint between codes i and
		// i+1. convert() uses it to decide whether to round up by one code.
		for (int i = 0; i < 256; i++)
			m_srgb_to_linear_thresh[i] = (srgb_to_linear[i] + srgb_to_linear[basisu::minimum<int>(i + 1, 255)]) * .5f;
	}

	// Converts linear l (must be in [0,1], asserted) to the nearest 8-bit sRGB code.
	inline uint8_t convert(float l) const
	{
		assert((l >= 0.0f) && (l <= 1.0f));
		int j = basisu::fast_roundf_int((LINEAR_TO_SRGB_TABLE_SIZE - 1) * l);

		assert((j >= 0) && (j < LINEAR_TO_SRGB_TABLE_SIZE));
		int b = m_linear_to_srgb_table[j];

		// Bump up one code if l lies past code b's linear-space threshold.
		b += (l > m_srgb_to_linear_thresh[b]);

		return (uint8_t)b;
	}

private:
	static constexpr int LINEAR_TO_SRGB_TABLE_SIZE = 2048;
	uint8_t m_linear_to_srgb_table[LINEAR_TO_SRGB_TABLE_SIZE];

	float m_srgb_to_linear_thresh[256];
};
3718
3719
extern fast_linear_to_srgb g_fast_linear_to_srgb;
3720
3721
// Image metrics

// Holds and computes difference statistics (max error, mean, RMS, PSNR, etc.)
// between two images. The calc* methods (defined in the .cpp) fill the fields.
class image_metrics
{
public:
	// TODO: Add ssim
	double m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim;
	// m_has_neg: any negative component seen; m_hf_mag_overflow: any value too
	// large for a half float; m_any_abnormal: any NaN/Inf seen.
	bool m_has_neg, m_hf_mag_overflow, m_any_abnormal;

	image_metrics()
	{
		clear();
	}

	void clear()
	{
		m_max = 0;
		m_mean = 0;
		m_mean_squared = 0;
		m_rms = 0;
		m_psnr = 0;
		m_ssim = 0;
		m_has_neg = false;
		m_hf_mag_overflow = false;
		m_any_abnormal = false;
	}

	// Prints a one-line summary; print_hp() adds precision and the HDR flags.
	void print(const char *pPrefix = nullptr) { printf("%sMax: %3.3f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr); }
	void print_hp(const char* pPrefix = nullptr) { printf("%sMax: %3.6f Mean: %3.6f RMS: %3.6f PSNR: %2.6f dB, Any Neg: %u, Half float overflow: %u, Any NaN/Inf: %u\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr, m_has_neg, m_hf_mag_overflow, m_any_abnormal); }

	// first_chan/total_chans select the component range to compare
	// (total_chans == 0 presumably means a default/luma comparison -- confirm in the .cpp).
	void calc(const imagef& a, const imagef& b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool log = false);
	void calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);
	void calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);
	void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false);
};
3756
3757
void print_image_metrics(const image& a, const image& b);

// Image saving/loading/resampling

// LDR loaders; each returns false on failure. The std::string overloads are
// thin c_str() forwarders.
bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
bool load_png(const char* pFilename, image& img);
inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }

bool load_tga(const char* pFilename, image& img);
inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }

bool load_qoi(const char* pFilename, image& img);

bool load_jpg(const char *pFilename, image& img);
bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img);
inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }

// Currently loads .PNG, .TGA, or .JPG
bool load_image(const char* pFilename, image& img);
inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }

bool is_image_filename_hdr(const char* pFilename);

// Supports .HDR and most (but not all) .EXR's (see TinyEXR).
// ldr_srgb_to_linear converts LDR sources to linear light; linear_nit_multiplier
// scales linear values; ldr_black_bias is added per the implementation.
bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);

inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f)
{
	return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias);
}

// Identifies the in-memory format passed to the buffer-based load_image_hdr().
enum class hdr_image_type
{
	cHITRGBAHalfFloat = 0,
	cHITRGBAFloat = 1,
	cHITPNGImage = 2,
	cHITEXRImage = 3,
	cHITHDRImage = 4,
	cHITJPGImage = 5
};

bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);

// Low-level TGA readers; return a malloc'd pixel buffer (caller frees) or
// nullptr on failure -- confirm ownership convention in the .cpp.
uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);
3802
3803
// Parsed header fields from a Radiance RGBE (.HDR) file.
struct rgbe_header_info
{
	std::string m_program;

	// Note no validation is done, either gamma or exposure may be 0.
	double m_gamma;
	bool m_has_gamma;

	double m_exposure; // watts/steradian/m^2.
	bool m_has_exposure;

	// Resets to defaults: gamma/exposure 1.0 and both "has" flags false.
	void clear()
	{
		m_program.clear();
		m_gamma = 1.0f;
		m_has_gamma = false;
		m_exposure = 1.0f;
		m_has_exposure = false;
	}
};
3823
3824
// Radiance RGBE (.HDR) read/write; hdr_info carries the parsed/emitted header.
bool read_rgbe(const uint8_vec& filedata, imagef& img, rgbe_header_info& hdr_info);
bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info &hdr_info);

bool write_rgbe(uint8_vec& file_data, imagef& img, rgbe_header_info& hdr_info);
bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info);

// OpenEXR read (via TinyEXR); n_chans receives the source channel count.
bool read_exr(const char* pFilename, imagef& img, int& n_chans);
bool read_exr(const void* pMem, size_t mem_size, imagef& img);

// Bit flags for write_exr().
enum
{
	WRITE_EXR_LINEAR_HINT = 1, // hint for lossy comp. methods: exr_perceptual_treatment_t, logarithmic or linear, defaults to logarithmic
	WRITE_EXR_STORE_FLOATS = 2, // use 32-bit floats, otherwise it uses half floats
	WRITE_EXR_NO_COMPRESSION = 4 // no compression, otherwise it uses ZIP compression (16 scanlines per block)
};

// Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images.
bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags);

// Bit flags for save_png().
enum
{
	cImageSaveGrayscale = 1,
	cImageSaveIgnoreAlpha = 2
};

// grayscale_comp selects which component to write when saving grayscale.
bool save_png(const char* pFilename, const image& img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0);
inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); }

// Whole-file helpers; all return false on I/O failure.
bool read_file_to_vec(const char* pFilename, uint8_vec& data);
bool read_file_to_data(const char* pFilename, void *pData, size_t len);

bool write_data_to_file(const char* pFilename, const void* pData, size_t len);

// Empty vectors are written as zero-length files (can't take &v[0] of an empty vector).
inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) { return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "", 0); }

// Filtered resampling (LDR and float variants). pFilter names the kernel;
// wrapping selects edge wrap vs. clamp; first_comp/num_comps limit the
// channels processed.
bool image_resample(const image &src, image &dst, bool srgb = false,
	const char *pFilter = "lanczos4", float filter_scale = 1.0f,
	bool wrapping = false,
	uint32_t first_comp = 0, uint32_t num_comps = 4);

bool image_resample(const imagef& src, imagef& dst,
	const char* pFilter = "lanczos4", float filter_scale = 1.0f,
	bool wrapping = false,
	uint32_t first_comp = 0, uint32_t num_comps = 4);
3868
3869
// Timing

typedef uint64_t timer_ticks;

// Simple start/stop wall-clock timer built on a platform tick counter
// (see the .cpp for the tick source). Call interval_timer::init() once
// before use to capture the tick frequency.
class interval_timer
{
public:
	interval_timer();

	void start();
	void stop();

	// Elapsed time between start() and stop() (or now, if still running --
	// confirm exact semantics in the .cpp).
	double get_elapsed_secs() const;
	inline double get_elapsed_ms() const { return 1000.0f* get_elapsed_secs(); }

	static void init();
	static inline timer_ticks get_ticks_per_sec() { return g_freq; }
	static timer_ticks get_ticks();
	static double ticks_to_secs(timer_ticks ticks);
	static inline double ticks_to_ms(timer_ticks ticks) { return ticks_to_secs(ticks) * 1000.0f; }

private:
	static timer_ticks g_init_ticks, g_freq;
	static double g_timer_freq;

	timer_ticks m_start_time, m_stop_time;

	bool m_started, m_stopped;
};

// Current time in seconds, derived from the global tick counter.
inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); }
3900
3901
// fopen() wrapper: uses the MSVC-secure fopen_s() on Windows (avoiding the
// C4996 deprecation of fopen) and plain fopen() elsewhere. Returns nullptr
// on failure, like fopen().
inline FILE *fopen_safe(const char *pFilename, const char *pMode)
{
#ifdef _WIN32
	FILE *pFile = nullptr;
	fopen_s(&pFile, pFilename, pMode);
	return pFile;
#else
	return fopen(pFilename, pMode);
#endif
}
3911
3912
// Fills pBuf with size pseudorandom bytes derived deterministically from seed.
void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);

// Fixed 4x4 pixel block dimensions used throughout the encoder.
const uint32_t cPixelBlockWidth = 4;
const uint32_t cPixelBlockHeight = 4;
const uint32_t cPixelBlockTotalPixels = cPixelBlockWidth * cPixelBlockHeight;
3917
3918
// A 4x4 block of LDR pixels, stored row-major ([y][x]).
struct pixel_block
{
	color_rgba m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]

	// (x, y) accessors; coordinates are asserted in range.
	inline const color_rgba& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
	inline color_rgba& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }

	inline const color_rgba* get_ptr() const { return &m_pixels[0][0]; }
	inline color_rgba* get_ptr() { return &m_pixels[0][0]; }

	inline void clear() { clear_obj(*this); }

	// Bytewise equality of the 16 pixels.
	inline bool operator== (const pixel_block& rhs) const
	{
		return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;
	}
};
typedef basisu::vector<pixel_block> pixel_block_vec;
3936
3937
struct pixel_block_hdr
3938
{
3939
vec4F m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]
3940
3941
inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
3942
inline vec4F& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
3943
3944
inline const vec4F* get_ptr() const { return &m_pixels[0][0]; }
3945
inline vec4F* get_ptr() { return &m_pixels[0][0]; }
3946
3947
inline void clear() { clear_obj(*this); }
3948
3949
inline bool operator== (const pixel_block& rhs) const
3950
{
3951
return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;
3952
}
3953
};
3954
typedef basisu::vector<pixel_block_hdr> pixel_block_hdr_vec;
3955
3956
// HDR -> LDR tonemapping helpers (implemented in the .cpp).
void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure, bool add_noise = false, bool per_component = true, bool luma_scaling = false);
bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img);
bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img);

// Intersection
// Tag types used to select special constructors (clear / expand-ready / identity).
enum eClear { cClear = 0 };
enum eInitExpand { cInitExpand = 0 };
enum eIdentity { cIdentity = 0 };
3964
3965
// A parametric ray: origin + direction * t. vector_type supplies the
// dimensionality (vec2F, vec3F, ...).
template<typename vector_type>
class ray
{
public:
	typedef vector_type vector_t;
	typedef typename vector_type::scalar_type scalar_type;

	inline ray() { }
	inline ray(eClear) { clear(); }
	inline ray(const vector_type& origin, const vector_type& direction) : m_origin(origin), m_direction(direction) { }

	inline void clear()
	{
		m_origin.clear();
		m_direction.clear();
	}

	inline const vector_type& get_origin(void) const { return m_origin; }
	inline void set_origin(const vector_type& origin) { m_origin = origin; }

	inline const vector_type& get_direction(void) const { return m_direction; }
	inline void set_direction(const vector_type& direction) { m_direction = direction; }

	// Builds the ray from two points; the direction is normalized, so eval(t)
	// measures t in world units from start.
	inline void set_endpoints(const vector_type& start, const vector_type& end)
	{
		m_origin = start;

		m_direction = end - start;
		m_direction.normalize_in_place();
	}

	// Point on the ray at parameter t.
	inline vector_type eval(scalar_type t) const
	{
		return m_origin + m_direction * t;
	}

private:
	vector_type m_origin;
	vector_type m_direction;
};

typedef ray<vec2F> ray2F;
typedef ray<vec3F> ray3F;
4008
4009
// An axis-aligned interval (bounding box) over vector type T:
// m_bounds[0] = low corner, m_bounds[1] = high corner.
template<typename T>
class vec_interval
{
public:
	enum { N = T::num_elements };
	typedef typename T::scalar_type scalar_type;

	// Degenerate box at v, or an explicit [low, high] box.
	inline vec_interval(const T& v) { m_bounds[0] = v; m_bounds[1] = v; }
	inline vec_interval(const T& low, const T& high) { m_bounds[0] = low; m_bounds[1] = high; }

	inline vec_interval() { }
	inline vec_interval(eClear) { clear(); }
	inline vec_interval(eInitExpand) { init_expand(); }

	inline void clear() { m_bounds[0].clear(); m_bounds[1].clear(); }

	// Inverted (empty) box so that any expand() call establishes real bounds.
	// NOTE(review): set() is called with 3 components, so this member appears
	// to assume a 3-component T -- confirm before instantiating for vec1F/vec2F/vec4F.
	inline void init_expand()
	{
		m_bounds[0].set(1e+30f, 1e+30f, 1e+30f);
		m_bounds[1].set(-1e+30f, -1e+30f, -1e+30f);
	}

	// Grows the box (per component) to include point p.
	inline vec_interval expand(const T& p)
	{
		for (uint32_t c = 0; c < N; c++)
		{
			if (p[c] < m_bounds[0][c])
				m_bounds[0][c] = p[c];

			if (p[c] > m_bounds[1][c])
				m_bounds[1][c] = p[c];
		}

		return *this;
	}

	// Index 0 = low corner, 1 = high corner.
	inline const T& operator[] (uint32_t i) const { assert(i < 2); return m_bounds[i]; }
	inline T& operator[] (uint32_t i) { assert(i < 2); return m_bounds[i]; }

	const T& get_low() const { return m_bounds[0]; }
	T& get_low() { return m_bounds[0]; }

	const T& get_high() const { return m_bounds[1]; }
	T& get_high() { return m_bounds[1]; }

	// Extent of the box along one axis.
	scalar_type get_dim(uint32_t axis) const { return m_bounds[1][axis] - m_bounds[0][axis]; }

	// True if p is inside the box (inclusive on both faces).
	bool contains(const T& p) const
	{
		const T& low = get_low(), high = get_high();

		for (uint32_t i = 0; i < N; i++)
		{
			if (p[i] < low[i])
				return false;

			if (p[i] > high[i])
				return false;
		}
		return true;
	}

private:
	T m_bounds[2];
};
4074
4075
// Common interval instantiations; the aabb aliases are the usual spatial boxes.
typedef vec_interval<vec1F> vec_interval1F;
typedef vec_interval<vec2F> vec_interval2F;
typedef vec_interval<vec3F> vec_interval3F;
typedef vec_interval<vec4F> vec_interval4F;

typedef vec_interval1F aabb1F;
typedef vec_interval2F aabb2F;
typedef vec_interval3F aabb3F;
4083
4084
namespace intersection
{
	// Outcome codes shared by the intersection routines below.
	enum result
	{
		cBackfacing = -1,
		cFailure = 0,
		cSuccess,
		cParallel,
		cInside,
	};
4094
4095
// Returns cInside, cSuccess, or cFailure.
// Algorithm: Graphics Gems 1
// Slab-based ray/AABB intersection. On cSuccess, coord is the hit point and
// t the ray parameter; on cInside, coord is the ray origin and t = 0.
template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>
result ray_aabb(vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box)
{
	enum
	{
		cNumDim = vector_type::num_elements,
		cRight = 0,
		cLeft = 1,
		cMiddle = 2
	};

	// Classify the origin against each slab and pick the candidate entry plane.
	bool inside = true;
	int quadrant[cNumDim];
	scalar_type candidate_plane[cNumDim];

	for (int i = 0; i < cNumDim; i++)
	{
		if (ray.get_origin()[i] < box[0][i])
		{
			quadrant[i] = cLeft;
			candidate_plane[i] = box[0][i];
			inside = false;
		}
		else if (ray.get_origin()[i] > box[1][i])
		{
			quadrant[i] = cRight;
			candidate_plane[i] = box[1][i];
			inside = false;
		}
		else
		{
			quadrant[i] = cMiddle;
		}
	}

	// Origin inside the box on every axis: trivially done.
	if (inside)
	{
		coord = ray.get_origin();
		t = 0.0f;
		return cInside;
	}

	// Distance along the ray to each candidate plane (-1 = not a candidate).
	scalar_type max_t[cNumDim];
	for (int i = 0; i < cNumDim; i++)
	{
		if ((quadrant[i] != cMiddle) && (ray.get_direction()[i] != 0.0f))
			max_t[i] = (candidate_plane[i] - ray.get_origin()[i]) / ray.get_direction()[i];
		else
			max_t[i] = -1.0f;
	}

	// The entry point is on the plane reached last (largest t).
	int which_plane = 0;
	for (int i = 1; i < cNumDim; i++)
		if (max_t[which_plane] < max_t[i])
			which_plane = i;

	// All candidates behind the origin: the ray points away from the box.
	if (max_t[which_plane] < 0.0f)
		return cFailure;

	// Evaluate the hit point; reject it if it misses the box on any other axis.
	for (int i = 0; i < cNumDim; i++)
	{
		if (i != which_plane)
		{
			coord[i] = ray.get_origin()[i] + max_t[which_plane] * ray.get_direction()[i];

			if ((coord[i] < box[0][i]) || (coord[i] > box[1][i]))
				return cFailure;
		}
		else
		{
			coord[i] = candidate_plane[i];
		}

		assert(coord[i] >= box[0][i] && coord[i] <= box[1][i]);
	}

	t = max_t[which_plane];
	return cSuccess;
}
4176
4177
// Variant that also handles a ray starting inside the box: it then intersects
// a reversed ray cast from well outside the box to find the exit point, and
// remaps t back onto the original ray. started_within reports which case ran.
template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>
result ray_aabb(bool& started_within, vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box)
{
	if (!box.contains(ray.get_origin()))
	{
		started_within = false;
		return ray_aabb(coord, t, ray, box);
	}

	started_within = true;

	// A point guaranteed past the far side of the box (1.5x the diagonal).
	// NOTE(review): `typename vector_type::T` looks inconsistent with the
	// scalar_type typedef used elsewhere, and box.diagonal_length() isn't
	// declared on vec_interval above -- confirm this overload's instantiations.
	typename vector_type::T diag_dist = box.diagonal_length() * 1.5f;
	ray_type outside_ray(ray.eval(diag_dist), -ray.get_direction());

	result res(ray_aabb(coord, t, outside_ray, box));
	if (res != cSuccess)
		return res;

	// Convert the reversed-ray parameter back to the original ray's parameter.
	t = basisu::maximum(0.0f, diag_dist - t);
	return cSuccess;
}
4198
4199
} // intersect
4200
4201
// This float->half conversion matches how "F32TO16" works on Intel GPU's.
// Input cannot be negative, Inf or Nan.
inline basist::half_float float_to_half_non_neg_no_nan_inf(float val)
{
	// Pull apart the IEEE 754 single: 23-bit mantissa, 8-bit biased exponent.
	union { float f; int32_t i; uint32_t u; } fi = { val };
	const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF;
	int e = 0, m = 0;

	// Preconditions: sign bit clear, exponent not all-ones (no Inf/NaN).
	assert(((fi.i >> 31) == 0) && (flt_e != 0xFF));

	// not zero or denormal
	if (flt_e != 0)
	{
		int new_exp = flt_e - 127;
		if (new_exp > 15)
			e = 31; // overflow: saturate the half exponent
		else if (new_exp < -14)
			m = lrintf((1 << 24) * fabsf(fi.f)); // underflow: encode as half denormal
		else
		{
			// Normal range: rebias exponent, round the mantissa down to 10 bits.
			e = new_exp + 15;
			m = lrintf(flt_m * (1.0f / ((float)(1 << 13))));
		}
	}

	// Mantissa rounding may carry out (m == 1024); propagate into the exponent.
	assert((0 <= m) && (m <= 1024));
	if (m == 1024)
	{
		e++;
		m = 0;
	}

	assert((e >= 0) && (e <= 31));
	assert((m >= 0) && (m <= 1023));

	basist::half_float result = (basist::half_float)((e << 10) | m);
	return result;
}
4239
4240
// Type-punning union for viewing a float's raw IEEE 754 bits (and vice versa).
union fu32
{
	uint32_t u;
	float f;
};
4245
4246
// Supports positive and denormals only. No NaN or Inf.
// Fast half->float: shift the half's exponent/mantissa into float position,
// then multiply by 2^112 to rebias the exponent (this also correctly
// normalizes half denormals).
BASISU_FORCE_INLINE float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h)
{
	assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h));

	// add 112 to the exponent (112+half float's exp bias of 15=float32's bias of 127)
	static const fu32 K = { 0x77800000 };

	fu32 o;
	o.u = h << 13;
	o.f *= K.f;

	return o.f;
}
4260
4261
// Positive, negative, or denormals. No NaN or Inf. Clamped to MAX_HALF_FLOAT.
// Truncating (round-toward-zero) float->half; the sign is re-applied from the
// original input after converting the magnitude.
inline basist::half_float fast_float_to_half_trunc_no_nan_or_inf(float f)
{
	assert(!isnan(f) && !isinf(f));

	// Subtract 112 from the exponent, to change the bias from 127 to 15.
	static const fu32 g_f_to_h{ 0x7800000 };

	fu32 fu;

	// Clamp the magnitude to the largest representable half, then rebias.
	fu.f = minimum<float>((float)basist::MAX_HALF_FLOAT, fabsf(f)) * g_f_to_h.f;

	return (basist::half_float)(((fu.u >> (23 - 10)) & 0x7FFF) | ((f < 0.0f) ? 0x8000 : 0));
}
4275
4276
// Truncating float->half for inputs already known to be in [0, MAX_HALF_FLOAT]
// (asserted): no clamping, no sign handling, no NaN/Inf support.
inline basist::half_float fast_float_to_half_trunc_no_clamp_neg_nan_or_inf(float f)
{
	assert(!isnan(f) && !isinf(f));
	assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));

	// Subtract 112 from the exponent, to change the bias from 127 to 15.
	static const fu32 g_f_to_h{ 0x7800000 };

	fu32 fu;

	fu.f = f * g_f_to_h.f;

	// Drop the low 13 mantissa bits (truncation) and keep exponent+mantissa.
	return (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);
}
4290
4291
// Rounding float->half for inputs already in [0, MAX_HALF_FLOAT] (asserted):
// like the _trunc variant but rounds to nearest, ties to even, then saturates
// the result to the largest finite half.
inline basist::half_float fast_float_to_half_no_clamp_neg_nan_or_inf(float f)
{
	assert(!isnan(f) && !isinf(f));
	assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));

	// Subtract 112 from the exponent, to change the bias from 127 to 15.
	static const fu32 g_f_to_h{ 0x7800000 };

	fu32 fu;

	fu.f = f * g_f_to_h.f;

	uint32_t h = (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);

	// round to even or nearest
	uint32_t mant = fu.u & 8191; // examine lowest 13 bits
	// > half-ULP: round up; exactly half: round up only if h is odd (ties-to-even).
	uint32_t inc = (mant > 4096) | ((mant == 4096) & (h & 1));
	h += inc;

	// Rounding can overflow past the largest finite half; saturate.
	if (h > basist::MAX_HALF_FLOAT_AS_INT_BITS)
		h = basist::MAX_HALF_FLOAT_AS_INT_BITS;

	return (basist::half_float)h;
}
4315
4316
} // namespace basisu
4317
4318
#include "basisu_math.h"
4319
4320