Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/basisu_enc.h
21461 views
1
// basisu_enc.h
2
// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
//
8
// http://www.apache.org/licenses/LICENSE-2.0
9
//
10
// Unless required by applicable law or agreed to in writing, software
11
// distributed under the License is distributed on an "AS IS" BASIS,
12
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
// See the License for the specific language governing permissions and
14
// limitations under the License.
15
#pragma once
16
#include "../transcoder/basisu.h"
17
#include "../transcoder/basisu_transcoder_internal.h"
18
19
#include <mutex>
20
#include <atomic>
21
#include <condition_variable>
22
#include <functional>
23
#include <thread>
24
#include <unordered_map>
25
#include <map>
26
#include <ostream>
27
28
#if !defined(_WIN32) || defined(__MINGW32__)
29
#include <libgen.h>
30
#endif
31
32
// This module is really just a huge grab bag of classes and helper functions needed by the encoder.
33
34
// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.
35
#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)
36
37
#if BASISU_SUPPORT_SSE
38
// Declared in basisu_kernels_imp.h, but we can't include that here otherwise it would lead to circular type errors.
39
extern void update_covar_matrix_16x16_sse41(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16);
40
#endif
41
42
namespace basisu
43
{
44
extern uint8_t g_hamming_dist[256];
45
extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8];
46
47
// true if basisu_encoder_init() has been called and returned.
48
extern bool g_library_initialized;
49
50
// Encoder library initialization.
51
// This function MUST be called before encoding anything!
52
// Returns false if library initialization fails.
53
bool basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false);
54
void basisu_encoder_deinit();
55
56
// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
57
extern void detect_sse41();
58
59
#if BASISU_SUPPORT_SSE
60
extern bool g_cpu_supports_sse41;
61
#else
62
const bool g_cpu_supports_sse41 = false;
63
#endif
64
65
void error_vprintf(const char* pFmt, va_list args);
66
void error_printf(const char *pFmt, ...);
67
68
template <typename... Args>
69
inline void fmt_error_printf(const char* pFmt, Args&&... args)
70
{
71
std::string res;
72
if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward<Args>(args))... }))
73
return;
74
error_printf("%s", res.c_str());
75
}
76
77
void platform_sleep(uint32_t ms);
78
79
// Helpers
80
81
// Clamps a signed 32-bit value to [0, 255].
// If any bit above the low 8 is set the value is out of range: negative
// values (sign bit set) map to 0, values > 255 map to 255 via ~(i >> 31).
inline uint8_t clamp255(int32_t i)
{
	if ((i & 0xFFFFFF00U) == 0)
		return (uint8_t)i;
	return (uint8_t)(~(i >> 31));
}
85
86
// Left-shifts a signed 32-bit value without invoking signed-overflow UB,
// by performing the shift in the unsigned domain and casting back.
inline int left_shift32(int val, int shift)
{
	assert((shift >= 0) && (shift < 32));
	const uint32_t widened = static_cast<uint32_t>(val);
	return static_cast<int>(widened << shift);
}
91
92
// Unsigned counterpart of left_shift32(); shift amount must be in [0, 32).
inline uint32_t left_shift32(uint32_t val, int shift)
{
	assert((shift >= 0) && (shift < 32));
	const uint32_t result = val << shift;
	return result;
}
97
98
// Clamps value to the closed interval [low, high].
inline int32_t clampi(int32_t value, int32_t low, int32_t high)
{
	if (value < low)
		return low;
	if (value > high)
		return high;
	return value;
}
106
107
// Computes (v * a) / 255 with correct rounding using the classic
// "divide-by-255 via add-high-byte" trick; inputs expected in [0, 255].
inline uint8_t mul_8(uint32_t v, uint32_t a)
{
	const uint32_t prod = v * a + 128;
	return (uint8_t)((prod + (prod >> 8)) >> 8);
}
112
113
// Rounds half away from zero (e.g. 2.5f -> 3, -2.5f -> -3); result must fit in int.
inline int fast_roundf_int(float x)
{
	const float biased = (x >= 0.0f) ? (x + 0.5f) : (x - 0.5f);
	return (int)biased;
}
117
118
// Fast floor() to int; input's truncation must fit in an int.
inline int fast_floorf_int(float x)
{
	const int trunc = (int)x; // rounds toward zero
	if (x >= 0.0f)
		return trunc;
	// Negative non-integers truncated toward zero land one too high.
	return ((float)trunc == x) ? trunc : (trunc - 1);
}
123
124
// Reads 'codesize' bits (up to 64) from the byte buffer pBuf starting at
// absolute bit position bit_offset. Bits are consumed LSB-first within each
// byte and assembled into the result starting at bit 0. On return,
// bit_offset has been advanced by codesize bits.
inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
{
	assert(codesize <= 64);
	uint64_t bits = 0;
	uint32_t total_bits = 0;

	while (total_bits < codesize)
	{
		// Bit position within the current byte, and how many bits we can pull
		// from it (limited by both the byte boundary and the bits remaining).
		uint32_t byte_bit_offset = bit_offset & 7;
		uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset);

		// Extract and mask this byte's contribution.
		uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset;
		byte_bits &= ((1 << bits_to_read) - 1);

		// Merge it into the output at the next free bit position.
		bits |= ((uint64_t)(byte_bits) << total_bits);

		total_bits += bits_to_read;
		bit_offset += bits_to_read;
	}

	return bits;
}
146
147
// 32-bit variant of read_bits(): reads 'codesize' bits (up to 32) from pBuf
// starting at bit position bit_offset, LSB-first within each byte, and
// advances bit_offset by codesize bits.
inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
{
	assert(codesize <= 32);
	uint32_t bits = 0;
	uint32_t total_bits = 0;

	while (total_bits < codesize)
	{
		// Bits available in the current byte vs. bits still needed.
		uint32_t byte_bit_offset = bit_offset & 7;
		uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset);

		uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset;
		byte_bits &= ((1 << bits_to_read) - 1);

		// Place this chunk at the next free bit of the result.
		bits |= (byte_bits << total_bits);

		total_bits += bits_to_read;
		bit_offset += bits_to_read;
	}

	return bits;
}
169
170
// Open interval
// Debug-only range checks: assert l <= v < h, then pass v through untouched.
inline int bounds_check(int v, int l, int h) { assert((v >= l) && (v < h)); (void)v; (void)l; (void)h; return v; }
inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { assert((v >= l) && (v < h)); (void)v; (void)l; (void)h; return v; }
173
174
// Closed interval
// Debug-only range checks: assert l <= v <= h, then pass v through untouched.
inline int bounds_check_incl(int v, int l, int h) { assert((v >= l) && (v <= h)); (void)v; (void)l; (void)h; return v; }
inline uint32_t bounds_check_incl(uint32_t v, uint32_t l, uint32_t h) { assert((v >= l) && (v <= h)); (void)v; (void)l; (void)h; return v; }
177
178
// Counts leading zero bits of a 32-bit value; clz(0) is defined as 32.
inline uint32_t clz(uint32_t x)
{
	if (!x)
		return 32;

	uint32_t count = 0;
	for (uint32_t probe = x; (probe & 0x80000000) == 0; probe <<= 1u)
		count++;

	return count;
}
192
193
bool string_begins_with(const std::string& str, const char* pPhrase);
194
195
// Case sensitive, returns -1 if can't find
inline int string_find_first(const std::string& str, const char* pPhrase)
{
	const size_t pos = str.find(pPhrase, 0);
	return (pos == std::string::npos) ? -1 : (int)pos;
}
203
204
// Hashing
205
206
// 32-bit integer mixing function (add/xor/shift rounds with fixed magic
// constants). Deterministic bit scrambler for hashing; not cryptographic.
inline uint32_t bitmix32c(uint32_t v)
{
	v = (v + 0x7ed55d16) + (v << 12);
	v = (v ^ 0xc761c23c) ^ (v >> 19);
	v = (v + 0x165667b1) + (v << 5);
	v = (v + 0xd3a2646c) ^ (v << 9);
	v = (v + 0xfd7046c5) + (v << 3);
	v = (v ^ 0xb55a4f09) ^ (v >> 16);
	return v;
}
216
217
// 7-round shift/xor/subtract 32-bit integer mix (no magic constants).
// Cheaper alternative to bitmix32c(); not cryptographic.
inline uint32_t bitmix32(uint32_t v)
{
	v -= (v << 6);
	v ^= (v >> 17);
	v -= (v << 9);
	v ^= (v << 4);
	v -= (v << 3);
	v ^= (v << 10);
	v ^= (v >> 15);
	return v;
}
228
229
// Thomas Wang-style 32-bit integer finalizer hash: xor-shift and multiply
// rounds that diffuse every input bit across the result.
inline uint32_t wang_hash(uint32_t seed)
{
	seed = (seed ^ 61u) ^ (seed >> 16);
	seed = seed * 9u;
	seed = seed ^ (seed >> 4);
	seed = seed * 0x27d4eb2du;
	seed = seed ^ (seed >> 15);
	return seed;
}
238
239
uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);
240
241
// Hash functor that hashes the raw object bytes of the key via hash_hsieh().
// NOTE(review): only meaningful for trivially-copyable keys; padding bytes
// inside Key would make equal keys hash differently -- confirm at call sites.
template <typename Key>
struct bit_hasher
{
	inline std::size_t operator()(const Key& k) const
	{
		return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k));
	}
};
249
250
// Hash functor for std::string: hashes the character data with hash_hsieh().
// The empty string hashes to 0.
struct string_hasher
{
	inline std::size_t operator()(const std::string& k) const
	{
		size_t l = k.size();
		if (!l)
			return 0;
		return hash_hsieh(reinterpret_cast<const uint8_t*>(k.c_str()), l);
	}
};
260
261
// Online (streaming) statistics accumulator: tracks count, total, min, max,
// mean and variance of a sample stream one push() at a time, using Welford's
// numerically stable recurrence (Knuth TAOCP Vol. 2, 3rd ed., p. 232).
class running_stat
{
public:
	running_stat() { clear(); }

	// Resets all accumulated state.
	void clear()
	{
		m_n = 0;
		m_total = 0;
		m_old_m = 0;
		m_new_m = 0;
		m_old_s = 0;
		m_new_s = 0;
		m_min = 0;
		m_max = 0;
	}

	// Accumulates a single sample.
	void push(double x)
	{
		m_n++;
		m_total += x;
		if (m_n == 1)
		{
			// First sample seeds the running mean and min/max directly.
			m_old_m = m_new_m = x;
			m_old_s = 0.0;
			m_min = x;
			m_max = x;
		}
		else
		{
			// See Knuth TAOCP vol 2, 3rd edition, page 232
			m_new_m = m_old_m + (x - m_old_m) / m_n;
			m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m);
			m_old_m = m_new_m;
			m_old_s = m_new_s;
			m_min = basisu::minimum(x, m_min);
			m_max = basisu::maximum(x, m_max);
		}
	}

	// Number of samples pushed so far.
	uint32_t get_num() const
	{
		return m_n;
	}

	// Sum of all samples.
	double get_total() const
	{
		return m_total;
	}

	// Arithmetic mean, or 0 if no samples were pushed.
	double get_mean() const
	{
		return (m_n > 0) ? m_new_m : 0.0;
	}

	// Returns sample variance (n - 1 denominator); 0 with fewer than 2 samples.
	double get_variance() const
	{
		return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0);
	}

	// Sample standard deviation.
	double get_std_dev() const
	{
		return sqrt(get_variance());
	}

	// Smallest sample seen (0 if none were pushed).
	double get_min() const
	{
		return m_min;
	}

	// Largest sample seen (0 if none were pushed).
	double get_max() const
	{
		return m_max;
	}

private:
	uint32_t m_n;	// sample count
	double m_total, m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;
};
341
342
// Linear algebra
343
344
template <uint32_t N, typename T>
345
class vec
346
{
347
protected:
348
T m_v[N];
349
350
public:
351
enum { num_elements = N };
352
typedef T scalar_type;
353
354
inline vec() { }
355
inline vec(eZero) { set_zero(); }
356
357
explicit inline vec(T val) { set(val); }
358
inline vec(T v0, T v1) { set(v0, v1); }
359
inline vec(T v0, T v1, T v2) { set(v0, v1, v2); }
360
inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); }
361
inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; }
362
template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); }
363
364
inline const T& operator[](uint32_t i) const { assert(i < N); return m_v[i]; }
365
inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; }
366
367
inline T getX() const { return m_v[0]; }
368
inline T getY() const { static_assert(N >= 2, "N too small"); return m_v[1]; }
369
inline T getZ() const { static_assert(N >= 3, "N too small"); return m_v[2]; }
370
inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; }
371
372
inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false; return true; }
373
inline bool operator!=(const vec& rhs) const { return !(*this == rhs); }
374
inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }
375
376
inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }
377
inline void clear() { set_zero(); }
378
379
template <uint32_t OtherN, typename OtherT>
380
inline vec &set(const vec<OtherN, OtherT> &other)
381
{
382
uint32_t i;
383
if ((const void *)(&other) == (const void *)(this))
384
return *this;
385
const uint32_t m = minimum(OtherN, N);
386
for (i = 0; i < m; i++)
387
m_v[i] = static_cast<T>(other[i]);
388
for (; i < N; i++)
389
m_v[i] = 0;
390
return *this;
391
}
392
393
inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; }
394
inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; }
395
inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; }
396
397
inline vec &set(T v0, T v1)
398
{
399
m_v[0] = v0;
400
if (N >= 2)
401
{
402
m_v[1] = v1;
403
clear_elements(2, N);
404
}
405
return *this;
406
}
407
408
inline vec &set(T v0, T v1, T v2)
409
{
410
m_v[0] = v0;
411
if (N >= 2)
412
{
413
m_v[1] = v1;
414
if (N >= 3)
415
{
416
m_v[2] = v2;
417
clear_elements(3, N);
418
}
419
}
420
return *this;
421
}
422
423
inline vec &set(T v0, T v1, T v2, T v3)
424
{
425
m_v[0] = v0;
426
if (N >= 2)
427
{
428
m_v[1] = v1;
429
if (N >= 3)
430
{
431
m_v[2] = v2;
432
433
if (N >= 4)
434
{
435
m_v[3] = v3;
436
clear_elements(5, N);
437
}
438
}
439
}
440
return *this;
441
}
442
443
inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; }
444
template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; }
445
446
inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); }
447
inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); }
448
449
inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; }
450
inline vec operator+ () const { return *this; }
451
inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; }
452
inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; }
453
inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; }
454
inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; }
455
inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; }
456
inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; }
457
458
friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; }
459
friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; }
460
friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; }
461
friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; }
462
friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; }
463
friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; }
464
465
static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; }
466
467
inline T dot(const vec &rhs) const { return dot_product(*this, rhs); }
468
469
inline T norm() const { return dot_product(*this, *this); }
470
inline T length() const { return sqrt(norm()); }
471
472
inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; }
473
inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; }
474
475
inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }
476
inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }
477
478
inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; }
479
480
inline vec get_normalized() const { vec res(*this); res.normalize_in_place(); return res; }
481
482
inline vec &clamp(T l, T h)
483
{
484
for (uint32_t i = 0; i < N; i++)
485
m_v[i] = basisu::clamp(m_v[i], l, h);
486
return *this;
487
}
488
489
static vec component_mul(const vec& a, const vec& b)
490
{
491
vec res;
492
for (uint32_t i = 0; i < N; i++)
493
res[i] = a[i] * b[i];
494
return res;
495
}
496
497
static vec component_min(const vec& a, const vec& b)
498
{
499
vec res;
500
for (uint32_t i = 0; i < N; i++)
501
res[i] = minimum(a[i], b[i]);
502
return res;
503
}
504
505
static vec component_max(const vec& a, const vec& b)
506
{
507
vec res;
508
for (uint32_t i = 0; i < N; i++)
509
res[i] = maximum(a[i], b[i]);
510
return res;
511
}
512
513
static vec lerp(const vec& a, const vec& b, float s)
514
{
515
vec res;
516
for (uint32_t i = 0; i < N; i++)
517
res[i] = basisu::lerp(a[i], b[i], s);
518
return res;
519
}
520
};
521
522
// Common double-precision vector instantiations.
typedef vec<4, double> vec4D;
typedef vec<3, double> vec3D;
typedef vec<2, double> vec2D;
typedef vec<1, double> vec1D;

// Common single-precision vector instantiations.
typedef vec<6, float> vec6F;
typedef vec<5, float> vec5F;
typedef vec<4, float> vec4F;
typedef vec<3, float> vec3F;
typedef vec<2, float> vec2F;
typedef vec<1, float> vec1F;

typedef vec<16, float> vec16F;

// Mark all vec instantiations as safe to bitwise copy/move (used by
// basisu::vector's relocation optimizations).
template<uint32_t N, typename T> struct bitwise_copyable< vec<N, T> > { enum { cFlag = true }; };
template<uint32_t N, typename T> struct bitwise_movable< vec<N, T> > { enum { cFlag = true }; };
538
539
// Simple Rows x Cols matrix stored as an array of row vectors.
template <uint32_t Rows, uint32_t Cols, typename T>
class matrix
{
public:
	typedef vec<Rows, T> col_vec;
	typedef vec<Cols, T> row_vec;

	typedef T scalar_type;

	enum { rows = Rows, cols = Cols };

protected:
	row_vec m_r[Rows];

public:
	// Default construction leaves the elements uninitialized.
	inline matrix() {}
	inline matrix(eZero) { set_zero(); }
	inline matrix(const matrix &other) { for (uint32_t i = 0; i < Rows; i++) m_r[i] = other.m_r[i]; }
	inline matrix &operator=(const matrix &rhs) { if (this != &rhs) for (uint32_t i = 0; i < Rows; i++) m_r[i] = rhs.m_r[i]; return *this; }

	// Element access: (row, col).
	inline T operator()(uint32_t r, uint32_t c) const { assert((r < Rows) && (c < Cols)); return m_r[r][c]; }
	inline T &operator()(uint32_t r, uint32_t c) { assert((r < Rows) && (c < Cols)); return m_r[r][c]; }

	// Whole-row access.
	inline const row_vec &operator[](uint32_t r) const { assert(r < Rows); return m_r[r]; }
	inline row_vec &operator[](uint32_t r) { assert(r < Rows); return m_r[r]; }

	inline matrix &set_zero()
	{
		for (uint32_t i = 0; i < Rows; i++)
			m_r[i].set_zero();
		return *this;
	}

	// Sets ones on the main diagonal, zeros elsewhere (also valid for
	// non-square matrices, hence the i < Cols guard).
	inline matrix &set_identity()
	{
		for (uint32_t i = 0; i < Rows; i++)
		{
			m_r[i].set_zero();
			if (i < Cols)
				m_r[i][i] = 1.0f;
		}
		return *this;
	}
};

// Mark all matrix instantiations as safe to bitwise copy/move.
template<uint32_t R, uint32_t C, typename T> struct bitwise_copyable< matrix<R, C, T> > { enum { cFlag = true }; };
template<uint32_t R, uint32_t C, typename T> struct bitwise_movable< matrix<R, C, T> > { enum { cFlag = true }; };
586
587
// Estimates the principal axis (dominant eigenvector) of an NxN covariance
// matrix using up to 8 power iterations, and returns it normalized to unit
// length. VectorType is expected to be an N-element float vec.
template<uint32_t N, typename VectorType>
inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix)
{
	VectorType axis;
	if (N == 1)
		axis.set(1.0f);
	else
	{
		// Initial guess: a mild ramp across the components, so the starting
		// vector is unlikely to be orthogonal to the dominant eigenvector.
		for (uint32_t i = 0; i < N; i++)
			axis[i] = lerp(.75f, 1.25f, i * (1.0f / maximum<int>(N - 1, 1)));
	}

	VectorType prev_axis(axis);

	// Power iterations
	for (uint32_t power_iter = 0; power_iter < 8; power_iter++)
	{
		VectorType trial_axis;
		double max_sum = 0;

		// trial_axis = cmatrix * axis, tracking the largest-magnitude component.
		for (uint32_t i = 0; i < N; i++)
		{
			double sum = 0;
			for (uint32_t j = 0; j < N; j++)
				sum += cmatrix[i][j] * axis[j];

			trial_axis[i] = static_cast<float>(sum);

			max_sum = maximum(fabs(sum), max_sum);
		}

		// Rescale by the max component to keep the iterate well-conditioned.
		if (max_sum != 0.0f)
			trial_axis *= static_cast<float>(1.0f / max_sum);

		VectorType delta_axis(prev_axis - trial_axis);

		prev_axis = axis;
		axis = trial_axis;

		// Early out once the iteration has (approximately) converged.
		if (delta_axis.norm() < .0024f)
			break;
	}

	return axis.normalize_in_place();
}
632
633
// Fills pIndices with [0, num_indices) and sorts the indices so that
// pKeys[pIndices[0]] <= pKeys[pIndices[1]] <= ...; the keys are not moved.
template<typename T> inline void indirect_sort(uint32_t num_indices, uint32_t* pIndices, const T* pKeys)
{
	for (uint32_t idx = 0; idx < num_indices; idx++)
		pIndices[idx] = idx;

	const auto by_key = [pKeys](uint32_t lhs, uint32_t rhs) { return pKeys[lhs] < pKeys[rhs]; };
	std::sort(pIndices, pIndices + num_indices, by_key);
}
644
645
// 1-4 byte direct Radix sort.
// Sorts num_vals items of POD type T by an unsigned little-endian key that is
// key_size bytes long and located key_ofs bytes into each item. pBuf0 holds
// the input, pBuf1 is scratch of equal size. Returns a pointer to whichever
// buffer holds the sorted result (pBuf0 for an even number of passes, pBuf1
// for odd). The sort is stable within each pass, so it is stable overall.
template <typename T>
T* radix_sort(uint32_t num_vals, T* pBuf0, T* pBuf1, uint32_t key_ofs, uint32_t key_size)
{
	assert(key_ofs < sizeof(T));
	assert((key_size >= 1) && (key_size <= 4));

	// One 256-entry histogram per key byte/pass; only key_size of the 4 are used.
	uint32_t hist[256 * 4];

	memset(hist, 0, sizeof(hist[0]) * 256 * key_size);

// Reads 32 bits at key_ofs; the low key_size bytes are the sort key.
// NOTE(review): this may read up to 3 bytes past the key inside T -- confirm
// key_ofs leaves enough room in T on callers with key_size < 4.
#define BASISU_GET_KEY(p) (*(uint32_t *)((uint8_t *)(p) + key_ofs))

	// Histogram phase: count byte frequencies for every pass in one sweep,
	// specialized per key_size to avoid per-element branching.
	if (key_size == 4)
	{
		T* p = pBuf0;
		T* q = pBuf0 + num_vals;
		for (; p != q; p++)
		{
			const uint32_t key = BASISU_GET_KEY(p);

			hist[key & 0xFF]++;
			hist[256 + ((key >> 8) & 0xFF)]++;
			hist[512 + ((key >> 16) & 0xFF)]++;
			hist[768 + ((key >> 24) & 0xFF)]++;
		}
	}
	else if (key_size == 3)
	{
		T* p = pBuf0;
		T* q = pBuf0 + num_vals;
		for (; p != q; p++)
		{
			const uint32_t key = BASISU_GET_KEY(p);

			hist[key & 0xFF]++;
			hist[256 + ((key >> 8) & 0xFF)]++;
			hist[512 + ((key >> 16) & 0xFF)]++;
		}
	}
	else if (key_size == 2)
	{
		// Processes two elements per iteration; the odd trailing element (if
		// any) is handled after the loop.
		T* p = pBuf0;
		T* q = pBuf0 + (num_vals >> 1) * 2;

		for (; p != q; p += 2)
		{
			const uint32_t key0 = BASISU_GET_KEY(p);
			const uint32_t key1 = BASISU_GET_KEY(p + 1);

			hist[key0 & 0xFF]++;
			hist[256 + ((key0 >> 8) & 0xFF)]++;

			hist[key1 & 0xFF]++;
			hist[256 + ((key1 >> 8) & 0xFF)]++;
		}

		if (num_vals & 1)
		{
			const uint32_t key = BASISU_GET_KEY(p);

			hist[key & 0xFF]++;
			hist[256 + ((key >> 8) & 0xFF)]++;
		}
	}
	else
	{
		assert(key_size == 1);
		if (key_size != 1)
			return NULL;

		T* p = pBuf0;
		T* q = pBuf0 + (num_vals >> 1) * 2;

		for (; p != q; p += 2)
		{
			const uint32_t key0 = BASISU_GET_KEY(p);
			const uint32_t key1 = BASISU_GET_KEY(p + 1);

			hist[key0 & 0xFF]++;
			hist[key1 & 0xFF]++;
		}

		if (num_vals & 1)
		{
			const uint32_t key = BASISU_GET_KEY(p);
			hist[key & 0xFF]++;
		}
	}

	// Scatter phase: one stable counting-sort pass per key byte, ping-ponging
	// between the two buffers.
	T* pCur = pBuf0;
	T* pNew = pBuf1;

	for (uint32_t pass = 0; pass < key_size; pass++)
	{
		const uint32_t* pHist = &hist[pass << 8];

		// Exclusive prefix sum of this pass's histogram -> starting offsets.
		uint32_t offsets[256];

		uint32_t cur_ofs = 0;
		for (uint32_t i = 0; i < 256; i += 2)
		{
			offsets[i] = cur_ofs;
			cur_ofs += pHist[i];

			offsets[i + 1] = cur_ofs;
			cur_ofs += pHist[i + 1];
		}

		const uint32_t pass_shift = pass << 3;

		// Scatter two elements per iteration into their destination slots.
		T* p = pCur;
		T* q = pCur + (num_vals >> 1) * 2;

		for (; p != q; p += 2)
		{
			uint32_t c0 = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;
			uint32_t c1 = (BASISU_GET_KEY(p + 1) >> pass_shift) & 0xFF;

			if (c0 == c1)
			{
				// Same bucket: write both with a single offset update,
				// preserving their relative order (stability).
				uint32_t dst_offset0 = offsets[c0];

				offsets[c0] = dst_offset0 + 2;

				pNew[dst_offset0] = p[0];
				pNew[dst_offset0 + 1] = p[1];
			}
			else
			{
				uint32_t dst_offset0 = offsets[c0]++;
				uint32_t dst_offset1 = offsets[c1]++;

				pNew[dst_offset0] = p[0];
				pNew[dst_offset1] = p[1];
			}
		}

		// Odd trailing element.
		if (num_vals & 1)
		{
			uint32_t c = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;

			uint32_t dst_offset = offsets[c];
			offsets[c] = dst_offset + 1;

			pNew[dst_offset] = *p;
		}

		// Swap source/destination buffers for the next pass.
		T* t = pCur;
		pCur = pNew;
		pNew = t;
	}

	return pCur;
}

#undef BASISU_GET_KEY
802
803
// Very simple job pool with no dependencies.
// Declaration only -- the implementation lives in basisu_enc.cpp. Jobs are
// queued with add_job(); wait_for_all() blocks until the pool drains.
class job_pool
{
	BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);

public:
	// num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc.
	job_pool(uint32_t num_threads);
	~job_pool();

	// Queues a job for asynchronous execution.
	void add_job(const std::function<void()>& job);
	void add_job(std::function<void()>&& job);

	// Blocks until all queued jobs have completed.
	void wait_for_all();

	// Total threads = spawned workers + the calling thread.
	size_t get_total_threads() const { return 1 + m_threads.size(); }

private:
	std::vector<std::thread> m_threads;				// spawned worker threads (total - 1)
	std::vector<std::function<void()> > m_queue;	// pending jobs

	std::mutex m_mutex;								// guards m_queue / m_num_active_jobs
	std::condition_variable m_has_work;				// signaled when work is queued
	std::condition_variable m_no_more_jobs;			// signaled when the pool drains

	uint32_t m_num_active_jobs;

	std::atomic<bool> m_kill_flag;					// tells workers to exit

	std::atomic<int> m_num_active_workers;

	// Worker thread entry point.
	void job_thread(uint32_t index);
};
836
837
// Simple 64-bit color class

// RGBA color with signed 16-bit components, accessible either as the
// m_comps array or as named r/g/b/a fields (union aliasing, matching the
// layout asserted in the default constructor).
class color_rgba_i16
{
public:
	union
	{
		int16_t m_comps[4];

		struct
		{
			int16_t r;
			int16_t g;
			int16_t b;
			int16_t a;
		};
	};

	// Components are intentionally left uninitialized.
	inline color_rgba_i16()
	{
		static_assert(sizeof(*this) == sizeof(int16_t)*4, "sizeof(*this) == sizeof(int16_t)*4");
	}

	inline color_rgba_i16(int sr, int sg, int sb, int sa)
	{
		set(sr, sg, sb, sa);
	}

	// Sets all components, clamping each to [INT16_MIN, INT16_MAX].
	inline color_rgba_i16 &set(int sr, int sg, int sb, int sa)
	{
		m_comps[0] = (int16_t)clamp<int>(sr, INT16_MIN, INT16_MAX);
		m_comps[1] = (int16_t)clamp<int>(sg, INT16_MIN, INT16_MAX);
		m_comps[2] = (int16_t)clamp<int>(sb, INT16_MIN, INT16_MAX);
		m_comps[3] = (int16_t)clamp<int>(sa, INT16_MIN, INT16_MAX);
		return *this;
	}
};
874
875
class color_rgba
876
{
877
public:
878
union
879
{
880
uint8_t m_comps[4];
881
882
struct
883
{
884
uint8_t r;
885
uint8_t g;
886
uint8_t b;
887
uint8_t a;
888
};
889
};
890
891
inline color_rgba()
892
{
893
static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");
894
static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)");
895
}
896
897
// Not too hot about this idea.
898
inline color_rgba(const basist::color32& other) :
899
r(other.r),
900
g(other.g),
901
b(other.b),
902
a(other.a)
903
{
904
}
905
906
color_rgba& operator= (const basist::color32& rhs)
907
{
908
r = rhs.r;
909
g = rhs.g;
910
b = rhs.b;
911
a = rhs.a;
912
return *this;
913
}
914
915
inline color_rgba(int y)
916
{
917
set(y);
918
}
919
920
inline color_rgba(int y, int na)
921
{
922
set(y, na);
923
}
924
925
inline color_rgba(int sr, int sg, int sb, int sa)
926
{
927
set(sr, sg, sb, sa);
928
}
929
930
inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)
931
{
932
set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);
933
}
934
935
inline color_rgba& set_noclamp_y(int y)
936
{
937
m_comps[0] = (uint8_t)y;
938
m_comps[1] = (uint8_t)y;
939
m_comps[2] = (uint8_t)y;
940
m_comps[3] = (uint8_t)255;
941
return *this;
942
}
943
944
inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa)
945
{
946
m_comps[0] = (uint8_t)sr;
947
m_comps[1] = (uint8_t)sg;
948
m_comps[2] = (uint8_t)sb;
949
m_comps[3] = (uint8_t)sa;
950
return *this;
951
}
952
953
inline color_rgba &set(int y)
954
{
955
m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
956
m_comps[1] = m_comps[0];
957
m_comps[2] = m_comps[0];
958
m_comps[3] = 255;
959
return *this;
960
}
961
962
inline color_rgba &set(int y, int na)
963
{
964
m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
965
m_comps[1] = m_comps[0];
966
m_comps[2] = m_comps[0];
967
m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255));
968
return *this;
969
}
970
971
inline color_rgba &set(int sr, int sg, int sb, int sa)
972
{
973
m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
974
m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
975
m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
976
m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255));
977
return *this;
978
}
979
980
inline color_rgba &set_rgb(int sr, int sg, int sb)
981
{
982
m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
983
m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
984
m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
985
return *this;
986
}
987
988
inline color_rgba &set_rgb(const color_rgba &other)
989
{
990
r = other.r;
991
g = other.g;
992
b = other.b;
993
return *this;
994
}
995
996
inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; }
997
inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }
998
999
inline void clear()
1000
{
1001
m_comps[0] = 0;
1002
m_comps[1] = 0;
1003
m_comps[2] = 0;
1004
m_comps[3] = 0;
1005
}
1006
1007
inline bool operator== (const color_rgba &rhs) const
1008
{
1009
if (m_comps[0] != rhs.m_comps[0]) return false;
1010
if (m_comps[1] != rhs.m_comps[1]) return false;
1011
if (m_comps[2] != rhs.m_comps[2]) return false;
1012
if (m_comps[3] != rhs.m_comps[3]) return false;
1013
return true;
1014
}
1015
1016
inline bool operator!= (const color_rgba &rhs) const
1017
{
1018
return !(*this == rhs);
1019
}
1020
1021
inline bool operator<(const color_rgba &rhs) const
1022
{
1023
for (int i = 0; i < 4; i++)
1024
{
1025
if (m_comps[i] < rhs.m_comps[i])
1026
return true;
1027
else if (m_comps[i] != rhs.m_comps[i])
1028
return false;
1029
}
1030
return false;
1031
}
1032
1033
inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; }
1034
inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; }
1035
inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }
1036
1037
inline uint32_t get_bgra_uint32() const { return b | (g << 8) | (r << 16) | (a << 24); }
1038
inline uint32_t get_rgba_uint32() const { return r | (g << 8) | (b << 16) | (a << 24); }
1039
1040
inline basist::color32 get_color32() const
1041
{
1042
return basist::color32(r, g, b, a);
1043
}
1044
1045
static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); }
1046
static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); }
1047
};
1048
1049
typedef basisu::vector<color_rgba> color_rgba_vec;

// Common color constants: opaque black, transparent black, opaque white.
// NOTE(review): non-inline const globals in a header -- every including TU
// gets its own copy; harmless, but C++17 `inline` would deduplicate them.
const color_rgba g_black_color(0, 0, 0, 255);
const color_rgba g_black_trans_color(0, 0, 0, 0);
const color_rgba g_white_color(255, 255, 255, 255);
1054
1055
// Squared Euclidean distance between two RGB triplets.
inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1)
{
	const int dr = r0 - r1;
	const int dg = g0 - g1;
	const int db = b0 - b1;
	return (dr * dr) + (dg * dg) + (db * db);
}
1060
1061
// Squared Euclidean distance between two RGBA quadruplets.
inline int color_distance(int r0, int g0, int b0, int a0, int r1, int g1, int b1, int a1)
{
	const int dr = r0 - r1;
	const int dg = g0 - g1;
	const int db = b0 - b1;
	const int da = a0 - a1;
	return (dr * dr) + (dg * dg) + (db * db) + (da * da);
}
1066
1067
inline int color_distance(const color_rgba &c0, const color_rgba &c1, bool alpha)
1068
{
1069
if (alpha)
1070
return color_distance(c0.r, c0.g, c0.b, c0.a, c1.r, c1.g, c1.b, c1.a);
1071
else
1072
return color_distance(c0.r, c0.g, c0.b, c1.r, c1.g, c1.b);
1073
}
1074
1075
// TODO: Allow user to control channel weightings.
// Distance metric between two colors. When perceptual is true, uses a scaled
// luma/chroma-style metric with luma weighted far more heavily than chroma;
// otherwise falls back to plain squared RGB(A) distance. Several variants
// are selected via preprocessor flags; only one is compiled in.
inline uint32_t color_distance(bool perceptual, const color_rgba &e1, const color_rgba &e2, bool alpha)
{
	if (perceptual)
	{
#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE
		// High precision float path (slower): 709-weighted luma, chroma taken
		// as R-Y and B-Y, each difference squared and weighted.
		const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;
		const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;

		const float cr1 = e1.r - l1;
		const float cr2 = e2.r - l2;

		const float cb1 = e1.b - l1;
		const float cb2 = e2.b - l2;

		const float dl = l1 - l2;
		const float dcr = cr1 - cr2;
		const float dcb = cb1 - cb2;

		uint32_t d = static_cast<uint32_t>(32.0f*4.0f*dl*dl + 32.0f*2.0f*(.5f / (1.0f - .2126f))*(.5f / (1.0f - .2126f))*dcr*dcr + 32.0f*.25f*(.5f / (1.0f - .0722f))*(.5f / (1.0f - .0722f))*dcb*dcb);

		if (alpha)
		{
			int da = static_cast<int>(e1.a) - static_cast<int>(e2.a);
			d += static_cast<uint32_t>(128.0f*da*da);
		}

		return d;
#elif 1
		// Default integer path: fixed-point luma/chroma deltas (7 fractional
		// bits; luma weights 27/92/9 sum to 128).
		int dr = e1.r - e2.r;
		int dg = e1.g - e2.g;
		int db = e1.b - e2.b;

#if 0
		// Disabled 32-bit variant, kept for reference.
		int delta_l = dr * 27 + dg * 92 + db * 9;
		int delta_cr = dr * 128 - delta_l;
		int delta_cb = db * 128 - delta_l;

		uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) +
			((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) +
			((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U);
#else
		// 64-bit intermediates so squaring the deltas can't overflow.
		int64_t delta_l = dr * 27 + dg * 92 + db * 9;
		int64_t delta_cr = dr * 128 - delta_l;
		int64_t delta_cb = db * 128 - delta_l;

		uint32_t id = ((uint32_t)((delta_l * delta_l) >> 7U)) +
			((((uint32_t)((delta_cr * delta_cr) >> 7U)) * 26U) >> 7U) +
			((((uint32_t)((delta_cb * delta_cb) >> 7U)) * 3U) >> 7U);
#endif

		if (alpha)
		{
			int da = (e1.a - e2.a) << 7;
			// This shouldn't overflow if da is 255 or -255: 29.99 bits after squaring.
			id += ((uint32_t)(da * da) >> 7U);
		}

		return id;
#else
		// Alternate 64-bit scaled variant (currently compiled out).
		int dr = e1.r - e2.r;
		int dg = e1.g - e2.g;
		int db = e1.b - e2.b;

		int64_t delta_l = dr * 27 + dg * 92 + db * 9;
		int64_t delta_cr = dr * 128 - delta_l;
		int64_t delta_cb = db * 128 - delta_l;

		int64_t id = ((delta_l * delta_l) * 128) +
			((delta_cr * delta_cr) * 26) +
			((delta_cb * delta_cb) * 3);

		if (alpha)
		{
			int64_t da = (e1.a - e2.a);
			id += (da * da) * 128;
		}

		int d = (id + 8192) >> 14;

		return d;
#endif
	}
	else
		return color_distance(e1, e2, alpha);
}
1161
1162
// Squared distance over the .r and .a channels only (luma/alpha style pairs).
static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b)
{
	const int delta_l = a.r - b.r;
	const int delta_a = a.a - b.a;
	return (uint32_t)((delta_l * delta_l) + (delta_a * delta_a));
}
1168
1169
// String helpers
1170
1171
// Index of the last occurrence of c in filename, or -1 when absent.
inline int string_find_right(const std::string& filename, char c)
{
	const size_t pos = filename.find_last_of(c);
	if (pos == std::string::npos)
		return -1;
	return (int)pos;
}
1176
1177
inline std::string string_get_extension(const std::string &filename)
1178
{
1179
int sep = -1;
1180
#ifdef _WIN32
1181
sep = string_find_right(filename, '\\');
1182
#endif
1183
if (sep < 0)
1184
sep = string_find_right(filename, '/');
1185
1186
int dot = string_find_right(filename, '.');
1187
if (dot <= sep)
1188
return "";
1189
1190
std::string result(filename);
1191
result.erase(0, dot + 1);
1192
1193
return result;
1194
}
1195
1196
inline bool string_remove_extension(std::string &filename)
1197
{
1198
int sep = -1;
1199
#ifdef _WIN32
1200
sep = string_find_right(filename, '\\');
1201
#endif
1202
if (sep < 0)
1203
sep = string_find_right(filename, '/');
1204
1205
int dot = string_find_right(filename, '.');
1206
if ((dot < sep) || (dot < 0))
1207
return false;
1208
1209
filename.resize(dot);
1210
1211
return true;
1212
}
1213
1214
// Returns a lowercased copy of s; the input is not modified.
inline std::string string_tolower(const std::string& s)
{
	std::string result(s);
	for (char& ch : result)
	{
		// Cast through uint8_t so high-bit chars aren't passed as negative ints.
		ch = (char)tolower((uint8_t)ch);
	}
	return result;
}
1223
1224
// Bounded strcpy: copies pSrc into pDst (capacity dst_len), truncating if
// necessary. The destination is always NUL-terminated when dst_len > 0.
// Returns pDst.
inline char *strcpy_safe(char *pDst, size_t dst_len, const char *pSrc)
{
	assert(pDst && pSrc && dst_len);
	if (!dst_len)
		return pDst;

	const size_t src_len = strlen(pSrc);
	if (src_len < dst_len)
	{
		// Whole string plus terminator fits.
		memcpy(pDst, pSrc, src_len + 1);
	}
	else
	{
		// Truncate: copy what fits, then terminate manually.
		if (dst_len > 1)
			memcpy(pDst, pSrc, dst_len - 1);
		pDst[dst_len - 1] = '\0';
	}

	return pDst;
}
1244
1245
// True when s is non-empty and its final character is c.
inline bool string_ends_with(const std::string& s, char c)
{
	if (s.empty())
		return false;
	return s.back() == c;
}
1249
1250
// Splits path p into drive, directory, filename (sans extension) and extension
// components. Any output pointer may be NULL to skip that component.
// Returns false on failure.
inline bool string_split_path(const char *p, std::string *pDrive, std::string *pDir, std::string *pFilename, std::string *pExt)
{
#ifdef _MSC_VER
	char drive_buf[_MAX_DRIVE] = { 0 };
	char dir_buf[_MAX_DIR] = { 0 };
	char fname_buf[_MAX_FNAME] = { 0 };
	char ext_buf[_MAX_EXT] = { 0 };

	// _splitpath_s only fills the buffers we pass; unused ones get NULL/0.
	errno_t error = _splitpath_s(p,
		pDrive ? drive_buf : NULL, pDrive ? _MAX_DRIVE : 0,
		pDir ? dir_buf : NULL, pDir ? _MAX_DIR : 0,
		pFilename ? fname_buf : NULL, pFilename ? _MAX_FNAME : 0,
		pExt ? ext_buf : NULL, pExt ? _MAX_EXT : 0);
	if (error != 0)
		return false;

	if (pDrive) *pDrive = drive_buf;
	if (pDir) *pDir = dir_buf;
	if (pFilename) *pFilename = fname_buf;
	if (pExt) *pExt = ext_buf;
	return true;
#else
	// POSIX dirname()/basename() may modify their argument, so operate on
	// local copies (paths longer than 1023 chars are truncated by strcpy_safe).
	char dirtmp[1024], nametmp[1024];
	strcpy_safe(dirtmp, sizeof(dirtmp), p);
	strcpy_safe(nametmp, sizeof(nametmp), p);

	// POSIX paths have no drive component.
	if (pDrive)
		pDrive->resize(0);

	const char *pDirName = dirname(dirtmp);
	const char* pBaseName = basename(nametmp);
	if ((!pDirName) || (!pBaseName))
		return false;

	if (pDir)
	{
		*pDir = pDirName;
		// dirname() omits the trailing separator; add it back for callers.
		if ((pDir->size()) && (pDir->back() != '/'))
			*pDir += "/";
	}

	if (pFilename)
	{
		*pFilename = pBaseName;
		string_remove_extension(*pFilename);
	}

	if (pExt)
	{
		*pExt = pBaseName;
		*pExt = string_get_extension(*pExt);
		// Return the extension with a leading '.', matching _splitpath_s.
		if (pExt->size())
			*pExt = "." + *pExt;
	}

	return true;
#endif
}
1308
1309
// True when c separates path components ('/' everywhere; also '\\' on Windows).
inline bool is_path_separator(char c)
{
#ifdef _WIN32
	return (c == '/') || (c == '\\');
#else
	return c == '/';
#endif
}
1317
1318
// True when c is a drive separator (the ':' in "C:"); always false on
// non-Windows platforms, which have no drive letters.
inline bool is_drive_separator(char c)
{
#ifdef _WIN32
	return c == ':';
#else
	(void)c;
	return false;
#endif
}
1327
1328
inline void string_combine_path(std::string &dst, const char *p, const char *q)
1329
{
1330
std::string temp(p);
1331
if (temp.size() && !is_path_separator(q[0]))
1332
{
1333
if (!is_path_separator(temp.back()))
1334
temp.append(1, BASISU_PATH_SEPERATOR_CHAR);
1335
}
1336
temp += q;
1337
dst.swap(temp);
1338
}
1339
1340
inline void string_combine_path(std::string &dst, const char *p, const char *q, const char *r)
1341
{
1342
string_combine_path(dst, p, q);
1343
string_combine_path(dst, dst.c_str(), r);
1344
}
1345
1346
inline void string_combine_path_and_extension(std::string &dst, const char *p, const char *q, const char *r, const char *pExt)
1347
{
1348
string_combine_path(dst, p, q, r);
1349
if ((!string_ends_with(dst, '.')) && (pExt[0]) && (pExt[0] != '.'))
1350
dst.append(1, '.');
1351
dst.append(pExt);
1352
}
1353
1354
inline bool string_get_pathname(const char *p, std::string &path)
1355
{
1356
std::string temp_drive, temp_path;
1357
if (!string_split_path(p, &temp_drive, &temp_path, NULL, NULL))
1358
return false;
1359
string_combine_path(path, temp_drive.c_str(), temp_path.c_str());
1360
return true;
1361
}
1362
1363
inline bool string_get_filename(const char *p, std::string &filename)
1364
{
1365
std::string temp_ext;
1366
if (!string_split_path(p, nullptr, nullptr, &filename, &temp_ext))
1367
return false;
1368
filename += temp_ext;
1369
return true;
1370
}
1371
1372
// Small deterministic PRNG wrapper around std::mt19937.
class rand
{
	std::mt19937 m_rng;

public:
	rand() { }

	rand(uint32_t s) { seed(s); }

	void seed(uint32_t s) { m_rng.seed(s); }

	// Uniform integer in the closed interval [l, h].
	int irand(int l, int h)
	{
		std::uniform_int_distribution<int> dist(l, h);
		return dist(m_rng);
	}

	uint32_t urand32() { return static_cast<uint32_t>(irand(INT32_MIN, INT32_MAX)); }

	bool bit() { return irand(0, 1) == 1; }

	uint8_t byte() { return static_cast<uint8_t>(urand32()); }

	// Uniform float in the half-open interval [l, h).
	float frand(float l, float h)
	{
		std::uniform_real_distribution<float> dist(l, h);
		return dist(m_rng);
	}

	// Normally distributed float with the given mean/stddev.
	float gaussian(float mean, float stddev)
	{
		std::normal_distribution<float> dist(mean, stddev);
		return dist(m_rng);
	}
};
1396
1397
// Max-heap priority queue of (index, priority) pairs. The heap is stored
// 1-based in m_heap (slot 0 is unused); get_top_*() return the entry with the
// highest priority. Used below by tree_vector_quant to always split the node
// with the largest variance next.
class priority_queue
{
public:
	priority_queue() :
		m_size(0)
	{
	}

	void clear()
	{
		m_heap.clear();
		m_size = 0;
	}

	// Reserves room for max_entries and seeds the heap with a single entry.
	void init(uint32_t max_entries, uint32_t first_index, float first_priority)
	{
		m_heap.resize(max_entries + 1);
		m_heap[1].m_index = first_index;
		m_heap[1].m_priority = first_priority;
		m_size = 1;
	}

	inline uint32_t size() const { return m_size; }

	// Root of the heap: the entry with the highest priority.
	inline uint32_t get_top_index() const { return m_heap[1].m_index; }
	inline float get_top_priority() const { return m_heap[1].m_priority; }

	inline void delete_top()
	{
		assert(m_size > 0);
		// Move the last entry into the root, then restore the heap property.
		m_heap[1] = m_heap[m_size];
		m_size--;
		if (m_size)
			down_heap(1);
	}

	inline void add_heap(uint32_t index, float priority)
	{
		m_size++;

		uint32_t k = m_size;

		// Grow on demand if init()'s reservation is exceeded.
		if (m_size >= m_heap.size())
			m_heap.resize(m_size + 1);

		// Sift the new entry up until its parent has a strictly higher priority.
		for (;;)
		{
			uint32_t parent_index = k >> 1;
			if ((!parent_index) || (m_heap[parent_index].m_priority > priority))
				break;
			m_heap[k] = m_heap[parent_index];
			k = parent_index;
		}

		m_heap[k].m_index = index;
		m_heap[k].m_priority = priority;
	}

private:
	struct entry
	{
		uint32_t m_index;
		float m_priority;
	};

	basisu::vector<entry> m_heap;
	uint32_t m_size;

	// Push down entry at index
	inline void down_heap(uint32_t heap_index)
	{
		uint32_t orig_index = m_heap[heap_index].m_index;
		const float orig_priority = m_heap[heap_index].m_priority;

		// Descend, promoting the larger child, until the saved entry dominates.
		uint32_t child_index;
		while ((child_index = (heap_index << 1)) <= m_size)
		{
			if ((child_index < m_size) && (m_heap[child_index].m_priority < m_heap[child_index + 1].m_priority)) ++child_index;
			if (orig_priority > m_heap[child_index].m_priority)
				break;
			m_heap[heap_index] = m_heap[child_index];
			heap_index = child_index;
		}

		m_heap[heap_index].m_index = orig_index;
		m_heap[heap_index].m_priority = orig_priority;
	}
};
1485
1486
// Tree structured vector quantization (TSVQ)
1487
1488
// Tree structured vector quantizer. Weighted training vectors are added, then
// generate() repeatedly splits the leaf node with the highest variance (via a
// PCA-derived axis + k-means refinement) until the requested codebook size is
// reached. The retrieve() overloads read back the resulting clusters.
template <typename TrainingVectorType>
class tree_vector_quant
{
public:
	typedef TrainingVectorType training_vec_type;
	typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
	typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;

	tree_vector_quant() :
		m_next_codebook_index(0)
	{
	}

	void clear()
	{
		clear_vector(m_training_vecs);
		clear_vector(m_nodes);
		m_next_codebook_index = 0;
	}

	// Adds one training vector with the given importance weight.
	void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }

	size_t get_total_training_vecs() const { return m_training_vecs.size(); }
	const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; }
	array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; }

	// Appends each leaf's training-vector index list to the codebook.
	void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
	{
		for (uint32_t i = 0; i < m_nodes.size(); i++)
		{
			const tsvq_node &n = m_nodes[i];
			if (!n.is_leaf())
				continue;

			codebook.resize(codebook.size() + 1);
			codebook.back() = n.m_training_vecs;
		}
	}

	// Appends each leaf's centroid (m_origin) to the codebook.
	void retrieve(basisu::vector<TrainingVectorType> &codebook) const
	{
		for (uint32_t i = 0; i < m_nodes.size(); i++)
		{
			const tsvq_node &n = m_nodes[i];
			if (!n.is_leaf())
				continue;

			codebook.resize(codebook.size() + 1);
			codebook.back() = n.m_origin;
		}
	}

	// Retrieves at most max_clusters clusters by walking the split tree
	// depth-first; internal nodes split "too late" (codebook index beyond the
	// limit) are emitted whole instead of being descended into.
	void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
	{
		uint_vec node_stack;
		node_stack.reserve(512);

		codebook.resize(0);
		codebook.reserve(max_clusters);

		uint32_t node_index = 0;

		while (true)
		{
			const tsvq_node& cur = m_nodes[node_index];

			// Emit this subtree as a single cluster when it's a leaf, or when
			// descending would exceed the cluster budget.
			if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))
			{
				codebook.resize(codebook.size() + 1);
				codebook.back() = cur.m_training_vecs;

				if (node_stack.empty())
					break;

				node_index = node_stack.back();
				node_stack.pop_back();
				continue;
			}

			// Visit left child now; remember right child for later.
			node_stack.push_back(cur.m_right_index);
			node_index = cur.m_left_index;
		}
	}

	// Builds the split tree with up to max_size leaves.
	// Returns false when there are no training vectors.
	bool generate(uint32_t max_size)
	{
		if (!m_training_vecs.size())
			return false;

		m_next_codebook_index = 0;

		clear_vector(m_nodes);
		m_nodes.reserve(max_size * 2 + 1);

		m_nodes.push_back(prepare_root());

		// Heap keyed on node variance: always split the worst node next.
		priority_queue var_heap;
		var_heap.init(max_size, 0, m_nodes[0].m_var);

		basisu::vector<uint32_t> l_children, r_children;

		// Now split the worst nodes
		l_children.reserve(m_training_vecs.size() + 1);
		r_children.reserve(m_training_vecs.size() + 1);

		uint32_t total_leaf_nodes = 1;

		//interval_timer tm;
		//tm.start();

		while ((var_heap.size()) && (total_leaf_nodes < max_size))
		{
			const uint32_t node_index = var_heap.get_top_index();
			const tsvq_node &node = m_nodes[node_index];

			assert(node.m_var == var_heap.get_top_priority());
			assert(node.is_leaf());

			var_heap.delete_top();

			if (node.m_training_vecs.size() > 1)
			{
				if (split_node(node_index, var_heap, l_children, r_children))
				{
					// This removes one leaf node (making an internal node) and replaces it with two new leaves, so +1 total.
					total_leaf_nodes += 1;
				}
			}
		}

		//debug_printf("tree_vector_quant::generate %u: %3.3f secs\n", TrainingVectorType::num_elements, tm.get_elapsed_secs());

		return true;
	}

private:
	// One node of the split tree. Leaves (m_left_index < 0) own a list of
	// training-vector indices; internal nodes record their split order in
	// m_codebook_index.
	class tsvq_node
	{
	public:
		// Note: m_var is not initialized here; it's always set by set() or by
		// prepare_root() before use.
		inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }

		// vecs is erased
		inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }

		inline bool is_leaf() const { return m_left_index < 0; }

		float m_var;
		uint64_t m_weight;
		TrainingVectorType m_origin;
		int32_t m_left_index, m_right_index;
		basisu::vector<uint32_t> m_training_vecs;
		int m_codebook_index;
	};

	typedef basisu::vector<tsvq_node> tsvq_node_vec;
	tsvq_node_vec m_nodes;

	array_of_weighted_training_vecs m_training_vecs;

	uint32_t m_next_codebook_index;

	// Builds the root node: weighted centroid and total variance over all
	// training vectors.
	tsvq_node prepare_root() const
	{
		double ttsum = 0.0f;

		// Prepare root node containing all training vectors
		tsvq_node root;
		root.m_training_vecs.reserve(m_training_vecs.size());

		for (uint32_t i = 0; i < m_training_vecs.size(); i++)
		{
			const TrainingVectorType &v = m_training_vecs[i].first;
			const uint64_t weight = m_training_vecs[i].second;

			root.m_training_vecs.push_back(i);

			root.m_origin += (v * static_cast<float>(weight));
			root.m_weight += weight;

			ttsum += v.dot(v) * weight;
		}

		// var = E[v.v] - mean.mean (scaled by total weight).
		root.m_var = static_cast<float>(ttsum - (root.m_origin.dot(root.m_origin) / root.m_weight));

		root.m_origin *= (1.0f / root.m_weight);

		return root;
	}

	// Splits node_index into two children and pushes splittable children onto
	// var_heap. Returns false when no usable split was found.
	bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)
	{
		TrainingVectorType l_child_org, r_child_org;
		uint64_t l_weight = 0, r_weight = 0;
		float l_var = 0.0f, r_var = 0.0f;

		// Compute initial left/right child origins
		if (!prep_split(m_nodes[node_index], l_child_org, r_child_org))
			return false;

		// Use k-means iterations to refine these children vectors
		if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children))
			return false;

		// Create children
		const uint32_t l_child_index = (uint32_t)m_nodes.size(), r_child_index = (uint32_t)m_nodes.size() + 1;

		m_nodes[node_index].m_left_index = l_child_index;
		m_nodes[node_index].m_right_index = r_child_index;

		// Record split order; retrieve(max_clusters, ...) uses this to honor
		// the cluster budget.
		m_nodes[node_index].m_codebook_index = m_next_codebook_index;
		m_next_codebook_index++;

		m_nodes.resize(m_nodes.size() + 2);

		tsvq_node &l_child = m_nodes[l_child_index], &r_child = m_nodes[r_child_index];

		l_child.set(l_child_org, l_weight, l_var, l_children);
		r_child.set(r_child_org, r_weight, r_var, r_children);

		// A zero/negative variance with multiple distinct vectors means the
		// variance underflowed numerically; nudge it so the child can still be
		// split later.
		if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1))
		{
			TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first);

			for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++)
			{
				if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first))
				{
					l_child.m_var = 1e-4f;
					break;
				}
			}
		}

		if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1))
		{
			TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first);

			for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++)
			{
				if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first))
				{
					r_child.m_var = 1e-4f;
					break;
				}
			}
		}

		// Only children that can still be subdivided go back on the heap.
		if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1))
			var_heap.add_heap(l_child_index, l_child.m_var);

		if ((r_child.m_var > 0.0f) && (r_child.m_training_vecs.size() > 1))
			var_heap.add_heap(r_child_index, r_child.m_var);

		return true;
	}

	// Returns the principal axis (largest-eigenvalue eigenvector) of the
	// node's weighted covariance matrix.
	TrainingVectorType compute_split_axis(const tsvq_node &node) const
	{
		const uint32_t N = TrainingVectorType::num_elements;

		matrix<N, N, float> cmatrix;

		if ((N != 16) || (!g_cpu_supports_sse41))
		{
			cmatrix.set_zero();

			// Compute covariance matrix from weighted input vectors
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);
				const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);

				// Only the upper triangle is accumulated; mirrored below.
				for (uint32_t x = 0; x < N; x++)
					for (uint32_t y = x; y < N; y++)
						cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];
			}
		}
		else
		{
#if BASISU_SUPPORT_SSE
			// Specialize the case with 16x16 matrices, which are quite expensive without SIMD.
			// This SSE function takes pointers to void types, so do some sanity checks.
			assert(sizeof(TrainingVectorType) == sizeof(float) * 16);
			assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>));
			update_covar_matrix_16x16_sse41(node.m_training_vecs.size_u32(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);
#endif
		}

		const float renorm_scale = 1.0f / node.m_weight;

		for (uint32_t x = 0; x < N; x++)
			for (uint32_t y = x; y < N; y++)
				cmatrix[x][y] *= renorm_scale;

		// Diagonal flip
		for (uint32_t x = 0; x < (N - 1); x++)
			for (uint32_t y = x + 1; y < N; y++)
				cmatrix[y][x] = cmatrix[x][y];

		return compute_pca_from_covar<N, TrainingVectorType>(cmatrix);
	}

	// Computes initial left/right child centroids by partitioning the node's
	// vectors on the sign of their projection onto the principal axis, with a
	// median-split fallback when that puts everything on one side.
	bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
	{
		//const uint32_t N = TrainingVectorType::num_elements;

		// Trivial case: two vectors become the two children directly.
		if (2 == node.m_training_vecs.size())
		{
			l_child_result = m_training_vecs[node.m_training_vecs[0]].first;
			r_child_result = m_training_vecs[node.m_training_vecs[1]].first;
			return true;
		}

		TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f);
		double l_weight = 0.0f, r_weight = 0.0f;

		// Compute initial left/right children
		for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
		{
			const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;

			const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;

			double t = (v - node.m_origin).dot(axis);
			if (t >= 0.0f)
			{
				r_child += v * weight;
				r_weight += weight;
			}
			else
			{
				l_child += v * weight;
				l_weight += weight;
			}
		}

		if ((l_weight > 0.0f) && (r_weight > 0.0f))
		{
			l_child_result = l_child * static_cast<float>(1.0f / l_weight);
			r_child_result = r_child * static_cast<float>(1.0f / r_weight);
		}
		else
		{
			// PCA split degenerated: fall back to splitting on the component
			// with the largest value range.
			TrainingVectorType l(1e+20f);
			TrainingVectorType h(-1e+20f);
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;

				l = TrainingVectorType::component_min(l, v);
				h = TrainingVectorType::component_max(h, v);
			}

			TrainingVectorType r(h - l);

			float largest_axis_v = 0.0f;
			int largest_axis_index = -1;
			for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++)
			{
				if (r[i] > largest_axis_v)
				{
					largest_axis_v = r[i];
					largest_axis_index = i;
				}
			}

			// All vectors identical on every axis: unsplittable here.
			if (largest_axis_index < 0)
				return false;

			basisu::vector<float> keys(node.m_training_vecs.size());
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];

			uint_vec indices(node.m_training_vecs.size());
			indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]);

			// NOTE(review): `indices` produced by indirect_sort is never read
			// below — the halves are taken in the node's original vector order,
			// not sorted order. Possibly the loop was meant to index through
			// indices[i]; verify against upstream basis_universal.
			l_child.set_zero();
			l_weight = 0;

			r_child.set_zero();
			r_weight = 0;

			const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2;
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;

				const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;

				if (i < half_index)
				{
					l_child += v * weight;
					l_weight += weight;
				}
				else
				{
					r_child += v * weight;
					r_weight += weight;
				}
			}

			if ((l_weight > 0.0f) && (r_weight > 0.0f))
			{
				l_child_result = l_child * static_cast<float>(1.0f / l_weight);
				r_child_result = r_child * static_cast<float>(1.0f / r_weight);
			}
			else
			{
				// Last resort: use the component-wise min/max as the children.
				l_child_result = l;
				r_child_result = h;
			}
		}

		return true;
	}

	// Refines the two child centroids with up to 6 k-means iterations,
	// assigning each vector to its nearer child. Outputs the final centroids,
	// weights, variances and membership lists. Returns false when the node
	// can't be split into two non-empty children.
	bool refine_split(const tsvq_node &node,
		TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,
		TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const
	{
		l_children.reserve(node.m_training_vecs.size());
		r_children.reserve(node.m_training_vecs.size());

		float prev_total_variance = 1e+10f;

		// Refine left/right children locations using k-means iterations
		const uint32_t cMaxIters = 6;
		for (uint32_t iter = 0; iter < cMaxIters; iter++)
		{
			l_children.resize(0);
			r_children.resize(0);

			TrainingVectorType new_l_child(cZero), new_r_child(cZero);

			double l_ttsum = 0.0f, r_ttsum = 0.0f;

			l_weight = 0;
			r_weight = 0;

			// Assignment step: each vector goes to its nearer centroid.
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;
				const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;

				double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v);

				if (left_dist2 >= right_dist2)
				{
					new_r_child += (v * static_cast<float>(weight));
					r_weight += weight;

					r_ttsum += weight * v.dot(v);
					r_children.push_back(node.m_training_vecs[i]);
				}
				else
				{
					new_l_child += (v * static_cast<float>(weight));
					l_weight += weight;

					l_ttsum += weight * v.dot(v);
					l_children.push_back(node.m_training_vecs[i]);
				}
			}

			// Node is unsplittable using the above algorithm - try something else to split it up.
			if ((!l_weight) || (!r_weight))
			{
				l_children.resize(0);
				new_l_child.set(0.0f);
				l_ttsum = 0.0f;
				l_weight = 0;

				r_children.resize(0);
				new_r_child.set(0.0f);
				r_ttsum = 0.0f;
				r_weight = 0;

				// Fallback partition: vectors equal to the first vector go
				// right, everything else goes left.
				TrainingVectorType firstVec;
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				{
					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
					const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;

					if ((!i) || (v == firstVec))
					{
						firstVec = v;

						new_r_child += (v * static_cast<float>(weight));
						r_weight += weight;

						r_ttsum += weight * v.dot(v);
						r_children.push_back(node.m_training_vecs[i]);
					}
					else
					{
						new_l_child += (v * static_cast<float>(weight));
						l_weight += weight;

						l_ttsum += weight * v.dot(v);
						l_children.push_back(node.m_training_vecs[i]);
					}
				}

				// All vectors identical: genuinely unsplittable.
				if ((!l_weight) || (!r_weight))
					return false;
			}

			l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight));
			r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight));

			new_l_child *= (1.0f / l_weight);
			new_r_child *= (1.0f / r_weight);

			l_child = new_l_child;
			r_child = new_r_child;

			float total_var = l_var + r_var;
			const float cGiveupVariance = .00001f;
			if (total_var < cGiveupVariance)
				break;

			// Check to see if the variance has settled
			const float cVarianceDeltaThresh = .00125f;
			if (((prev_total_variance - total_var) / total_var) < cVarianceDeltaThresh)
				break;

			prev_total_variance = total_var;
		}

		return true;
	}
};
2020
2021
// A group of identical training vectors collapsed into one entry:
// the summed weight plus the original training-vector indices it covers.
struct weighted_block_group
{
	uint64_t m_total_weight;
	uint_vec m_indices;
};
2026
2027
// Runs hierarchical codebook generation, optionally splitting the work across
// up to 16 threads via pJob_pool. For small inputs (or when threading can't
// pay off) it quantizes serially. Otherwise it first clusters the training
// vectors into one top-level cluster per thread, then runs an independent
// quantizer on each cluster in parallel and concatenates the results.
// Returns false if any quantizer fails.
template<typename Quantizer>
bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,
	uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
	basisu::vector<uint_vec>& codebook,
	basisu::vector<uint_vec>& parent_codebook,
	uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)
{
	codebook.resize(0);
	parent_codebook.resize(0);

	// Serial path: too little work, or too few clusters per thread to be worth it.
	if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16))
	{
		if (!q.generate(max_codebook_size))
			return false;

		q.retrieve(codebook);

		if (max_parent_codebook_size)
			q.retrieve(max_parent_codebook_size, parent_codebook);

		return true;
	}

	const uint32_t cMaxThreads = 16;
	if (max_threads > cMaxThreads)
		max_threads = cMaxThreads;

	// First level: split the input into one coarse cluster per thread.
	if (!q.generate(max_threads))
		return false;

	basisu::vector<uint_vec> initial_codebook;

	q.retrieve(initial_codebook);

	// Not enough top-level clusters to parallelize; the coarse result is final.
	if (initial_codebook.size() < max_threads)
	{
		codebook = initial_codebook;

		if (max_parent_codebook_size)
			q.retrieve(max_parent_codebook_size, parent_codebook);

		return true;
	}

	Quantizer quantizers[cMaxThreads];

	bool success_flags[cMaxThreads];
	clear_obj(success_flags);

	basisu::vector<uint_vec> local_clusters[cMaxThreads];
	basisu::vector<uint_vec> local_parent_clusters[cMaxThreads];

	for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
	{
		// Each job works only on its own thread_iter slot of the shared
		// arrays, so no synchronization is needed beyond wait_for_all().
		pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {

			Quantizer& lq = quantizers[thread_iter];
			uint_vec& cluster_indices = initial_codebook[thread_iter];

			// Local quantizer indices -> global training vector indices.
			uint_vec local_to_global(cluster_indices.size());

			for (uint32_t i = 0; i < cluster_indices.size(); i++)
			{
				const uint32_t global_training_vec_index = cluster_indices[i];
				local_to_global[i] = global_training_vec_index;

				lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second);
			}

			// Each thread gets an equal share of the cluster budget, or is
			// unbounded when limit_clusterizers is false.
			const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs();

			success_flags[thread_iter] = lq.generate(max_clusters);

			if (success_flags[thread_iter])
			{
				lq.retrieve(local_clusters[thread_iter]);

				// Remap local indices back to global training vector indices.
				for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++)
				{
					for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++)
						local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]];
				}

				if (max_parent_codebook_size)
				{
					lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]);

					for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++)
					{
						for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++)
							local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]];
					}
				}
			}

		} );

	} // thread_iter

	pJob_pool->wait_for_all();

	uint32_t total_clusters = 0, total_parent_clusters = 0;

	for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++)
	{
		if (!success_flags[thread_iter])
			return false;
		total_clusters += (uint32_t)local_clusters[thread_iter].size();
		total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size();
	}

	codebook.reserve(total_clusters);
	parent_codebook.reserve(total_parent_clusters);

	// Concatenate the per-thread results (swap avoids copying each cluster).
	for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
	{
		for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++)
		{
			codebook.resize(codebook.size() + 1);
			codebook.back().swap(local_clusters[thread_iter][j]);
		}

		for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++)
		{
			parent_codebook.resize(parent_codebook.size() + 1);
			parent_codebook.back().swap(local_parent_clusters[thread_iter][j]);
		}
	}

	return true;
}
2158
2159
// Builds a hierarchical codebook (and optional parent codebook) from q's training vectors,
// using pJob_pool with up to max_threads worker threads.
//
// The quantizer's training vectors are first deduplicated: identical vectors are merged into a
// single weighted group (summing their weights and remembering every original training-vector
// index). Clusterization then runs on the much smaller set of unique vectors, and the resulting
// group indices are expanded back to original training-vector indices on output.
//
// If even_odd_input_pairs_equal is true, the caller guarantees the training vectors come in
// identical even/odd pairs (vec[2i] == vec[2i+1]), which lets the dedup loop process two
// entries at a time.
//
// Returns true on success, false if the internal clusterization fails.
template<typename Quantizer>
bool generate_hierarchical_codebook_threaded(Quantizer& q,
	uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
	basisu::vector<uint_vec>& codebook,
	basisu::vector<uint_vec>& parent_codebook,
	uint32_t max_threads, job_pool *pJob_pool,
	bool even_odd_input_pairs_equal)
{
	// rg 6/24/2025 - Cross platform determinism
	// A hash map iterates in an unspecified (platform/implementation dependent) order, which made
	// the output differ across platforms. std::map's deterministic ordering is used instead.
#if 0
	typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;
	typedef std::unordered_map < typename Quantizer::training_vec_type, weighted_block_group,
		training_vec_bit_hasher> group_hash;
#else
	typedef std::map< typename Quantizer::training_vec_type, weighted_block_group > group_hash;
#endif

	//interval_timer tm;
	//tm.start();

	// Maps each unique training vector to its merged weight and the list of original indices.
	group_hash unique_vecs;

	// rg 6/24/2025 - Cross platform determinism
#if 0
	unique_vecs.reserve(20000);
#endif

	weighted_block_group g;

	if (even_odd_input_pairs_equal)
	{
		// Inputs come in identical even/odd pairs, so each insert covers two indices.
		g.m_indices.resize(2);

		assert(q.get_training_vecs().size() >= 2 && (q.get_training_vecs().size() & 1) == 0);

		for (uint32_t i = 0; i < q.get_training_vecs().size(); i += 2)
		{
			assert(q.get_training_vecs()[i].first == q.get_training_vecs()[i + 1].first);

			g.m_total_weight = q.get_training_vecs()[i].second + q.get_training_vecs()[i + 1].second;
			g.m_indices[0] = i;
			g.m_indices[1] = i + 1;

			auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));

			if (!ins_res.second)
			{
				// Vector already seen: fold this pair's weight and indices into the existing group.
				(ins_res.first)->second.m_total_weight += g.m_total_weight;
				(ins_res.first)->second.m_indices.push_back(i);
				(ins_res.first)->second.m_indices.push_back(i + 1);
			}
		}
	}
	else
	{
		g.m_indices.resize(1);

		for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)
		{
			g.m_total_weight = q.get_training_vecs()[i].second;
			g.m_indices[0] = i;

			auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));

			if (!ins_res.second)
			{
				// Duplicate vector: accumulate its weight and original index.
				(ins_res.first)->second.m_total_weight += g.m_total_weight;
				(ins_res.first)->second.m_indices.push_back(i);
			}
		}
	}

	//debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors, %3.3f secs\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size(), tm.get_elapsed_secs());
	debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size());

	// Build a secondary quantizer over the unique vectors only, remembering each map iterator
	// so group indices can be translated back to the merged index lists afterwards.
	Quantizer group_quant;
	typedef typename group_hash::const_iterator group_hash_const_iter;
	basisu::vector<group_hash_const_iter> unique_vec_iters;
	unique_vec_iters.reserve(unique_vecs.size());

	for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)
	{
		group_quant.add_training_vec(iter->first, iter->second.m_total_weight);
		unique_vec_iters.push_back(iter);
	}

	// Only cap the clusterizer output when there are more unique vectors than codebook entries.
	bool limit_clusterizers = true;
	if (unique_vecs.size() <= max_codebook_size)
		limit_clusterizers = false;

	debug_printf("Limit clusterizers: %u\n", limit_clusterizers);

	// Small inputs run single threaded; the threading overhead isn't worth it below this size.
	basisu::vector<uint_vec> group_codebook, group_parent_codebook;
	bool status = generate_hierarchical_codebook_threaded_internal(group_quant,
		max_codebook_size, max_parent_codebook_size,
		group_codebook,
		group_parent_codebook,
		(unique_vecs.size() < 65536*4) ? 1 : max_threads, limit_clusterizers, pJob_pool);

	if (!status)
		return false;

	// Expand each codebook entry's group indices back to the original training-vector indices.
	codebook.resize(0);
	for (uint32_t i = 0; i < group_codebook.size(); i++)
	{
		codebook.resize(codebook.size() + 1);

		for (uint32_t j = 0; j < group_codebook[i].size(); j++)
		{
			const uint32_t group_index = group_codebook[i][j];

			typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
			const uint_vec& training_vec_indices = group_iter->second.m_indices;

			append_vector(codebook.back(), training_vec_indices);
		}
	}

	// Same expansion for the parent codebook.
	parent_codebook.resize(0);
	for (uint32_t i = 0; i < group_parent_codebook.size(); i++)
	{
		parent_codebook.resize(parent_codebook.size() + 1);

		for (uint32_t j = 0; j < group_parent_codebook[i].size(); j++)
		{
			const uint32_t group_index = group_parent_codebook[i][j];

			typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
			const uint_vec& training_vec_indices = group_iter->second.m_indices;

			append_vector(parent_codebook.back(), training_vec_indices);
		}
	}

	return true;
}
2295
2296
// Canonical Huffman coding
2297
2298
class histogram
2299
{
2300
basisu::vector<uint32_t> m_hist;
2301
2302
public:
2303
histogram(uint32_t size = 0) { init(size); }
2304
2305
void clear()
2306
{
2307
clear_vector(m_hist);
2308
}
2309
2310
void init(uint32_t size)
2311
{
2312
m_hist.resize(0);
2313
m_hist.resize(size);
2314
}
2315
2316
inline uint32_t size() const { return static_cast<uint32_t>(m_hist.size()); }
2317
2318
inline const uint32_t &operator[] (uint32_t index) const
2319
{
2320
return m_hist[index];
2321
}
2322
2323
inline uint32_t &operator[] (uint32_t index)
2324
{
2325
return m_hist[index];
2326
}
2327
2328
inline void inc(uint32_t index)
2329
{
2330
m_hist[index]++;
2331
}
2332
2333
uint64_t get_total() const
2334
{
2335
uint64_t total = 0;
2336
for (uint32_t i = 0; i < m_hist.size(); ++i)
2337
total += m_hist[i];
2338
return total;
2339
}
2340
2341
double get_entropy() const
2342
{
2343
double total = static_cast<double>(get_total());
2344
if (total == 0.0f)
2345
return 0.0f;
2346
2347
const double inv_total = 1.0f / total;
2348
const double neg_inv_log2 = -1.0f / log(2.0f);
2349
2350
double e = 0.0f;
2351
for (uint32_t i = 0; i < m_hist.size(); i++)
2352
if (m_hist[i])
2353
e += log(m_hist[i] * inv_total) * neg_inv_log2 * static_cast<double>(m_hist[i]);
2354
2355
return e;
2356
}
2357
};
2358
2359
// Symbol/value pair used while building canonical Huffman codes.
struct sym_freq
{
	uint32_t m_key;       // sort key — the symbol's frequency during the radix-sort pass
	uint16_t m_sym_index; // index of the original symbol this entry refers to
};
2364
2365
// Radix sorts num_syms sym_freq entries by m_key using pSyms0/pSyms1 as ping-pong buffers;
// returns a pointer to whichever buffer holds the sorted result.
sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1);
// Computes minimum-redundancy (Huffman) code lengths in place over the frequency-sorted array A.
void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms);
// Adjusts the code-length counts in pNum_codes so no code exceeds max_code_size bits.
void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size);
2368
2369
class huffman_encoding_table
2370
{
2371
public:
2372
huffman_encoding_table()
2373
{
2374
}
2375
2376
void clear()
2377
{
2378
clear_vector(m_codes);
2379
clear_vector(m_code_sizes);
2380
}
2381
2382
bool init(const histogram &h, uint32_t max_code_size = cHuffmanMaxSupportedCodeSize)
2383
{
2384
return init(h.size(), &h[0], max_code_size);
2385
}
2386
2387
bool init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size);
2388
bool init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size);
2389
2390
inline const uint16_vec &get_codes() const { return m_codes; }
2391
inline const uint8_vec &get_code_sizes() const { return m_code_sizes; }
2392
2393
uint32_t get_total_used_codes() const
2394
{
2395
for (int i = static_cast<int>(m_code_sizes.size()) - 1; i >= 0; i--)
2396
if (m_code_sizes[i])
2397
return i + 1;
2398
return 0;
2399
}
2400
2401
private:
2402
uint16_vec m_codes;
2403
uint8_vec m_code_sizes;
2404
};
2405
2406
// LSB-first bit stream writer. Bits accumulate in a small buffer and are appended to
// m_bytes one byte at a time; m_total_bits tracks everything written (including flush padding).
// Also provides several higher-level entropy-coding primitives (Huffman, truncated binary,
// Rice, chunked VLC).
class bitwise_coder
{
public:
	bitwise_coder() :
		m_bit_buffer(0),
		m_bit_buffer_size(0),
		m_total_bits(0)
	{
	}

	bitwise_coder(const bitwise_coder& other) :
		m_bytes(other.m_bytes),
		m_bit_buffer(other.m_bit_buffer),
		m_bit_buffer_size(other.m_bit_buffer_size),
		m_total_bits(other.m_total_bits)
	{
	}

	bitwise_coder(bitwise_coder&& other) :
		m_bytes(std::move(other.m_bytes)),
		m_bit_buffer(other.m_bit_buffer),
		m_bit_buffer_size(other.m_bit_buffer_size),
		m_total_bits(other.m_total_bits)
	{
	}

	bitwise_coder& operator= (const bitwise_coder& rhs)
	{
		if (this == &rhs)
			return *this;

		m_bytes = rhs.m_bytes;
		m_bit_buffer = rhs.m_bit_buffer;
		m_bit_buffer_size = rhs.m_bit_buffer_size;
		m_total_bits = rhs.m_total_bits;

		return *this;
	}

	bitwise_coder& operator= (bitwise_coder&& rhs)
	{
		if (this == &rhs)
			return *this;

		m_bytes = std::move(rhs.m_bytes);
		m_bit_buffer = rhs.m_bit_buffer;
		m_bit_buffer_size = rhs.m_bit_buffer_size;
		m_total_bits = rhs.m_total_bits;

		return *this;
	}

	// Resets the coder and frees the byte buffer's storage.
	inline void clear()
	{
		clear_vector(m_bytes);
		m_bit_buffer = 0;
		m_bit_buffer_size = 0;
		m_total_bits = 0;
	}

	// Resets the coder but keeps the byte buffer's capacity for reuse.
	inline void restart()
	{
		m_bytes.resize(0);
		m_bit_buffer = 0;
		m_bit_buffer_size = 0;
		m_total_bits = 0;
	}

	inline const uint8_vec &get_bytes() const { return m_bytes; }
	inline uint8_vec& get_bytes() { return m_bytes; }

	inline void reserve(uint32_t size) { m_bytes.reserve(size); }

	inline uint64_t get_total_bits() const { return m_total_bits; }
	inline uint32_t get_total_bits_u32() const { assert(m_total_bits <= UINT32_MAX); return static_cast<uint32_t>(m_total_bits); }
	inline void clear_total_bits() { m_total_bits = 0; }

	// Resets the coder, pre-reserving reserve_size bytes of output storage.
	inline void init(uint32_t reserve_size = 1024)
	{
		m_bytes.reserve(reserve_size);
		m_bytes.resize(0);

		m_bit_buffer = 0;
		m_bit_buffer_size = 0;
		m_total_bits = 0;
	}

	// Pads the stream with zero bits to the next byte boundary and emits the final partial byte.
	// Returns 8 if a byte was emitted, 0 if the stream was already byte aligned.
	inline uint32_t flush()
	{
		if (m_bit_buffer_size)
		{
			// Account for the zero padding bits added to reach the byte boundary.
			m_total_bits += 8 - (m_bit_buffer_size & 7);
			append_byte(static_cast<uint8_t>(m_bit_buffer));

			m_bit_buffer = 0;
			m_bit_buffer_size = 0;

			return 8;
		}

		return 0;
	}

	// Writes the low num_bits bits of "bits" (LSB first), num_bits in [0,32].
	// "bits" must fit in num_bits. Returns num_bits.
	inline uint32_t put_bits(uint32_t bits, uint32_t num_bits)
	{
		assert(num_bits <= 32);
		assert(bits < (1ULL << num_bits));

		if (!num_bits)
			return 0;

		m_total_bits += num_bits;

		// Merge new bits above whatever partial bits are pending, then drain whole bytes.
		uint64_t v = (static_cast<uint64_t>(bits) << m_bit_buffer_size) | m_bit_buffer;
		m_bit_buffer_size += num_bits;

		while (m_bit_buffer_size >= 8)
		{
			append_byte(static_cast<uint8_t>(v));
			v >>= 8;
			m_bit_buffer_size -= 8;
		}

		// Fewer than 8 bits remain, so the leftover always fits in a byte.
		m_bit_buffer = static_cast<uint8_t>(v);
		return num_bits;
	}

	// Writes symbol sym's Huffman code from tab. Returns the code length in bits.
	inline uint32_t put_code(uint32_t sym, const huffman_encoding_table &tab)
	{
		uint32_t code = tab.get_codes()[sym];
		uint32_t code_size = tab.get_code_sizes()[sym];
		assert(code_size >= 1);
		put_bits(code, code_size);
		return code_size;
	}

	// Writes v in [0,n) using truncated binary (economy) coding: the first u = 2^(k+1)-n
	// values use k bits, the rest use k+1 bits. Returns the number of bits written.
	inline uint32_t put_truncated_binary(uint32_t v, uint32_t n)
	{
		assert((n >= 2) && (v < n));

		uint32_t k = floor_log2i(n);
		uint32_t u = (1 << (k + 1)) - n;

		if (v < u)
			return put_bits(v, k);

		uint32_t x = v + u;
		assert((x >> 1) >= u);

		put_bits(x >> 1, k);
		put_bits(x & 1, 1);
		return k + 1;
	}

	// Writes v using Rice coding with parameter m: a unary quotient (q ones then a 0)
	// followed by the m-bit remainder. Returns the number of bits written.
	inline uint32_t put_rice(uint32_t v, uint32_t m)
	{
		assert(m);

		const uint64_t start_bits = m_total_bits;

		uint32_t q = v >> m, r = v & ((1 << m) - 1);

		// rice coding sanity check
		assert(q <= 64);

		// Emit the unary quotient in chunks of at most 16 one-bits per put_bits() call.
		for (; q > 16; q -= 16)
			put_bits(0xFFFF, 16);

		put_bits((1 << q) - 1, q);
		// r << 1 keeps the low bit 0, which terminates the unary run, then the m remainder bits.
		put_bits(r << 1, m + 1);

		return (uint32_t)(m_total_bits - start_bits);
	}

	// Writes v as a variable-length code in chunk_bits-sized chunks, each followed by a
	// continuation bit (1 = more chunks follow). Returns the number of bits written.
	inline uint32_t put_vlc(uint32_t v, uint32_t chunk_bits)
	{
		assert(chunk_bits);

		const uint32_t chunk_size = 1 << chunk_bits;
		const uint32_t chunk_mask = chunk_size - 1;

		uint32_t total_bits = 0;

		for ( ; ; )
		{
			uint32_t next_v = v >> chunk_bits;

			// Low chunk_bits bits of v, plus the continuation flag as the top bit.
			total_bits += put_bits((v & chunk_mask) | (next_v ? chunk_size : 0), chunk_bits + 1);
			if (!next_v)
				break;

			v = next_v;
		}

		return total_bits;
	}

	// Serializes a Huffman table's code lengths into the stream. Returns the bits written.
	uint32_t emit_huffman_table(const huffman_encoding_table &tab);

	// Appends another coder's bit stream (whole bytes, then its pending partial bits).
	void append(const bitwise_coder& other)
	{
		for (uint32_t i = 0; i < other.m_bytes.size(); i++)
			put_bits(other.m_bytes[i], 8);

		if (other.m_bit_buffer_size)
			put_bits(other.m_bit_buffer, other.m_bit_buffer_size);
	}

private:
	uint8_vec m_bytes;                          // completed output bytes
	uint32_t m_bit_buffer, m_bit_buffer_size;   // pending partial byte (< 8 bits) and its bit count
	uint64_t m_total_bits;                      // total bits written, including flush padding

	inline void append_byte(uint8_t c)
	{
		//m_bytes.resize(m_bytes.size() + 1);
		//m_bytes.back() = c;

		m_bytes.push_back(c);
	}

	// Helpers for emit_huffman_table's run-length encoding of code lengths.
	static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len);
	static void end_zero_run(uint16_vec &syms, uint32_t &run_size);
};
2630
2631
// Huffman codes groups of small fixed-width symbols: total_syms_per_group symbols of
// bits_per_sym bits each are packed into one composite value, and the composite values are
// Huffman coded. Usage: emit() all symbols, start_encoding(), then replay the same symbol
// sequence with emit_next_sym() to write the codes.
class huff2D
{
public:
	huff2D() { }
	huff2D(uint32_t bits_per_sym, uint32_t total_syms_per_group) { init(bits_per_sym, total_syms_per_group); }

	inline const histogram &get_histogram() const { return m_histogram; }
	inline const huffman_encoding_table &get_encoding_table() const { return m_encoding_table; }

	// Configures the packing; the packed group value must fit in 16 bits.
	inline void init(uint32_t bits_per_sym, uint32_t total_syms_per_group)
	{
		assert((bits_per_sym * total_syms_per_group) <= 16 && total_syms_per_group >= 1 && bits_per_sym >= 1);

		m_bits_per_sym = bits_per_sym;
		m_total_syms_per_group = total_syms_per_group;
		m_cur_sym_bits = 0;
		m_cur_num_syms = 0;
		m_decode_syms_remaining = 0;
		m_next_decoder_group_index = 0;

		// One histogram slot per possible packed-group value.
		m_histogram.init(1 << (bits_per_sym * total_syms_per_group));
	}

	// Resets the accumulated groups and encode/emit state (keeps the configuration).
	inline void clear()
	{
		m_group_bits.clear();

		m_cur_sym_bits = 0;
		m_cur_num_syms = 0;
		m_decode_syms_remaining = 0;
		m_next_decoder_group_index = 0;
	}

	// Accumulates one symbol into the current group; a full group is flushed automatically.
	inline void emit(uint32_t sym)
	{
		m_cur_sym_bits |= (sym << (m_cur_num_syms * m_bits_per_sym));
		m_cur_num_syms++;

		if (m_cur_num_syms == m_total_syms_per_group)
			flush();
	}

	// Records the current (possibly partial) group and starts a new one.
	inline void flush()
	{
		if (m_cur_num_syms)
		{
			m_group_bits.push_back(m_cur_sym_bits);
			m_histogram.inc(m_cur_sym_bits);

			m_cur_sym_bits = 0;
			m_cur_num_syms = 0;
		}
	}

	// Flushes any partial group and builds the Huffman table from the group histogram.
	// Returns false if table construction fails.
	inline bool start_encoding(uint32_t code_size_limit = 16)
	{
		flush();

		if (!m_encoding_table.init(m_histogram, code_size_limit))
			return false;

		m_decode_syms_remaining = 0;
		m_next_decoder_group_index = 0;

		return true;
	}

	// Called once per original symbol, in the same order as emit(): writes the group's
	// Huffman code on the first symbol of each group, nothing on the rest.
	// Returns the number of bits written (0 except at group boundaries).
	inline uint32_t emit_next_sym(bitwise_coder &c)
	{
		uint32_t bits = 0;

		if (!m_decode_syms_remaining)
		{
			bits = c.put_code(m_group_bits[m_next_decoder_group_index++], m_encoding_table);
			m_decode_syms_remaining = m_total_syms_per_group;
		}

		m_decode_syms_remaining--;
		return bits;
	}

	// Forces the next emit_next_sym() to start a fresh group (e.g. after a partial group).
	inline void emit_flush()
	{
		m_decode_syms_remaining = 0;
	}

private:
	uint_vec m_group_bits;                     // packed group values, in emit order
	huffman_encoding_table m_encoding_table;
	histogram m_histogram;
	uint32_t m_bits_per_sym, m_total_syms_per_group, m_cur_sym_bits, m_cur_num_syms, m_next_decoder_group_index, m_decode_syms_remaining;
};
2723
2724
// Randomized self-test of the Huffman coding routines; returns true on success.
bool huffman_test(int rand_seed);
2725
2726
// VQ index reordering
2727
2728
// Reorders palette/codebook entries to improve compressibility of the index stream,
// using index co-occurrence statistics (and optionally an entry-distance function).
// After init(), get_remap_table() maps old symbol indices to new ones.
class palette_index_reorderer
{
public:
	palette_index_reorderer()
	{
	}

	// Frees all working storage and the remap table.
	void clear()
	{
		clear_vector(m_hist);
		clear_vector(m_total_count_to_picked);
		clear_vector(m_entries_picked);
		clear_vector(m_entries_to_do);
		clear_vector(m_remap_table);
	}

	// returns [0,1] distance of entry i to entry j
	typedef float(*pEntry_dist_func)(uint32_t i, uint32_t j, void *pCtx);

	// Computes the reordering from num_indices palette indices (pIndices) over num_syms symbols.
	// pDist_func may be null; dist_func_weight blends the distance term into the placement cost.
	void init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);

	// Table remaps old to new symbol indices
	inline const uint_vec &get_remap_table() const { return m_remap_table; }

private:
	uint_vec m_hist, m_total_count_to_picked, m_entries_picked, m_entries_to_do, m_remap_table;

	// Co-occurrence counts are stored in the upper triangle only (i <= j), hence the swap on read.
	inline uint32_t get_hist(int i, int j, int n) const { return (i > j) ? m_hist[j * n + i] : m_hist[i * n + j]; }
	inline void inc_hist(int i, int j, int n) { if ((i != j) && (i < j) && (i != -1) && (j != -1)) { assert(((uint32_t)i < (uint32_t)n) && ((uint32_t)j < (uint32_t)n)); m_hist[i * n + j]++; } }

	// Internal phases of init(); see the .cpp for details.
	void prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices);
	void find_initial(uint32_t num_syms);
	void find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
	float pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
};
2763
2764
// Simple 32-bit 2D image class
2765
2766
// Simple 8-bit/component RGBA 2D image. Pixels are stored row-major in a single vector;
// m_pitch (in pixels) may exceed m_width. Most mutators return *this for chaining.
class image
{
public:
	image() :
		m_width(0), m_height(0), m_pitch(0)
	{
	}

	image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) :
		m_width(0), m_height(0), m_pitch(0)
	{
		resize(w, h, p);
	}

	// Constructs from a packed 1-4 component 8-bit source image (see init()).
	image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) :
		m_width(0), m_height(0), m_pitch(0)
	{
		init(pImage, width, height, comps);
	}

	image(const image &other) :
		m_width(0), m_height(0), m_pitch(0)
	{
		*this = other;
	}

	image(image&& other) :
		m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),
		m_pixels(std::move(other.m_pixels))
	{
		other.m_width = 0;
		other.m_height = 0;
		other.m_pitch = 0;
	}

	image& operator= (image&& rhs)
	{
		if (this != &rhs)
		{
			m_width = rhs.m_width;
			m_height = rhs.m_height;
			m_pitch = rhs.m_pitch;
			m_pixels = std::move(rhs.m_pixels);

			rhs.m_width = 0;
			rhs.m_height = 0;
			rhs.m_pitch = 0;
		}
		return *this;
	}

	image &swap(image &other)
	{
		std::swap(m_width, other.m_width);
		std::swap(m_height, other.m_height);
		std::swap(m_pitch, other.m_pitch);
		m_pixels.swap(other.m_pixels);
		return *this;
	}

	image &operator= (const image &rhs)
	{
		if (this != &rhs)
		{
			m_width = rhs.m_width;
			m_height = rhs.m_height;
			m_pitch = rhs.m_pitch;
			m_pixels = rhs.m_pixels;
		}
		return *this;
	}

	// Resets to an empty 0x0 image, freeing pixel storage.
	image &clear()
	{
		m_width = 0;
		m_height = 0;
		m_pitch = 0;
		clear_vector(m_pixels);
		return *this;
	}

	// Resizes this image to match another image's dimensions.
	image& match_dimensions(const image& other)
	{
		resize(other.get_width(), other.get_height());
		return *this;
	}

	// Resize; existing pixels are preserved (top-left anchored), new areas get "background".
	image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color)
	{
		return crop(w, h, p, background);
	}

	// Sets every pixel (including any pitch padding) to c.
	image &set_all(const color_rgba &c)
	{
		for (uint32_t i = 0; i < m_pixels.size(); i++)
			m_pixels[i] = c;
		return *this;
	}

	// Initializes from a tightly packed 8-bit image with comps components per pixel:
	// 1 = grayscale (alpha=255), 2 = grayscale+alpha, 3 = RGB (alpha=255), 4 = RGBA.
	void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps)
	{
		assert(comps >= 1 && comps <= 4);

		resize(width, height);

		for (uint32_t y = 0; y < height; y++)
		{
			for (uint32_t x = 0; x < width; x++)
			{
				const uint8_t *pSrc = &pImage[(x + y * width) * comps];
				color_rgba &dst = (*this)(x, y);

				if (comps == 1)
				{
					dst.r = pSrc[0];
					dst.g = pSrc[0];
					dst.b = pSrc[0];
					dst.a = 255;
				}
				else if (comps == 2)
				{
					dst.r = pSrc[0];
					dst.g = pSrc[0];
					dst.b = pSrc[0];
					dst.a = pSrc[1];
				}
				else
				{
					dst.r = pSrc[0];
					dst.g = pSrc[1];
					dst.b = pSrc[2];
					if (comps == 4)
						dst.a = pSrc[3];
					else
						dst.a = 255;
				}
			}
		}
	}

	// Fills a w x h rectangle at (x,y) with c; pixels outside the image are skipped.
	image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c)
	{
		for (uint32_t iy = 0; iy < h; iy++)
			for (uint32_t ix = 0; ix < w; ix++)
				set_clipped(x + ix, y + iy, c);
		return *this;
	}

	// Like fill_box(), but only writes the alpha channel.
	image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c)
	{
		for (uint32_t iy = 0; iy < h; iy++)
			for (uint32_t ix = 0; ix < w; ix++)
				set_clipped_alpha(x + ix, y + iy, c);
		return *this;
	}

	// Crops/expands to w x h; any newly exposed area is filled by replicating the old edge pixels.
	image &crop_dup_borders(uint32_t w, uint32_t h)
	{
		const uint32_t orig_w = m_width, orig_h = m_height;

		crop(w, h);

		if (orig_w && orig_h)
		{
			if (m_width > orig_w)
			{
				for (uint32_t x = orig_w; x < m_width; x++)
					for (uint32_t y = 0; y < m_height; y++)
						set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
			}

			if (m_height > orig_h)
			{
				for (uint32_t y = orig_h; y < m_height; y++)
					for (uint32_t x = 0; x < m_width; x++)
						set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
			}
		}
		return *this;
	}

	// pPixels MUST have been allocated using malloc() (basisu::vector will eventually use free() on the pointer).
	// Takes ownership of an externally allocated pixel buffer without copying.
	image& grant_ownership(color_rgba* pPixels, uint32_t w, uint32_t h, uint32_t p = UINT32_MAX)
	{
		if (p == UINT32_MAX)
			p = w;

		clear();

		if ((!p) || (!w) || (!h))
			return *this;

		m_pixels.grant_ownership(pPixels, p * h, p * h);

		m_width = w;
		m_height = h;
		m_pitch = p;

		return *this;
	}

	// Resizes to w x h with pitch p (UINT32_MAX means p = w). Existing pixels are preserved
	// top-left anchored; new areas get "background" (only when init_image is true).
	// No-op if the dimensions and pitch already match.
	image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color, bool init_image = true)
	{
		if (p == UINT32_MAX)
			p = w;

		if ((w == m_width) && (m_height == h) && (m_pitch == p))
			return *this;

		if ((!w) || (!h) || (!p))
		{
			clear();
			return *this;
		}

		// Steal the old pixel buffer so the copy below reads from it while m_pixels is rebuilt.
		color_rgba_vec cur_state;
		cur_state.swap(m_pixels);

		m_pixels.resize(p * h);

		if (init_image)
		{
			if (m_width || m_height)
			{
				for (uint32_t y = 0; y < h; y++)
				{
					for (uint32_t x = 0; x < w; x++)
					{
						if ((x < m_width) && (y < m_height))
							m_pixels[x + y * p] = cur_state[x + y * m_pitch];
						else
							m_pixels[x + y * p] = background;
					}
				}
			}
			else
			{
				m_pixels.set_all(background);
			}
		}

		m_width = w;
		m_height = h;
		m_pitch = p;

		return *this;
	}

	// Unchecked (assert-only) pixel access.
	inline const color_rgba &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
	inline color_rgba &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }

	// Access with coordinates clamped to the image bounds.
	inline const color_rgba &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
	inline color_rgba &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }

	// Access with per-axis wrap (positive modulo) or clamp addressing.
	inline const color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
	{
		x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
		y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
		return m_pixels[x + y * m_pitch];
	}

	inline color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
	{
		x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
		y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
		return m_pixels[x + y * m_pitch];
	}

	// Writes c at (x,y) if inside the image; out-of-bounds writes are silently dropped.
	inline image &set_clipped(int x, int y, const color_rgba &c)
	{
		if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
			(*this)(x, y) = c;
		return *this;
	}

	// Like set_clipped(), but only writes the alpha component.
	inline image& set_clipped_alpha(int x, int y, const color_rgba& c)
	{
		if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
			(*this)(x, y).m_comps[3] = c.m_comps[3];
		return *this;
	}

	// Very straightforward blit with full clipping. Not fast, but it works.
	image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
	{
		for (int y = 0; y < src_h; y++)
		{
			const int sy = src_y + y;
			if (sy < 0)
				continue;
			else if (sy >= (int)src.get_height())
				break;

			for (int x = 0; x < src_w; x++)
			{
				const int sx = src_x + x;
				if (sx < 0)
					continue;
				else if (sx >= (int)src.get_width())
					break;

				set_clipped(dst_x + x, dst_y + y, src(sx, sy));
			}
		}

		return *this;
	}

	// Copies a w x h block at (src_x,src_y) into pDst (row-major, tightly packed),
	// clamp-addressing any samples that fall outside the image.
	const image &extract_block_clamped(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
	{
		if (((src_x + w) > m_width) || ((src_y + h) > m_height))
		{
			// Slower clamping case
			for (uint32_t y = 0; y < h; y++)
				for (uint32_t x = 0; x < w; x++)
					*pDst++ = get_clamped(src_x + x, src_y + y);
		}
		else
		{
			// Fully interior block: copy whole rows at a time.
			const color_rgba* pSrc = &m_pixels[src_x + src_y * m_pitch];

			for (uint32_t y = 0; y < h; y++)
			{
				memcpy(pDst, pSrc, w * sizeof(color_rgba));
				pSrc += m_pitch;
				pDst += w;
			}
		}

		return *this;
	}

	// Writes a w x h block from pSrc (row-major, tightly packed) at (dst_x,dst_y), clipped.
	image &set_block_clipped(const color_rgba *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
	{
		for (uint32_t y = 0; y < h; y++)
			for (uint32_t x = 0; x < w; x++)
				set_clipped(dst_x + x, dst_y + y, *pSrc++);
		return *this;
	}

	inline bool is_valid() const { return m_width > 0; }

	inline uint32_t get_width() const { return m_width; }
	inline uint32_t get_height() const { return m_height; }
	inline uint32_t get_pitch() const { return m_pitch; }
	inline uint32_t get_total_pixels() const { return m_width * m_height; }

	// Dimensions in w x h blocks, rounding up partial blocks.
	inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
	inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
	inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }

	inline const color_rgba_vec &get_pixels() const { return m_pixels; }
	inline color_rgba_vec &get_pixels() { return m_pixels; }

	inline const color_rgba *get_ptr() const { return &m_pixels[0]; }
	inline color_rgba *get_ptr() { return &m_pixels[0]; }

	// Returns true if any pixel's "channel" component is < 255 (default: alpha).
	bool has_alpha(uint32_t channel = 3) const
	{
		for (uint32_t y = 0; y < m_height; ++y)
			for (uint32_t x = 0; x < m_width; ++x)
				if ((*this)(x, y)[channel] < 255)
					return true;

		return false;
	}

	// Sets every pixel's alpha to a.
	image &set_alpha(uint8_t a)
	{
		for (uint32_t y = 0; y < m_height; ++y)
			for (uint32_t x = 0; x < m_width; ++x)
				(*this)(x, y).a = a;
		return *this;
	}

	// Flips the image vertically in place.
	image &flip_y()
	{
		for (uint32_t y = 0; y < m_height / 2; ++y)
			for (uint32_t x = 0; x < m_width; ++x)
				std::swap((*this)(x, y), (*this)(x, m_height - 1 - y));
		return *this;
	}

	// TODO: There are many ways to do this, not sure this is the best way.
	// Renormalizes pixels interpreted as signed unit normals packed into [0,255]
	// ((128,128,128) = zero vector and is left untouched). Degenerate near-zero normals
	// are reset to (128,128,128); off-unit-length normals are rescaled and repacked.
	image &renormalize_normal_map()
	{
		for (uint32_t y = 0; y < m_height; y++)
		{
			for (uint32_t x = 0; x < m_width; x++)
			{
				color_rgba &c = (*this)(x, y);
				if ((c.r == 128) && (c.g == 128) && (c.b == 128))
					continue;

				// Unpack [0,255] -> [-1,1].
				vec3F v(c.r, c.g, c.b);
				v = (v * (2.0f / 255.0f)) - vec3F(1.0f);
				v.clamp(-1.0f, 1.0f);

				float length = v.length();
				const float cValidThresh = .077f;
				if (length < cValidThresh)
				{
					// Too short to be a meaningful normal: reset to the zero vector.
					c.set(128, 128, 128, c.a);
				}
				else if (fabs(length - 1.0f) > cValidThresh)
				{
					if (length)
						v /= length;

					// Repack the renormalized vector to [0,255].
					for (uint32_t i = 0; i < 3; i++)
						c[i] = static_cast<uint8_t>(clamp<float>(floor((v[i] + 1.0f) * 255.0f * .5f + .5f), 0.0f, 255.0f));

					// Snap pure-Z normals to exactly +/-Z.
					if ((c.g == 128) && (c.r == 128))
					{
						if (c.b < 128)
							c.b = 0;
						else
							c.b = 255;
					}
				}
			}
		}
		return *this;
	}

	// Swaps the red and blue channels of every pixel (RGBA <-> BGRA).
	void swap_rb()
	{
		for (auto& v : m_pixels)
			std::swap(v.r, v.b);
	}

	// Draws printf-style text using the built-in 8x8 debug font.
	void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);

	// Bilinearly samples the image at (x,y); pixel centers are at (.5,.5), clamp addressing.
	// Returns the filtered color as floats (unrounded).
	vec4F get_filtered_vec4F(float x, float y) const
	{
		x -= .5f;
		y -= .5f;

		int ix = (int)floorf(x);
		int iy = (int)floorf(y);
		float wx = x - ix;
		float wy = y - iy;

		// Four neighboring texels, clamp addressed.
		color_rgba a(get_clamped(ix, iy));
		color_rgba b(get_clamped(ix + 1, iy));
		color_rgba c(get_clamped(ix, iy + 1));
		color_rgba d(get_clamped(ix + 1, iy + 1));

		vec4F result;

		for (uint32_t i = 0; i < 4; i++)
		{
			const float top = lerp<float>((float)a[i], (float)b[i], wx);
			const float bot = lerp<float>((float)c[i], (float)d[i], wx);
			const float m = lerp<float>((float)top, (float)bot, wy);

			result[i] = m;
		}

		return result;
	}

	// (x,y) - Continuous coordinates, where pixel centers are at (.5,.5), valid image coords are [0,width] and [0,height]. Clamp addressing.
	color_rgba get_filtered(float x, float y) const
	{
		const vec4F fresult(get_filtered_vec4F(x, y));

		color_rgba result;

		// Round each filtered component to the nearest integer and clamp to [0,255].
		for (uint32_t i = 0; i < 4; i++)
			result[i] = (uint8_t)clamp<int>((int)(fresult[i] + .5f), 0, 255);

		return result;
	}

private:
	uint32_t m_width, m_height, m_pitch; // all in pixels
	color_rgba_vec m_pixels;
};
3245
3246
// Float images

// Pixel storage type for imagef: one vec4F (RGBA, float per component) per pixel.
typedef basisu::vector<vec4F> vec4F_vec;
3250
class imagef
3251
{
3252
public:
3253
// Constructs an empty 0x0 float image.
imagef() :
	m_width(0), m_height(0), m_pitch(0)
{
}

imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) :
	m_width(0), m_height(0), m_pitch(0)
{
	resize(w, h, p);
}

imagef(const imagef &other) :
	m_width(0), m_height(0), m_pitch(0)
{
	*this = other;
}

imagef(imagef&& other) :
	m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch),
	m_pixels(std::move(other.m_pixels))
{
	other.m_width = 0;
	other.m_height = 0;
	other.m_pitch = 0;
}

imagef& operator= (imagef&& rhs)
{
	if (this != &rhs)
	{
		m_width = rhs.m_width;
		m_height = rhs.m_height;
		m_pitch = rhs.m_pitch;
		m_pixels = std::move(rhs.m_pixels);

		rhs.m_width = 0;
		rhs.m_height = 0;
		rhs.m_pitch = 0;
	}
	return *this;
}

imagef &swap(imagef &other)
{
	std::swap(m_width, other.m_width);
	std::swap(m_height, other.m_height);
	std::swap(m_pitch, other.m_pitch);
	m_pixels.swap(other.m_pixels);
	return *this;
}

imagef &operator= (const imagef &rhs)
{
	if (this != &rhs)
	{
		m_width = rhs.m_width;
		m_height = rhs.m_height;
		m_pitch = rhs.m_pitch;
		m_pixels = rhs.m_pixels;
	}
	return *this;
}

// Resets to an empty 0x0 image, freeing pixel storage.
imagef &clear()
{
	m_width = 0;
	m_height = 0;
	m_pitch = 0;
	clear_vector(m_pixels);
	return *this;
}
3324
3325
// Initializes from an 8-bit image: each component becomes (value * scale + bias) per channel.
imagef &set(const image &src, const vec4F &scale = vec4F(1), const vec4F &bias = vec4F(0))
{
	const uint32_t width = src.get_width();
	const uint32_t height = src.get_height();

	resize(width, height);

	for (int y = 0; y < (int)height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			const color_rgba &src_pixel = src(x, y);
			(*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]);
		}
	}

	return *this;
}

// Resizes this image to match another float image's dimensions.
imagef& match_dimensions(const imagef& other)
{
	resize(other.get_width(), other.get_height());
	return *this;
}

// Resize to another image's dimensions; new areas are filled with "background".
imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
{
	return resize(other.get_width(), other.get_height(), p, background);
}

// Resize; existing pixels are preserved (top-left anchored), new areas get "background".
imagef &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
{
	return crop(w, h, p, background);
}

// Sets every pixel (including any pitch padding) to c.
imagef &set_all(const vec4F &c)
{
	for (uint32_t i = 0; i < m_pixels.size(); i++)
		m_pixels[i] = c;
	return *this;
}

// Fills a w x h rectangle at (x,y) with c; pixels outside the image are skipped.
imagef &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F &c)
{
	for (uint32_t iy = 0; iy < h; iy++)
		for (uint32_t ix = 0; ix < w; ix++)
			set_clipped(x + ix, y + iy, c);
	return *this;
}
3374
3375
// Resizes to w x h with pitch p (UINT32_MAX means p = w). Existing pixels are preserved
// top-left anchored; newly exposed areas get "background". No-op when dimensions already match.
imagef &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F &background = vec4F(0,0,0,1))
{
	if (p == UINT32_MAX)
		p = w;

	if ((w == m_width) && (m_height == h) && (m_pitch == p))
		return *this;

	if ((!w) || (!h) || (!p))
	{
		clear();
		return *this;
	}

	// Steal the old pixel buffer so the copy below reads from it while m_pixels is rebuilt.
	vec4F_vec cur_state;
	cur_state.swap(m_pixels);

	m_pixels.resize(p * h);

	for (uint32_t y = 0; y < h; y++)
	{
		for (uint32_t x = 0; x < w; x++)
		{
			if ((x < m_width) && (y < m_height))
				m_pixels[x + y * p] = cur_state[x + y * m_pitch];
			else
				m_pixels[x + y * p] = background;
		}
	}

	m_width = w;
	m_height = h;
	m_pitch = p;

	return *this;
}

// Crops/expands to w x h; any newly exposed area is filled by replicating the old edge pixels.
imagef& crop_dup_borders(uint32_t w, uint32_t h)
{
	const uint32_t orig_w = m_width, orig_h = m_height;

	crop(w, h);

	if (orig_w && orig_h)
	{
		if (m_width > orig_w)
		{
			for (uint32_t x = orig_w; x < m_width; x++)
				for (uint32_t y = 0; y < m_height; y++)
					set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
		}

		if (m_height > orig_h)
		{
			for (uint32_t y = orig_h; y < m_height; y++)
				for (uint32_t x = 0; x < m_width; x++)
					set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
		}
	}
	return *this;
}
3436
3437
// Direct pixel accessors. Coordinates must be in bounds (asserted); pixels
// are stored row-major with a row stride of m_pitch.
inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }

// Accessors that clamp out-of-range coordinates to the image edges.
inline const vec4F &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
inline vec4F &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }

// Accessors that, per axis, either wrap (posmod) or clamp the coordinate.
inline const vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
{
	x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
	y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
	return m_pixels[x + y * m_pitch];
}

inline vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
{
	x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
	y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
	return m_pixels[x + y * m_pitch];
}

// Writes pixel (x, y) only if it's inside the image; out-of-range writes are
// silently ignored. (The unsigned casts make negative x/y fail the bounds test.)
inline imagef &set_clipped(int x, int y, const vec4F &c)
{
	if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
		(*this)(x, y) = c;
	return *this;
}
3463
3464
// Very straightforward blit with full clipping. Not fast, but it works.
// Copies the src_w x src_h region of src starting at (src_x, src_y) to this
// image at (dst_x, dst_y). Source reads are clipped against src's bounds
// (continue while still below 0, break once past the far edge); destination
// writes are clipped by set_clipped().
imagef &blit(const imagef &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
{
	for (int y = 0; y < src_h; y++)
	{
		const int sy = src_y + y;
		if (sy < 0)
			continue;
		else if (sy >= (int)src.get_height())
			break;

		for (int x = 0; x < src_w; x++)
		{
			const int sx = src_x + x;
			if (sx < 0)
				continue;
			else if (sx >= (int)src.get_width())
				break;

			set_clipped(dst_x + x, dst_y + y, src(sx, sy));
		}
	}

	return *this;
}
3489
3490
// Copies a w x h block of pixels starting at (src_x, src_y) into pDst in
// row-major order, clamping reads at the image edges.
const imagef &extract_block_clamped(vec4F *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
{
	for (uint32_t row = 0; row < h; row++)
	{
		for (uint32_t col = 0; col < w; col++)
			*pDst++ = get_clamped(src_x + col, src_y + row);
	}
	return *this;
}
3497
3498
// Writes a w x h block of pixels from pSrc (row-major) into this image at
// (dst_x, dst_y); writes outside the image are silently clipped.
imagef &set_block_clipped(const vec4F *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
{
	for (uint32_t row = 0; row < h; row++)
	{
		for (uint32_t col = 0; col < w; col++)
			set_clipped(dst_x + col, dst_y + row, *pSrc++);
	}
	return *this;
}
3505
3506
// An image is considered valid once it has a non-zero width.
inline bool is_valid() const { return m_width > 0; }

inline uint32_t get_width() const { return m_width; }
inline uint32_t get_height() const { return m_height; }
inline uint32_t get_pitch() const { return m_pitch; }
inline uint64_t get_total_pixels() const { return (uint64_t)m_width * m_height; }

// Number of w x h (or h-tall) blocks needed to cover the image, rounding up.
inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }

// Raw access to the backing pixel storage (size is pitch * height).
inline const vec4F_vec &get_pixels() const { return m_pixels; }
inline vec4F_vec &get_pixels() { return m_pixels; }

inline const vec4F *get_ptr() const { return &m_pixels[0]; }
inline vec4F *get_ptr() { return &m_pixels[0]; }
3522
3523
// Scrubs every pixel component so it's a value ASTC HDR can encode: finite,
// non-negative, not -0, and <= highest_mag. NaN/Inf/-0 components become 0,
// negatives are clamped to 0, and over-large values are clamped to highest_mag.
// Each problem category prints a single warning to stderr (one-shot flags).
// Returns true only if no component needed fixing.
bool clean_astc_hdr_pixels(float highest_mag)
{
	bool status = true;
	// One-shot flags so each diagnostic is emitted at most once.
	bool nan_msg = false;
	bool inf_msg = false;
	bool neg_zero_msg = false;
	bool neg_msg = false;
	bool clamp_msg = false;

	for (uint32_t iy = 0; iy < m_height; iy++)
	{
		for (uint32_t ix = 0; ix < m_width; ix++)
		{
			vec4F& c = (*this)(ix, iy);

			for (uint32_t s = 0; s < 4; s++)
			{
				float &p = c[s];
				// Type-pun to the raw bits to detect negative zero (0x80000000).
				union { float f; uint32_t u; } x; x.f = p;

				if ((std::isnan(p)) || (std::isinf(p)) || (x.u == 0x80000000))
				{
					if (std::isnan(p))
					{
						if (!nan_msg)
						{
							fprintf(stderr, "One or more input pixels was NaN, setting to 0.\n");
							nan_msg = true;
						}
					}

					if (std::isinf(p))
					{
						if (!inf_msg)
						{
							fprintf(stderr, "One or more input pixels was INF, setting to 0.\n");
							inf_msg = true;
						}
					}

					if (x.u == 0x80000000)
					{
						if (!neg_zero_msg)
						{
							fprintf(stderr, "One or more input pixels was -0, setting them to 0.\n");
							neg_zero_msg = true;
						}
					}

					// All abnormal values collapse to 0.
					p = 0.0f;
					status = false;
				}
				else
				{
					//const float o = p;
					// ASTC HDR can't represent signed values, so clamp negatives to 0.
					if (p < 0.0f)
					{
						p = 0.0f;

						if (!neg_msg)
						{
							fprintf(stderr, "One or more input pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n");
							neg_msg = true;
						}

						status = false;
					}

					// Clamp to the caller-specified maximum encodable magnitude.
					if (p > highest_mag)
					{
						p = highest_mag;

						if (!clamp_msg)
						{
							fprintf(stderr, "One or more input pixels had to be clamped to %f.\n", highest_mag);
							clamp_msg = true;
						}

						status = false;
					}
				}
			}
		}
	}

	return status;
}
3610
3611
// Mirrors the image vertically in place by exchanging row y with row
// (height - 1 - y); only the top half is iterated.
imagef& flip_y()
{
	const uint32_t half = m_height / 2;
	for (uint32_t row = 0; row < half; ++row)
	{
		const uint32_t mirror = m_height - 1 - row;
		for (uint32_t col = 0; col < m_width; ++col)
			std::swap((*this)(col, row), (*this)(col, mirror));
	}

	return *this;
}
3619
3620
bool has_alpha(uint32_t channel = 3) const
3621
{
3622
for (uint32_t y = 0; y < m_height; ++y)
3623
for (uint32_t x = 0; x < m_width; ++x)
3624
if ((*this)(x, y)[channel] != 1.0f)
3625
return true;
3626
3627
return false;
3628
}
3629
3630
// Bilinearly samples the image at continuous coordinates (x, y), with reads
// clamped at the edges. (x, y) are in pixel units; the .5 offset moves them
// to texel-center coordinates before interpolating.
vec4F get_filtered_vec4F(float x, float y) const
{
	x -= .5f;
	y -= .5f;

	const int x0 = (int)floorf(x);
	const int y0 = (int)floorf(y);
	const float fx = x - x0;
	const float fy = y - y0;

	// Fetch the 2x2 neighborhood (edge-clamped).
	const vec4F p00(get_clamped(x0, y0));
	const vec4F p10(get_clamped(x0 + 1, y0));
	const vec4F p01(get_clamped(x0, y0 + 1));
	const vec4F p11(get_clamped(x0 + 1, y0 + 1));

	vec4F result;

	// Interpolate horizontally on both rows, then vertically, per component.
	for (uint32_t c = 0; c < 4; c++)
	{
		const float top = lerp<float>(p00[c], p10[c], fx);
		const float bot = lerp<float>(p01[c], p11[c], fx);
		result[c] = lerp<float>(top, bot, fy);
	}

	return result;
}
3658
3659
private:
3660
uint32_t m_width, m_height, m_pitch; // all in pixels
3661
vec4F_vec m_pixels;
3662
};
3663
3664
// REC 709 coefficients
const float REC_709_R = 0.212656f, REC_709_G = 0.715158f, REC_709_B = 0.072186f;

// Returns the Rec. 709 luminance of c (weighted sum of R, G, B; alpha ignored).
inline float get_luminance(const vec4F &c)
{
	return c[0] * REC_709_R + c[1] * REC_709_G + c[2] * REC_709_B;
}

// Scalar sRGB transfer-function conversions (defined in the .cpp).
float linear_to_srgb(float l);
float srgb_to_linear(float s);
3674
3675
// Fast linear->sRGB (8-bit) conversion using a quantized lookup table plus a
// per-code threshold table to correct rounding at bucket boundaries.
class fast_linear_to_srgb
{
public:
	fast_linear_to_srgb()
	{
		init();
	}

	void init()
	{
		// Maps quantized linear values in [0,1] to 8-bit sRGB codes.
		for (int i = 0; i < LINEAR_TO_SRGB_TABLE_SIZE; ++i)
		{
			float l = (float)i * (1.0f / (LINEAR_TO_SRGB_TABLE_SIZE - 1));
			m_linear_to_srgb_table[i] = (uint8_t)basisu::fast_floorf_int(255.0f * basisu::linear_to_srgb(l));
		}

		float srgb_to_linear[256];
		for (int i = 0; i < 256; i++)
			srgb_to_linear[i] = basisu::srgb_to_linear((float)i / 255.0f);

		// Threshold for code i: the linear-space midpoint between codes i and
		// i+1. convert() uses it to decide whether to round up by one code.
		for (int i = 0; i < 256; i++)
			m_srgb_to_linear_thresh[i] = (srgb_to_linear[i] + srgb_to_linear[basisu::minimum<int>(i + 1, 255)]) * .5f;
	}

	// Converts linear l (must be in [0,1], asserted) to the nearest 8-bit sRGB code.
	inline uint8_t convert(float l) const
	{
		assert((l >= 0.0f) && (l <= 1.0f));
		int j = basisu::fast_roundf_int((LINEAR_TO_SRGB_TABLE_SIZE - 1) * l);

		assert((j >= 0) && (j < LINEAR_TO_SRGB_TABLE_SIZE));
		int b = m_linear_to_srgb_table[j];

		// Bump up one code if l lies past code b's linear-space threshold.
		b += (l > m_srgb_to_linear_thresh[b]);

		return (uint8_t)b;
	}

private:
	static constexpr int LINEAR_TO_SRGB_TABLE_SIZE = 2048;
	uint8_t m_linear_to_srgb_table[LINEAR_TO_SRGB_TABLE_SIZE];

	float m_srgb_to_linear_thresh[256];
};
3718
3719
extern fast_linear_to_srgb g_fast_linear_to_srgb;
3720
3721
// Image metrics

// Holds and computes difference statistics (max error, mean, RMS, PSNR, etc.)
// between two images. The calc* methods (defined in the .cpp) fill the fields.
class image_metrics
{
public:
	// TODO: Add ssim
	double m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim;
	// m_has_neg: any negative component seen; m_hf_mag_overflow: any value too
	// large for a half float; m_any_abnormal: any NaN/Inf seen.
	bool m_has_neg, m_hf_mag_overflow, m_any_abnormal;

	image_metrics()
	{
		clear();
	}

	void clear()
	{
		m_max = 0;
		m_mean = 0;
		m_mean_squared = 0;
		m_rms = 0;
		m_psnr = 0;
		m_ssim = 0;
		m_has_neg = false;
		m_hf_mag_overflow = false;
		m_any_abnormal = false;
	}

	// Prints a one-line summary; print_hp() adds precision and the HDR flags.
	void print(const char *pPrefix = nullptr) { printf("%sMax: %3.3f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr); }
	void print_hp(const char* pPrefix = nullptr) { printf("%sMax: %3.6f Mean: %3.6f RMS: %3.6f PSNR: %2.6f dB, Any Neg: %u, Half float overflow: %u, Any NaN/Inf: %u\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr, m_has_neg, m_hf_mag_overflow, m_any_abnormal); }

	// first_chan/total_chans select the component range to compare
	// (total_chans == 0 presumably means a default/luma comparison -- confirm in the .cpp).
	void calc(const imagef& a, const imagef& b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool log = false);
	void calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);
	void calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);
	void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false);
};
3756
3757
void print_image_metrics(const image& a, const image& b);

// Image saving/loading/resampling

// LDR loaders; each returns false on failure. The std::string overloads are
// thin c_str() forwarders.
bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
bool load_png(const char* pFilename, image& img);
inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }

bool load_tga(const char* pFilename, image& img);
inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }

bool load_qoi(const char* pFilename, image& img);

bool load_jpg(const char *pFilename, image& img);
bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img);
inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }

// Currently loads .PNG, .TGA, or .JPG
bool load_image(const char* pFilename, image& img);
inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }

bool is_image_filename_hdr(const char* pFilename);

// Supports .HDR and most (but not all) .EXR's (see TinyEXR).
// ldr_srgb_to_linear converts LDR sources to linear light; linear_nit_multiplier
// scales linear values; ldr_black_bias is added per the implementation.
bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);

inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f)
{
	return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias);
}

// Identifies the in-memory format passed to the buffer-based load_image_hdr().
enum class hdr_image_type
{
	cHITRGBAHalfFloat = 0,
	cHITRGBAFloat = 1,
	cHITPNGImage = 2,
	cHITEXRImage = 3,
	cHITHDRImage = 4,
	cHITJPGImage = 5
};

bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f);

// Low-level TGA readers; return a malloc'd pixel buffer (caller frees) or
// nullptr on failure -- confirm ownership convention in the .cpp.
uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);
3802
3803
// Parsed header fields from a Radiance RGBE (.HDR) file.
struct rgbe_header_info
{
	std::string m_program;

	// Note no validation is done, either gamma or exposure may be 0.
	double m_gamma;
	bool m_has_gamma;

	double m_exposure; // watts/steradian/m^2.
	bool m_has_exposure;

	// Resets to defaults: gamma/exposure 1.0 and both "has" flags false.
	void clear()
	{
		m_program.clear();
		m_gamma = 1.0f;
		m_has_gamma = false;
		m_exposure = 1.0f;
		m_has_exposure = false;
	}
};
3823
3824
// Radiance RGBE (.HDR) read/write; hdr_info carries the parsed/emitted header.
bool read_rgbe(const uint8_vec& filedata, imagef& img, rgbe_header_info& hdr_info);
bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info &hdr_info);

bool write_rgbe(uint8_vec& file_data, imagef& img, rgbe_header_info& hdr_info);
bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info);

// OpenEXR read (via TinyEXR); n_chans receives the source channel count.
bool read_exr(const char* pFilename, imagef& img, int& n_chans);
bool read_exr(const void* pMem, size_t mem_size, imagef& img);

// Bit flags for write_exr().
enum
{
	WRITE_EXR_LINEAR_HINT = 1, // hint for lossy comp. methods: exr_perceptual_treatment_t, logarithmic or linear, defaults to logarithmic
	WRITE_EXR_STORE_FLOATS = 2, // use 32-bit floats, otherwise it uses half floats
	WRITE_EXR_NO_COMPRESSION = 4 // no compression, otherwise it uses ZIP compression (16 scanlines per block)
};

// Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images.
bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags);

// Bit flags for save_png().
enum
{
	cImageSaveGrayscale = 1,
	cImageSaveIgnoreAlpha = 2
};

// grayscale_comp selects which component to write when saving grayscale.
bool save_png(const char* pFilename, const image& img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0);
inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); }

// Whole-file helpers; all return false on I/O failure.
bool read_file_to_vec(const char* pFilename, uint8_vec& data);
bool read_file_to_data(const char* pFilename, void *pData, size_t len);

bool write_data_to_file(const char* pFilename, const void* pData, size_t len);

// Empty vectors are written as zero-length files (can't take &v[0] of an empty vector).
inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) { return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "", 0); }

// Filtered resampling (LDR and float variants). pFilter names the kernel;
// wrapping selects edge wrap vs. clamp; first_comp/num_comps limit the
// channels processed.
bool image_resample(const image &src, image &dst, bool srgb = false,
	const char *pFilter = "lanczos4", float filter_scale = 1.0f,
	bool wrapping = false,
	uint32_t first_comp = 0, uint32_t num_comps = 4);

bool image_resample(const imagef& src, imagef& dst,
	const char* pFilter = "lanczos4", float filter_scale = 1.0f,
	bool wrapping = false,
	uint32_t first_comp = 0, uint32_t num_comps = 4);
3868
3869
// Timing

typedef uint64_t timer_ticks;

// Simple start/stop wall-clock timer built on a platform tick counter
// (see the .cpp for the tick source). Call interval_timer::init() once
// before use to capture the tick frequency.
class interval_timer
{
public:
	interval_timer();

	void start();
	void stop();

	// Elapsed time between start() and stop() (or now, if still running --
	// confirm exact semantics in the .cpp).
	double get_elapsed_secs() const;
	inline double get_elapsed_ms() const { return 1000.0f* get_elapsed_secs(); }

	static void init();
	static inline timer_ticks get_ticks_per_sec() { return g_freq; }
	static timer_ticks get_ticks();
	static double ticks_to_secs(timer_ticks ticks);
	static inline double ticks_to_ms(timer_ticks ticks) { return ticks_to_secs(ticks) * 1000.0f; }

private:
	static timer_ticks g_init_ticks, g_freq;
	static double g_timer_freq;

	timer_ticks m_start_time, m_stop_time;

	bool m_started, m_stopped;
};

// Current time in seconds, derived from the global tick counter.
inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); }
3900
3901
// fopen() wrapper: uses the MSVC-secure fopen_s() on Windows (avoiding the
// C4996 deprecation of fopen) and plain fopen() elsewhere. Returns nullptr
// on failure, like fopen().
inline FILE *fopen_safe(const char *pFilename, const char *pMode)
{
#ifdef _WIN32
	FILE *pFile = nullptr;
	fopen_s(&pFile, pFilename, pMode);
	return pFile;
#else
	return fopen(pFilename, pMode);
#endif
}
3911
3912
// Fills pBuf with size pseudorandom bytes derived deterministically from seed.
void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);

// Fixed 4x4 pixel block dimensions used throughout the encoder.
const uint32_t cPixelBlockWidth = 4;
const uint32_t cPixelBlockHeight = 4;
const uint32_t cPixelBlockTotalPixels = cPixelBlockWidth * cPixelBlockHeight;
3917
3918
// A 4x4 block of LDR pixels, stored row-major ([y][x]).
struct pixel_block
{
	color_rgba m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]

	// (x, y) accessors; coordinates are asserted in range.
	inline const color_rgba& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
	inline color_rgba& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }

	inline const color_rgba* get_ptr() const { return &m_pixels[0][0]; }
	inline color_rgba* get_ptr() { return &m_pixels[0][0]; }

	inline void clear() { clear_obj(*this); }

	// Bytewise equality of the 16 pixels.
	inline bool operator== (const pixel_block& rhs) const
	{
		return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;
	}
};
typedef basisu::vector<pixel_block> pixel_block_vec;
3936
3937
struct pixel_block_hdr
3938
{
3939
vec4F m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]
3940
3941
inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
3942
inline vec4F& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
3943
3944
inline const vec4F* get_ptr() const { return &m_pixels[0][0]; }
3945
inline vec4F* get_ptr() { return &m_pixels[0][0]; }
3946
3947
inline void clear() { clear_obj(*this); }
3948
3949
inline bool operator== (const pixel_block& rhs) const
3950
{
3951
return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;
3952
}
3953
};
3954
typedef basisu::vector<pixel_block_hdr> pixel_block_hdr_vec;
3955
3956
// HDR -> LDR tonemapping helpers (implemented in the .cpp).
void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure, bool add_noise = false, bool per_component = true, bool luma_scaling = false);
bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img);
bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img);

// Intersection
// Tag types used to select special constructors (clear / expand-ready / identity).
enum eClear { cClear = 0 };
enum eInitExpand { cInitExpand = 0 };
enum eIdentity { cIdentity = 0 };
3964
3965
// A parametric ray: origin + direction * t. vector_type supplies the
// dimensionality (vec2F, vec3F, ...).
template<typename vector_type>
class ray
{
public:
	typedef vector_type vector_t;
	typedef typename vector_type::scalar_type scalar_type;

	inline ray() { }
	inline ray(eClear) { clear(); }
	inline ray(const vector_type& origin, const vector_type& direction) : m_origin(origin), m_direction(direction) { }

	inline void clear()
	{
		m_origin.clear();
		m_direction.clear();
	}

	inline const vector_type& get_origin(void) const { return m_origin; }
	inline void set_origin(const vector_type& origin) { m_origin = origin; }

	inline const vector_type& get_direction(void) const { return m_direction; }
	inline void set_direction(const vector_type& direction) { m_direction = direction; }

	// Builds the ray from two points; the direction is normalized, so eval(t)
	// measures t in world units from start.
	inline void set_endpoints(const vector_type& start, const vector_type& end)
	{
		m_origin = start;

		m_direction = end - start;
		m_direction.normalize_in_place();
	}

	// Point on the ray at parameter t.
	inline vector_type eval(scalar_type t) const
	{
		return m_origin + m_direction * t;
	}

private:
	vector_type m_origin;
	vector_type m_direction;
};

typedef ray<vec2F> ray2F;
typedef ray<vec3F> ray3F;
4008
4009
// An axis-aligned interval (bounding box) over vector type T:
// m_bounds[0] = low corner, m_bounds[1] = high corner.
template<typename T>
class vec_interval
{
public:
	enum { N = T::num_elements };
	typedef typename T::scalar_type scalar_type;

	// Degenerate box at v, or an explicit [low, high] box.
	inline vec_interval(const T& v) { m_bounds[0] = v; m_bounds[1] = v; }
	inline vec_interval(const T& low, const T& high) { m_bounds[0] = low; m_bounds[1] = high; }

	inline vec_interval() { }
	inline vec_interval(eClear) { clear(); }
	inline vec_interval(eInitExpand) { init_expand(); }

	inline void clear() { m_bounds[0].clear(); m_bounds[1].clear(); }

	// Inverted (empty) box so that any expand() call establishes real bounds.
	// NOTE(review): set() is called with 3 components, so this member appears
	// to assume a 3-component T -- confirm before instantiating for vec1F/vec2F/vec4F.
	inline void init_expand()
	{
		m_bounds[0].set(1e+30f, 1e+30f, 1e+30f);
		m_bounds[1].set(-1e+30f, -1e+30f, -1e+30f);
	}

	// Grows the box (per component) to include point p.
	inline vec_interval expand(const T& p)
	{
		for (uint32_t c = 0; c < N; c++)
		{
			if (p[c] < m_bounds[0][c])
				m_bounds[0][c] = p[c];

			if (p[c] > m_bounds[1][c])
				m_bounds[1][c] = p[c];
		}

		return *this;
	}

	// Index 0 = low corner, 1 = high corner.
	inline const T& operator[] (uint32_t i) const { assert(i < 2); return m_bounds[i]; }
	inline T& operator[] (uint32_t i) { assert(i < 2); return m_bounds[i]; }

	const T& get_low() const { return m_bounds[0]; }
	T& get_low() { return m_bounds[0]; }

	const T& get_high() const { return m_bounds[1]; }
	T& get_high() { return m_bounds[1]; }

	// Extent of the box along one axis.
	scalar_type get_dim(uint32_t axis) const { return m_bounds[1][axis] - m_bounds[0][axis]; }

	// True if p is inside the box (inclusive on both faces).
	bool contains(const T& p) const
	{
		const T& low = get_low(), high = get_high();

		for (uint32_t i = 0; i < N; i++)
		{
			if (p[i] < low[i])
				return false;

			if (p[i] > high[i])
				return false;
		}
		return true;
	}

private:
	T m_bounds[2];
};
4074
4075
// Common interval instantiations; the aabb aliases are the usual spatial boxes.
typedef vec_interval<vec1F> vec_interval1F;
typedef vec_interval<vec2F> vec_interval2F;
typedef vec_interval<vec3F> vec_interval3F;
typedef vec_interval<vec4F> vec_interval4F;

typedef vec_interval1F aabb1F;
typedef vec_interval2F aabb2F;
typedef vec_interval3F aabb3F;
4083
4084
namespace intersection
{
	// Outcome codes shared by the intersection routines below.
	enum result
	{
		cBackfacing = -1,
		cFailure = 0,
		cSuccess,
		cParallel,
		cInside,
	};
4094
4095
// Returns cInside, cSuccess, or cFailure.
// Algorithm: Graphics Gems 1
// Slab-based ray/AABB intersection. On cSuccess, coord is the hit point and
// t the ray parameter; on cInside, coord is the ray origin and t = 0.
template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>
result ray_aabb(vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box)
{
	enum
	{
		cNumDim = vector_type::num_elements,
		cRight = 0,
		cLeft = 1,
		cMiddle = 2
	};

	// Classify the origin against each slab and pick the candidate entry plane.
	bool inside = true;
	int quadrant[cNumDim];
	scalar_type candidate_plane[cNumDim];

	for (int i = 0; i < cNumDim; i++)
	{
		if (ray.get_origin()[i] < box[0][i])
		{
			quadrant[i] = cLeft;
			candidate_plane[i] = box[0][i];
			inside = false;
		}
		else if (ray.get_origin()[i] > box[1][i])
		{
			quadrant[i] = cRight;
			candidate_plane[i] = box[1][i];
			inside = false;
		}
		else
		{
			quadrant[i] = cMiddle;
		}
	}

	// Origin inside the box on every axis: trivially done.
	if (inside)
	{
		coord = ray.get_origin();
		t = 0.0f;
		return cInside;
	}

	// Distance along the ray to each candidate plane (-1 = not a candidate).
	scalar_type max_t[cNumDim];
	for (int i = 0; i < cNumDim; i++)
	{
		if ((quadrant[i] != cMiddle) && (ray.get_direction()[i] != 0.0f))
			max_t[i] = (candidate_plane[i] - ray.get_origin()[i]) / ray.get_direction()[i];
		else
			max_t[i] = -1.0f;
	}

	// The entry point is on the plane reached last (largest t).
	int which_plane = 0;
	for (int i = 1; i < cNumDim; i++)
		if (max_t[which_plane] < max_t[i])
			which_plane = i;

	// All candidates behind the origin: the ray points away from the box.
	if (max_t[which_plane] < 0.0f)
		return cFailure;

	// Evaluate the hit point; reject it if it misses the box on any other axis.
	for (int i = 0; i < cNumDim; i++)
	{
		if (i != which_plane)
		{
			coord[i] = ray.get_origin()[i] + max_t[which_plane] * ray.get_direction()[i];

			if ((coord[i] < box[0][i]) || (coord[i] > box[1][i]))
				return cFailure;
		}
		else
		{
			coord[i] = candidate_plane[i];
		}

		assert(coord[i] >= box[0][i] && coord[i] <= box[1][i]);
	}

	t = max_t[which_plane];
	return cSuccess;
}
4176
4177
// Variant that also handles a ray starting inside the box: it then intersects
// a reversed ray cast from well outside the box to find the exit point, and
// remaps t back onto the original ray. started_within reports which case ran.
template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>
result ray_aabb(bool& started_within, vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box)
{
	if (!box.contains(ray.get_origin()))
	{
		started_within = false;
		return ray_aabb(coord, t, ray, box);
	}

	started_within = true;

	// A point guaranteed past the far side of the box (1.5x the diagonal).
	// NOTE(review): `typename vector_type::T` looks inconsistent with the
	// scalar_type typedef used elsewhere, and box.diagonal_length() isn't
	// declared on vec_interval above -- confirm this overload's instantiations.
	typename vector_type::T diag_dist = box.diagonal_length() * 1.5f;
	ray_type outside_ray(ray.eval(diag_dist), -ray.get_direction());

	result res(ray_aabb(coord, t, outside_ray, box));
	if (res != cSuccess)
		return res;

	// Convert the reversed-ray parameter back to the original ray's parameter.
	t = basisu::maximum(0.0f, diag_dist - t);
	return cSuccess;
}
4198
4199
} // intersect
4200
4201
// This float->half conversion matches how "F32TO16" works on Intel GPU's.
// Input cannot be negative, Inf or Nan.
inline basist::half_float float_to_half_non_neg_no_nan_inf(float val)
{
	// Pull apart the IEEE 754 single: 23-bit mantissa, 8-bit biased exponent.
	union { float f; int32_t i; uint32_t u; } fi = { val };
	const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF;
	int e = 0, m = 0;

	// Preconditions: sign bit clear, exponent not all-ones (no Inf/NaN).
	assert(((fi.i >> 31) == 0) && (flt_e != 0xFF));

	// not zero or denormal
	if (flt_e != 0)
	{
		int new_exp = flt_e - 127;
		if (new_exp > 15)
			e = 31; // overflow: saturate the half exponent
		else if (new_exp < -14)
			m = lrintf((1 << 24) * fabsf(fi.f)); // underflow: encode as half denormal
		else
		{
			// Normal range: rebias exponent, round the mantissa down to 10 bits.
			e = new_exp + 15;
			m = lrintf(flt_m * (1.0f / ((float)(1 << 13))));
		}
	}

	// Mantissa rounding may carry out (m == 1024); propagate into the exponent.
	assert((0 <= m) && (m <= 1024));
	if (m == 1024)
	{
		e++;
		m = 0;
	}

	assert((e >= 0) && (e <= 31));
	assert((m >= 0) && (m <= 1023));

	basist::half_float result = (basist::half_float)((e << 10) | m);
	return result;
}
4239
4240
// Type-punning union for viewing a float's raw IEEE 754 bits (and vice versa).
union fu32
{
	uint32_t u;
	float f;
};
4245
4246
// Supports positive and denormals only. No NaN or Inf.
// Fast half->float: shift the half's exponent/mantissa into float position,
// then multiply by 2^112 to rebias the exponent (this also correctly
// normalizes half denormals).
BASISU_FORCE_INLINE float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h)
{
	assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h));

	// add 112 to the exponent (112+half float's exp bias of 15=float32's bias of 127)
	static const fu32 K = { 0x77800000 };

	fu32 o;
	o.u = h << 13;
	o.f *= K.f;

	return o.f;
}
4260
4261
// Positive, negative, or denormals. No NaN or Inf. Clamped to MAX_HALF_FLOAT.
// Truncating (round-toward-zero) float->half; the sign is re-applied from the
// original input after converting the magnitude.
inline basist::half_float fast_float_to_half_trunc_no_nan_or_inf(float f)
{
	assert(!isnan(f) && !isinf(f));

	// Subtract 112 from the exponent, to change the bias from 127 to 15.
	static const fu32 g_f_to_h{ 0x7800000 };

	fu32 fu;

	// Clamp the magnitude to the largest representable half, then rebias.
	fu.f = minimum<float>((float)basist::MAX_HALF_FLOAT, fabsf(f)) * g_f_to_h.f;

	return (basist::half_float)(((fu.u >> (23 - 10)) & 0x7FFF) | ((f < 0.0f) ? 0x8000 : 0));
}
4275
4276
// Truncating float->half for inputs already known to be in [0, MAX_HALF_FLOAT]
// (asserted): no clamping, no sign handling, no NaN/Inf support.
inline basist::half_float fast_float_to_half_trunc_no_clamp_neg_nan_or_inf(float f)
{
	assert(!isnan(f) && !isinf(f));
	assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));

	// Subtract 112 from the exponent, to change the bias from 127 to 15.
	static const fu32 g_f_to_h{ 0x7800000 };

	fu32 fu;

	fu.f = f * g_f_to_h.f;

	// Drop the low 13 mantissa bits (truncation) and keep exponent+mantissa.
	return (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);
}
4290
4291
// Rounding float->half for inputs already in [0, MAX_HALF_FLOAT] (asserted):
// like the _trunc variant but rounds to nearest, ties to even, then saturates
// the result to the largest finite half.
inline basist::half_float fast_float_to_half_no_clamp_neg_nan_or_inf(float f)
{
	assert(!isnan(f) && !isinf(f));
	assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT));

	// Subtract 112 from the exponent, to change the bias from 127 to 15.
	static const fu32 g_f_to_h{ 0x7800000 };

	fu32 fu;

	fu.f = f * g_f_to_h.f;

	uint32_t h = (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF);

	// round to even or nearest
	uint32_t mant = fu.u & 8191; // examine lowest 13 bits
	// > half-ULP: round up; exactly half: round up only if h is odd (ties-to-even).
	uint32_t inc = (mant > 4096) | ((mant == 4096) & (h & 1));
	h += inc;

	// Rounding can overflow past the largest finite half; saturate.
	if (h > basist::MAX_HALF_FLOAT_AS_INT_BITS)
		h = basist::MAX_HALF_FLOAT_AS_INT_BITS;

	return (basist::half_float)h;
}
4315
4316
} // namespace basisu
4317
4318
#include "basisu_math.h"
4319
4320