CoCalc -- basisu_astc_hdr_6x6

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp
⁹⁹⁰⁴ views
1
// File: basisu_astc_hdr_6x6_enc.cpp
2
#include "basisu_astc_hdr_6x6_enc.h"
3
#include "basisu_enc.h"
4
#include "basisu_astc_hdr_common.h"
5
#include "basisu_math.h"
6
#include "basisu_resampler.h"
7
#include "basisu_resampler_filters.h"
8

9
#define MINIZ_HEADER_FILE_ONLY
10
#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
11
#include "basisu_miniz.h"
12

13
#include "3rdparty/android_astc_decomp.h"
14

15
#include <array>
16

17
using namespace basisu;
18
using namespace buminiz;
19
using namespace basist::astc_6x6_hdr;
20

21
namespace astc_6x6_hdr
22
{
23

24
// Atomically raises atomic_var to new_value if new_value is larger; otherwise leaves it untouched.
// All accesses use relaxed ordering, so this only guarantees the stored maximum itself, not any
// ordering relative to other memory operations.
static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)
{
	uint32_t observed = atomic_var.load(std::memory_order_relaxed);

	// Only attempt the read-modify-write while our value would actually raise the stored one.
	// A failed compare_exchange_weak refreshes 'observed' with the current contents, so the loop
	// re-tests against the latest value and exits as soon as another thread has stored something >= new_value.
	while (observed < new_value)
	{
		if (atomic_var.compare_exchange_weak(observed, new_value, std::memory_order_relaxed, std::memory_order_relaxed))
			break;
	}
}
34

35
void astc_hdr_6x6_global_config::set_user_level(int level)
36
{
37
	level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);
38

39
	m_master_comp_level = 0;
40
	m_highest_comp_level = 0;
41
	m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;
42
	m_extra_patterns_flag = false;
43
	m_brute_force_partition_matching = false;
44

45
	switch (level)
46
	{
47
	case 0:
48
	{
49
		// Both reduce compression a lot when lambda>0
50
		m_favor_higher_compression = false;
51
		m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;
52
		break;
53
	}
54
	case 1:
55
	{
56
		m_master_comp_level = 0;
57
		m_highest_comp_level = 0;
58
		break;
59
	}
60
	case 2:
61
	{
62
		m_master_comp_level = 0;
63
		m_highest_comp_level = 1;
64
		break;
65
	}
66
	case 3:
67
	{
68
		m_master_comp_level = 1;
69
		m_highest_comp_level = 1;
70
		break;
71
	}
72
	case 4:
73
	{
74
		m_master_comp_level = 1;
75
		m_highest_comp_level = 2;
76
		break;
77
	}
78
	case 5:
79
	{
80
		m_master_comp_level = 1;
81
		m_highest_comp_level = 3;
82
		break;
83
	}
84
	case 6:
85
	{
86
		m_master_comp_level = 1;
87
		m_highest_comp_level = 4;
88
		break;
89
	}
90
	case 7:
91
	{
92
		m_master_comp_level = 2;
93
		m_highest_comp_level = 2;
94
		break;
95
	}
96
	case 8:
97
	{
98
		m_master_comp_level = 2;
99
		m_highest_comp_level = 3;
100
		break;
101
	}
102
	case 9:
103
	{
104
		m_master_comp_level = 2;
105
		m_highest_comp_level = 4;
106
		break;
107
	}
108
	case 10:
109
	{
110
		m_master_comp_level = 3;
111
		m_highest_comp_level = 3;
112
		break;
113
	}
114
	case 11:
115
	{
116
		m_master_comp_level = 3;
117
		m_highest_comp_level = 4;
118
		break;
119
	}
120
	case 12:
121
	default:
122
	{
123
		m_master_comp_level = 4;
124
		m_highest_comp_level = 4;
125
		m_extra_patterns_flag = true;
126
		m_brute_force_partition_matching = true;
127
		break;
128
	}
129
	}
130
}
131

132
// Constants of the PQ transfer function used by forwardPQ() (and by the disabled inversePQ()).
const float m1 = 0.1593017578125f;    // 2610 / 2^14
const float m2 = 78.84375f;           // 2523 / 32
const float c1 = 0.8359375f;          // 3424 / 2^12
const float c2 = 18.8515625f;         // 2413 / 128
const float c3 = 18.6875f;            // 2392 / 128

// Exact (powf based) forward PQ curve.
// Y is absolute luminance in nits (cd/m^2); 10,000 nits maps to 1.0.
static float forwardPQ(float Y)
{
	// Normalize against the 10,000 nit absolute scale.
	const float normalized = Y * (1.0f / 10000.0f);

	const float powered = powf(normalized, m1);

	const float numer = c1 + c2 * powered;
	const float denom = 1 + c3 * powered;

	return powf(numer / denom, m2);
}
148

149
#if 0
150
static float inversePQ(float E)
151
{
152
	float N = powf(E, 1.0f / m2);
153

154
	float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N);
155
	float L = powf(num, 1.0f / m1);
156

157
	return L * 10000.0f;
158
}
159
#endif
160

161
// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries.
162
// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86
163
// Highest error is for values less than SMALLEST_PQ_VAL_IN.
164
//
165
// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:
166
// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096): 
167
// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x
168
//
169
// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:
170
// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless
171

172
// Range of bfloat16 exponents covered by the lookup tables (inclusive on both ends).
const int PQ_APPROX_MIN_EXP = -16;
const int PQ_APPROX_MAX_EXP = 16;
const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);

// Inputs whose exponent is below PQ_APPROX_MIN_EXP are handled by a linear ramp up to this point.
const float SMALLEST_PQ_VAL_IN = 0.000015258829080f;
const float SMALLEST_PQ_VAL = 0.000551903737f;		// forwardPQ(SMALLEST_PQ_VAL_IN)

// Value returned for inputs whose exponent is above PQ_APPROX_MAX_EXP.
const float LARGEST_PQ_VAL = 1.251312f;

// One row per exponent, one column per 7-bit mantissa; filled in by init_pq_tables().
float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];
181

182
static void init_pq_tables()
183
{
184
	for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++)
185
	{
186
		for (int mant = 0; mant < 128; mant++)
187
		{
188
			bfloat16 b = bfloat16_init(1, exp, mant);
189
			float bf = bfloat16_to_float(b);
190

191
			float pq = forwardPQ(bf);
192

193
			g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq;
194
		}
195
	}
196

197
	//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));
198
	//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));
199
}
200

201
static inline float forwardPQTab(float v)
202
{
203
	assert(g_pq_approx_tabs[0][0]);
204

205
	assert(v >= 0.0f);
206
	if (v == 0.0f)
207
		return 0.0f;
208

209
	bfloat16 bf = float_to_bfloat16(v, false);
210
	assert(v >= bfloat16_to_float(bf));
211

212
	int exp = bfloat16_get_exp(bf);
213

214
	if (exp < PQ_APPROX_MIN_EXP)
215
	{
216
		// not accurate but should be good enough for our uses
217
		return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));
218
	}
219
	else if (exp > PQ_APPROX_MAX_EXP)
220
		return LARGEST_PQ_VAL;
221

222
	int mant = bfloat16_get_mantissa(bf);
223

224
	float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];
225
	float bf_f32 = bfloat16_to_float(bf);
226

227
	int next_mant = mant + 1;
228
	int next_exp = exp;
229
	if (next_mant == 128)
230
	{
231
		next_mant = 0;
232
		next_exp++;
233
		if (next_exp > PQ_APPROX_MAX_EXP)
234
			return a;
235
	}
236

237
	float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];
238

239
	bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);
240
	float next_bf_f32 = bfloat16_to_float(next_bf);
241
	assert(v <= next_bf_f32);
242

243
	float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);
244
	assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));
245

246
	return lerp(a, b, lerp_factor);
247
}
248

249
// 100 nits = ~.5 i
250
// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2. 
251
// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).
252
// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.
253
//
254
// ITP info:
255
// https://www.portrait.com/resource-center/ictcp-color-difference-metric/
256
// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)
257
// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP.
258
//
259
// Linear REC709 to REC2020/BT.2100 gamut conversion:
260
// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;
261
// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;
262
// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;
263
// const float S = 1.0f / 4096.0f;
264
// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];
265
// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];
266
// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];
267
static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)
268
{
269
	vec3F rgb_2100(rgb_in);
270
	
271
	float l, m, s;
272
	if (!rec2020_bt2100_color_gamut)
273
	{
274
		// Assume REC 709 input color gamut
275
		// (REC2020_to_LMS * REC709_to_2020) * input_color
276
		l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;
277
		m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;
278
		s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;
279
	}
280
	else
281
	{
282
		// Assumes REC2020/BT.2100 input color gamut (this is from the spec)
283
		l = 0.412109375f    * rgb_2100[0] + 0.52392578125f  * rgb_2100[1] + 0.06396484375f * rgb_2100[2];
284
		m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];
285
		s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f   * rgb_2100[2];
286
	}
287

288
	float ld = forwardPQTab(l);
289
	float md = forwardPQTab(m);
290
	float sd = forwardPQTab(s);
291

292
	ictcp[0] = .5f * ld + .5f * md;
293

294
	// if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)
295
	if (itp_flag)
296
		ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;
297
	else
298
		ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;
299

300
	ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;
301
}
302

303
static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)
304
{
305
	linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);
306
}
307

308
#if 0
309
// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).
310
static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)
311
{
312
	float ct = ictcp[1];
313

314
	if (itp_flag)
315
		ct *= 2.0f;
316

317
	float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f;
318
	float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;
319
	float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;
320

321
	float l = inversePQ(ld);
322
	float m = inversePQ(md);
323
	float s = inversePQ(sd);
324

325
	rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;
326
	rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;
327
	rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;
328
}
329
#endif
330

331
struct half_vec3
332
{
333
	basist::half_float m_vals[3];
334

335
	inline half_vec3() { }
336

337
	inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)
338
	{
339
		m_vals[0] = x;
340
		m_vals[1] = y;
341
		m_vals[2] = z;
342
	}
343

344
	inline half_vec3(const half_vec3& other)
345
	{
346
		*this = other;
347
	}
348

349
	inline half_vec3& operator= (const half_vec3& rhs)
350
	{
351
		m_vals[0] = rhs.m_vals[0];
352
		m_vals[1] = rhs.m_vals[1];
353
		m_vals[2] = rhs.m_vals[2];
354
		return *this;
355
	}
356

357
	inline void clear()
358
	{
359
		clear_obj(m_vals);
360
	}
361

362
	inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)
363
	{
364
		m_vals[0] = x;
365
		m_vals[1] = y;
366
		m_vals[2] = z;
367
		return *this;
368
	}
369

370
	inline half_vec3& set(float x, float y, float z)
371
	{
372
		m_vals[0] = basist::float_to_half(x);
373
		m_vals[1] = basist::float_to_half(y);
374
		m_vals[2] = basist::float_to_half(z);
375
		return *this;
376
	}
377

378
	template<typename T>
379
	inline half_vec3& set_vec(const T& vec)
380
	{
381
		m_vals[0] = basist::float_to_half(vec[0]);
382
		m_vals[1] = basist::float_to_half(vec[1]);
383
		m_vals[2] = basist::float_to_half(vec[2]);
384
		return *this;
385
	}
386

387
	template<typename T>
388
	inline T get_vec() const
389
	{
390
		return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]));
391
	}
392

393
	inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }
394
	inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }
395

396
	float get_float_comp(uint32_t c) const
397
	{
398
		assert(c < 3);
399
		return basist::half_to_float(m_vals[c]);
400
	}
401

402
	half_vec3& set_float_comp(uint32_t c, float v)
403
	{
404
		assert(c < 3);
405
		m_vals[c] = basist::float_to_half(v);
406
		return *this;
407
	}
408
};
409

410
struct half_vec4
411
{
412
	basist::half_float m_vals[4];
413

414
	inline half_vec4() { }
415

416
	inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
417
	{
418
		m_vals[0] = x;
419
		m_vals[1] = y;
420
		m_vals[2] = z;
421
		m_vals[3] = w;
422
	}
423

424
	inline half_vec4(const half_vec4& other)
425
	{
426
		*this = other;
427
	}
428

429
	inline half_vec4& operator= (const half_vec4& rhs)
430
	{
431
		m_vals[0] = rhs.m_vals[0];
432
		m_vals[1] = rhs.m_vals[1];
433
		m_vals[2] = rhs.m_vals[2];
434
		m_vals[3] = rhs.m_vals[3];
435
		return *this;
436
	}
437

438
	inline void clear()
439
	{
440
		clear_obj(m_vals);
441
	}
442

443
	inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
444
	{
445
		m_vals[0] = x;
446
		m_vals[1] = y;
447
		m_vals[2] = z;
448
		m_vals[3] = w;
449
		return *this;
450
	}
451

452
	inline half_vec4& set(float x, float y, float z, float w)
453
	{
454
		m_vals[0] = basist::float_to_half(x);
455
		m_vals[1] = basist::float_to_half(y);
456
		m_vals[2] = basist::float_to_half(z);
457
		m_vals[3] = basist::float_to_half(w);
458
		return *this;
459
	}
460

461
	template<typename T>
462
	inline half_vec4& set_vec(const T& vec)
463
	{
464
		m_vals[0] = basist::float_to_half(vec[0]);
465
		m_vals[1] = basist::float_to_half(vec[1]);
466
		m_vals[2] = basist::float_to_half(vec[2]);
467
		m_vals[3] = basist::float_to_half(vec[3]);
468
		return *this;
469
	}
470

471
	template<typename T>
472
	inline T get_vec() const
473
	{
474
		return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3]));
475
	}
476

477
	inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }
478
	inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }
479

480
	float get_float_comp(uint32_t c) const
481
	{
482
		assert(c < 4);
483
		return basist::half_to_float(m_vals[c]);
484
	}
485

486
	half_vec4& set_float_comp(uint32_t c, float v)
487
	{
488
		assert(c < 4);
489
		m_vals[c] = basist::float_to_half(v);
490
		return *this;
491
	}
492
};
493

494
const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6;
495

496
struct trial_result
497
{
498
	astc_helpers::log_astc_block m_log_blk;
499
	double m_err;
500
	bool m_valid;
501
};
502

503
//----------------------------------------------------------
504

505
// All 3! = 6 relabelings of a three-subset partition: g_part3_mapping[perm][old_label] -> new_label.
// Used by partition_pattern_vec::get_permuted3().
const uint32_t NUM_PART3_MAPPINGS = 6;
static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] =
{
	{ 0, 1, 2 },	// identity
	{ 1, 2, 0 },
	{ 2, 0, 1 },
	{ 0, 2, 1 },
	{ 1, 0, 2 },
	{ 2, 1, 0 }
};
515

516
struct partition_pattern_vec
517
{
518
	uint8_t m_parts[6 * 6];
519

520
	partition_pattern_vec()
521
	{
522
		clear();
523
	}
524

525
	partition_pattern_vec(const partition_pattern_vec& other)
526
	{
527
		*this = other;
528
	}
529

530
	void clear()
531
	{
532
		memset(m_parts, 0, sizeof(m_parts));
533
	}
534

535
	partition_pattern_vec& operator= (const partition_pattern_vec& rhs)
536
	{
537
		if (this == &rhs)
538
			return *this;
539
		memcpy(m_parts, rhs.m_parts, 36);
540
		return *this;
541
	}
542

543
	uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }
544
	uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }
545

546
	uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
547
	uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
548

549
	int get_squared_distance(const partition_pattern_vec& other) const
550
	{
551
		int total_dist = 0;
552
		for (uint32_t i = 0; i < 36; i++)
553
			total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);
554
		return total_dist;
555
	}
556

557
	float get_distance(const partition_pattern_vec& other) const
558
	{
559
		return sqrtf((float)get_squared_distance(other));
560
	}
561

562
	partition_pattern_vec get_permuted2(uint32_t permute_index) const
563
	{
564
		assert(permute_index <= 1);
565

566
		partition_pattern_vec res;
567
		for (uint32_t i = 0; i < 36; i++)
568
		{
569
			assert(m_parts[i] <= 1);
570
			res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);
571
		}
572

573
		return res;
574
	}
575

576
	partition_pattern_vec get_permuted3(uint32_t permute_index) const
577
	{
578
		assert(permute_index <= 5);
579

580
		partition_pattern_vec res;
581
		for (uint32_t i = 0; i < 36; i++)
582
		{
583
			assert(m_parts[i] <= 2);
584
			res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];
585
		}
586

587
		return res;
588
	}
589

590
	partition_pattern_vec get_canonicalized() const
591
	{
592
		partition_pattern_vec res;
593

594
		int new_labels[3] = { -1, -1, -1 };
595
		uint32_t next_index = 0;
596
		for (uint32_t i = 0; i < 36; i++)
597
		{
598
			uint32_t p = m_parts[i];
599
			if (new_labels[p] == -1)
600
				new_labels[p] = next_index++;
601

602
			res.m_parts[i] = (uint8_t)new_labels[p];
603
		}
604

605
		return res;
606
	}
607

608
	bool operator== (const partition_pattern_vec& rhs) const
609
	{
610
		return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;
611
	}
612

613
	operator size_t() const
614
	{
615
		return basisu::hash_hsieh(m_parts, sizeof(m_parts));
616
	}
617
};
618

619
struct vp_tree_node
620
{
621
	partition_pattern_vec m_vantage_point;
622
	uint32_t m_point_index;
623
	float m_dist;
624

625
	int m_inner_node, m_outer_node;
626
};
627

628
#define BRUTE_FORCE_PART_SEARCH (0)
629

630
class vp_tree
631
{
632
public:
633
	vp_tree()
634
	{
635
	}
636

637
	void clear()
638
	{
639
		m_nodes.clear();
640
	}
641

642
	// This requires no redundant patterns, i.e. all must be unique.
643
	bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)
644
	{
645
		clear();
646

647
		uint_vec pat_indices(n);
648
		for (uint32_t i = 0; i < n; i++)
649
			pat_indices[i] = i;
650

651
		std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
652

653
		if (root_idx.first == -1)
654
			return false;
655

656
		m_nodes.resize(1);
657
		m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];
658
		m_nodes[0].m_point_index = root_idx.first;
659
		m_nodes[0].m_dist = root_idx.second;
660
		m_nodes[0].m_inner_node = -1;
661
		m_nodes[0].m_outer_node = -1;
662

663
		uint_vec inner_list, outer_list;
664
		
665
		inner_list.reserve(n / 2);
666
		outer_list.reserve(n / 2);
667

668
		for (uint32_t pat_index = 0; pat_index < n; pat_index++)
669
		{
670
			if ((int)pat_index == root_idx.first)
671
				continue;
672

673
			const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);
674

675
			if (dist <= root_idx.second)
676
				inner_list.push_back(pat_index);
677
			else
678
				outer_list.push_back(pat_index);
679
		}
680

681
		if (inner_list.size())
682
		{
683
			m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);
684
			if (m_nodes[0].m_inner_node < 0)
685
				return false;
686
		}
687

688
		if (outer_list.size())
689
		{
690
			m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);
691
			if (m_nodes[0].m_outer_node < 0)
692
				return false;
693
		}
694

695
		return true;
696
	}
697

698
	struct result
699
	{
700
		uint32_t m_pat_index;
701
		uint32_t m_mapping_index;
702
		float m_dist;
703

704
		bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }
705
		bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }
706
	};
707

708
	class result_queue
709
	{
710
		enum { MaxSupportedSize = 256 + 1 };
711

712
	public:
713
		result_queue() : 
714
			m_cur_size(0) 
715
		{
716
		}
717

718
		size_t get_size() const
719
		{
720
			return m_cur_size;
721
		}
722

723
		bool empty() const
724
		{
725
			return !m_cur_size;
726
		}
727

728
		typedef std::array<result, MaxSupportedSize + 1> result_array_type;
729

730
		const result_array_type& get_elements() const { return m_elements; }
731
		result_array_type& get_elements() { return m_elements; }
732

733
		void clear()
734
		{
735
			m_cur_size = 0;
736
		}
737

738
		void reserve(uint32_t n)
739
		{
740
			BASISU_NOTE_UNUSED(n);
741
		}
742

743
		const result& top() const
744
		{
745
			assert(m_cur_size);
746
			return m_elements[1];
747
		}
748

749
		bool insert(const result& val, uint32_t max_size)
750
		{
751
			assert(max_size < MaxSupportedSize);
752

753
			if (m_cur_size >= MaxSupportedSize)
754
				return false;
755

756
			m_elements[++m_cur_size] = val;
757
			up_heap(m_cur_size);
758

759
			if (m_cur_size > max_size)
760
				pop();
761

762
			return true;
763
		}
764

765
		bool pop()
766
		{
767
			if (m_cur_size == 0) 
768
				return false;
769

770
			m_elements[1] = m_elements[m_cur_size--];
771
			down_heap(1);
772
			return true;
773
		}
774
								
775
		float get_highest_dist() const
776
		{
777
			if (!m_cur_size)
778
				return 0.0f;
779

780
			return top().m_dist;
781
		}
782
	
783
	private:
784
		result_array_type m_elements;
785
		size_t m_cur_size;
786

787
		void up_heap(size_t index)
788
		{
789
			while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))
790
			{
791
				std::swap(m_elements[index], m_elements[index >> 1]);
792
				index >>= 1;
793
			}
794
		}
795

796
		void down_heap(size_t index)
797
		{
798
			for ( ; ; )
799
			{
800
				size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;
801

802
				if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))
803
					largest = left_child;
804

805
				if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))
806
					largest = right_child;
807

808
				if (largest == index)
809
					break;
810

811
				std::swap(m_elements[index], m_elements[largest]);
812
				index = largest;
813
			}
814
		}
815
	};
816
		
817
	void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)
818
	{
819
		assert((num_subsets >= 2) && (num_subsets <= 3));
820

821
		results.clear();
822

823
		if (!m_nodes.size())
824
			return;
825

826
		uint32_t num_desired_pats;
827
		partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];
828

829
		if (num_subsets == 2)
830
		{
831
			num_desired_pats = 2;
832
			for (uint32_t i = 0; i < 2; i++)
833
				desired_pats[i] = desired_pat.get_permuted2(i);
834
		}
835
		else
836
		{
837
			num_desired_pats = NUM_PART3_MAPPINGS;
838
			for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)
839
				desired_pats[i] = desired_pat.get_permuted3(i);
840
		}
841

842
#if 0
843
		find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);
844
#else
845
		find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);
846
#endif
847
	}
848

849
private:
850
	basisu::vector<vp_tree_node> m_nodes;
851

852
	void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
853
	{
854
		float best_dist_to_vantage = BIG_FLOAT_VAL;
855
		uint32_t best_mapping = 0;
856
		for (uint32_t i = 0; i < num_desired_pats; i++)
857
		{
858
			float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
859
			if (dist < best_dist_to_vantage)
860
			{
861
				best_dist_to_vantage = dist;
862
				best_mapping = i;
863
			}
864
		}
865

866
		result r;
867
		r.m_dist = best_dist_to_vantage;
868
		r.m_mapping_index = best_mapping;
869
		r.m_pat_index = m_nodes[node_index].m_point_index;
870

871
		results.insert(r, max_results);
872

873
		if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
874
		{
875
			// inner first
876
			if (m_nodes[node_index].m_inner_node >= 0)
877
				find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
878

879
			if (m_nodes[node_index].m_outer_node >= 0)
880
			{
881
				if ( (results.get_size() < max_results) || 
882
					((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
883
					)
884
				{
885
					find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
886
				}
887
			}
888
		}
889
		else
890
		{
891
			// outer first
892
			if (m_nodes[node_index].m_outer_node >= 0)
893
				find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
894

895
			if (m_nodes[node_index].m_inner_node >= 0)
896
			{
897
				if ( (results.get_size() < max_results) || 
898
					((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
899
					)
900
				{
901
					find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
902
				}
903
			}
904
		}
905
	}
906
		
907
	void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
908
	{
909
		uint_vec node_stack;
910
		node_stack.reserve(16);
911
		node_stack.push_back(init_node_index);
912
		
913
		do
914
		{
915
			const uint32_t node_index = node_stack.back();
916
			node_stack.pop_back();
917

918
			float best_dist_to_vantage = BIG_FLOAT_VAL;
919
			uint32_t best_mapping = 0;
920
			for (uint32_t i = 0; i < num_desired_pats; i++)
921
			{
922
				float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
923
				if (dist < best_dist_to_vantage)
924
				{
925
					best_dist_to_vantage = dist;
926
					best_mapping = i;
927
				}
928
			}
929

930
			result r;
931
			r.m_dist = best_dist_to_vantage;
932
			r.m_mapping_index = best_mapping;
933
			r.m_pat_index = m_nodes[node_index].m_point_index;
934

935
			results.insert(r, max_results);
936

937
			if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
938
			{
939
				if (m_nodes[node_index].m_outer_node >= 0)
940
				{
941
					if ((results.get_size() < max_results) ||
942
						((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
943
						)
944
					{
945
						node_stack.push_back(m_nodes[node_index].m_outer_node);
946
					}
947
				}
948

949
				// inner first
950
				if (m_nodes[node_index].m_inner_node >= 0)
951
				{
952
					node_stack.push_back(m_nodes[node_index].m_inner_node);
953
				}
954
			}
955
			else
956
			{
957
				if (m_nodes[node_index].m_inner_node >= 0)
958
				{
959
					if ((results.get_size() < max_results) ||
960
						((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
961
						)
962
					{
963
						node_stack.push_back(m_nodes[node_index].m_inner_node);
964
					}
965
				}
966

967
				// outer first
968
				if (m_nodes[node_index].m_outer_node >= 0)
969
				{
970
					node_stack.push_back(m_nodes[node_index].m_outer_node);
971
				}
972
			}
973

974
		} while (!node_stack.empty());
975
	}
976

977
	// returns the index of the new node, or -1 on error
978
	int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)
979
	{
980
		std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
981

982
		if (root_idx.first < 0)
983
			return -1;
984

985
		m_nodes.resize(m_nodes.size() + 1);
986
		const uint32_t new_node_index = m_nodes.size_u32() - 1;
987
				
988
		m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];
989
		m_nodes[new_node_index].m_point_index = root_idx.first;
990
		m_nodes[new_node_index].m_dist = root_idx.second;
991
		m_nodes[new_node_index].m_inner_node = -1;
992
		m_nodes[new_node_index].m_outer_node = -1;
993

994
		uint_vec inner_list, outer_list;
995

996
		inner_list.reserve(pat_indices.size_u32() / 2);
997
		outer_list.reserve(pat_indices.size_u32() / 2);
998

999
		for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)
1000
		{
1001
			const uint32_t pat_index = pat_indices[pat_indices_iter];
1002

1003
			if ((int)pat_index == root_idx.first)
1004
				continue;
1005

1006
			const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);
1007

1008
			if (dist <= root_idx.second)
1009
				inner_list.push_back(pat_index);
1010
			else
1011
				outer_list.push_back(pat_index);
1012
		}
1013

1014
		if (inner_list.size())
1015
			m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);
1016

1017
		if (outer_list.size())
1018
			m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);
1019

1020
		return new_node_index;
1021
	}
1022

1023
	// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
1024
	std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)
1025
	{
1026
		BASISU_NOTE_UNUSED(num_unique_pats);
1027

1028
		const uint32_t n = pat_indices.size_u32();
1029

1030
		assert(n);
1031
		if (n == 1)
1032
			return std::pair(pat_indices[0], 0.0f);
1033

1034
		float best_split_metric = -1.0f;
1035
		int best_split_pat = -1;
1036
		float best_split_dist = 0.0f;
1037
		float best_split_var = 0.0f;
1038

1039
		basisu::vector< std::pair<float, uint32_t> > dists;
1040
		dists.reserve(n);
1041
		
1042
		float_vec float_dists;
1043
		float_dists.reserve(n);
1044
				
1045
		for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)
1046
		{
1047
			const uint32_t split_pat_index = pat_indices[pat_indices_iter];
1048
			assert(split_pat_index < num_unique_pats);
1049

1050
			const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];
1051
		
1052
			dists.resize(0);
1053
			float_dists.resize(0);
1054

1055
			for (uint32_t j = 0; j < n; j++)
1056
			{
1057
				const uint32_t pat_index = pat_indices[j];
1058
				assert(pat_index < num_unique_pats);
1059

1060
				if (pat_index == split_pat_index)
1061
					continue;
1062
				
1063
				float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
1064
				dists.emplace_back(std::pair(dist, pat_index));
1065

1066
				float_dists.push_back(dist);
1067
			}
1068

1069
			stats<double> s;
1070
			s.calc(float_dists.size_u32(), float_dists.data());
1071

1072
			std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {
1073
				return a.first < b.first;
1074
				});
1075

1076
			const uint32_t num_dists = dists.size_u32();
1077
			float split_dist = dists[num_dists / 2].first;
1078
			if ((num_dists & 1) == 0)
1079
				split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;
1080

1081
			uint32_t total_inner = 0, total_outer = 0;
1082
			
1083
			for (uint32_t j = 0; j < n; j++)
1084
			{
1085
				const uint32_t pat_index = pat_indices[j];
1086
				if (pat_index == split_pat_index)
1087
					continue;
1088
				
1089
				float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
1090

1091
				if (dist <= split_dist)
1092
					total_inner++;
1093
				else
1094
					total_outer++;
1095
			}
1096

1097
			float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);
1098
			
1099
			if ( (split_metric > best_split_metric) ||
1100
				 ((split_metric == best_split_metric) && (s.m_var > best_split_var)) )
1101
			{
1102
				best_split_metric = split_metric;
1103
				best_split_dist = split_dist;
1104
				best_split_pat = split_pat_index;
1105
				best_split_var = (float)s.m_var;
1106
			}
1107
		}
1108

1109
		return std::pair(best_split_pat, best_split_dist);
1110
	}
1111
};
1112

1113
struct partition
1114
{
1115
	uint64_t m_p;
1116

1117
	inline partition() : 
1118
		m_p(0)
1119
	{
1120
	}
1121

1122
	inline partition(uint64_t p) :
1123
		m_p(p)
1124
	{
1125
		assert(p < (1ULL << 36));
1126
	}
1127

1128
	inline partition& operator=(uint64_t p)
1129
	{
1130
		assert(p < (1ULL << 36));
1131
		m_p = p;
1132
		return *this;
1133
	}
1134

1135
	inline bool operator< (const partition& p) const
1136
	{
1137
		return m_p < p.m_p;
1138
	}
1139

1140
	inline bool operator== (const partition& p) const
1141
	{
1142
		return m_p == p.m_p;
1143
	}
1144

1145
	inline operator size_t() const
1146
	{
1147
		return hash_hsieh((const uint8_t *)&m_p, sizeof(m_p));
1148
	}
1149
};
1150

1151
// Per-texel subset assignments of every unique 2-subset 6x6 pattern, indexed by unique pattern index. Filled by init_partitions2_6x6().
partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];
1152
// Reverse lookup: partition seed index (0..1023) -> index into g_partitions2. init_partitions2_6x6() only writes the entries for seeds listed in g_part2_unique_index_to_seed.
int g_part2_seed_to_unique_index[1024];
1153
// Search tree built over g_partitions2 by init_partitions2_6x6(); queried by estimate_partition2_6x6().
vp_tree g_part2_vp_tree;
1154

1155
// Returns axis scaled by bu_math::inv_sqrt(axis.norm()), or the constant vector (0.577350269, ...)
// when |axis.norm()| is below SMALL_FLOAT_VAL.
// NOTE(review): this is only a normalization if norm() yields the squared length - confirm.
static inline vec3F vec3F_norm_approx(vec3F axis)
{
	const float n = axis.norm();

	if (fabs(n) >= SMALL_FLOAT_VAL)
		return axis * bu_math::inv_sqrt(n);

	return vec3F(0.577350269f);
}
1161

1162
static void init_partitions2_6x6()
1163
{
1164
#if 0
1165
	// makes pattern bits to the 10-bit ASTC seed index
1166
	typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;
1167
	partition2_hash_map phash;
1168
	phash.reserve(1024);
1169

1170
	for (uint32_t i = 0; i < 1024; i++)
1171
	{
1172
		uint64_t p_bits = 0;
1173
		uint64_t p_bits_inv = 0;
1174
				
1175
		for (uint32_t y = 0; y < 6; y++)
1176
		{
1177
			for (uint32_t x = 0; x < 6; x++)
1178
			{
1179
				uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);
1180
				assert(p < 2);
1181
								
1182
				p_bits |= (p << (x + y * 6));
1183
				p_bits_inv |= ((1 - p) << (x + y * 6));
1184
			}
1185
		}
1186
				
1187
		if (!p_bits)
1188
			continue;
1189
		if (p_bits == ((1ULL << 36) - 1))
1190
			continue;
1191

1192
		assert(p_bits < (1ULL << 36));
1193
		assert(p_bits_inv < (1ULL << 36));
1194

1195
		if (phash.contains(p_bits))
1196
		{
1197
		}
1198
		else if (phash.contains(p_bits_inv))
1199
		{
1200
		}
1201
		else
1202
		{
1203
			auto res = phash.insert(p_bits, i);
1204
			assert(res.second);
1205
			BASISU_NOTE_UNUSED(res);
1206
		}
1207
	}
1208
		
1209
	uint32_t num_unique_partitions2 = 0;
1210
		
1211
	for (const auto& r : phash)
1212
	{
1213
		assert(r.second < 1024);
1214
		
1215
		const uint32_t unique_index = num_unique_partitions2;
1216
		assert(unique_index < NUM_UNIQUE_PARTITIONS2);
1217

1218
		partition_pattern_vec pat_vec;
1219
		for (uint32_t i = 0; i < 36; i++)
1220
			pat_vec[i] = (uint8_t)((r.first >> i) & 1);
1221

1222
		g_partitions2[unique_index] = pat_vec;
1223
		
1224
		assert(g_part2_unique_index_to_seed[unique_index] == r.second);
1225
		g_part2_seed_to_unique_index[r.second] = unique_index;
1226

1227
		num_unique_partitions2++;
1228
	}
1229
	assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);
1230
#else
1231
	for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)
1232
	{
1233
		const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];
1234
		assert(seed_index < 1024);
1235

1236
		assert(g_part2_seed_to_unique_index[seed_index] == 0);
1237
		g_part2_seed_to_unique_index[seed_index] = unique_index;
1238

1239
		partition_pattern_vec& pat_vec = g_partitions2[unique_index];
1240

1241
		for (uint32_t y = 0; y < 6; y++)
1242
		{
1243
			for (uint32_t x = 0; x < 6; x++)
1244
			{
1245
				uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);
1246
				assert(p < 2);
1247

1248
				pat_vec[x + y * 6] = p;
1249
			}
1250
		}
1251
	}
1252
#endif
1253

1254
	g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);
1255
}
1256

1257
static bool estimate_partition2_6x6(
1258
	const basist::half_float pBlock_pixels_half[][3],
1259
	int* pBest_parts, uint32_t num_best_parts)
1260
{
1261
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;
1262
		
1263
	vec3F training_vecs[BLOCK_T], mean(0.0f);
1264

1265
	for (uint32_t i = 0; i < BLOCK_T; i++)
1266
	{
1267
		vec3F& v = training_vecs[i];
1268

1269
		v[0] = (float)pBlock_pixels_half[i][0];
1270
		v[1] = (float)pBlock_pixels_half[i][1];
1271
		v[2] = (float)pBlock_pixels_half[i][2];
1272

1273
		mean += v;
1274
	}
1275
	mean *= (1.0f / (float)BLOCK_T);
1276

1277
	vec3F max_vals(-BIG_FLOAT_VAL);
1278

1279
	for (uint32_t i = 0; i < BLOCK_T; i++)
1280
	{
1281
		vec3F& v = training_vecs[i];
1282
		max_vals = vec3F::component_max(max_vals, v);
1283
	}
1284

1285
	// Initialize principle axis approximation
1286
	vec3F axis(max_vals - mean);
1287

1288
	// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).
1289
	for (uint32_t i = 0; i < BLOCK_T; i++)
1290
	{
1291
		axis = vec3F_norm_approx(axis);
1292

1293
		vec3F color(training_vecs[i] - mean);
1294

1295
		float d = color.dot(axis);
1296

1297
		axis += color * d;
1298
	}
1299

1300
	if (axis.norm() < SMALL_FLOAT_VAL)
1301
		axis.set(0.57735027f);
1302
	else
1303
		axis.normalize_in_place();
1304

1305
#if BRUTE_FORCE_PART_SEARCH
1306
	int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]
1307
	for (uint32_t i = 0; i < BLOCK_T; i++)
1308
	{
1309
		float proj = (training_vecs[i] - mean).dot(axis);
1310

1311
		desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;
1312
	}
1313
#else
1314
	partition_pattern_vec desired_part;
1315

1316
	for (uint32_t i = 0; i < BLOCK_T; i++)
1317
	{
1318
		float proj = (training_vecs[i] - mean).dot(axis);
1319

1320
		desired_part.m_parts[i] = proj < 0.0f;
1321
	}
1322
#endif
1323
	
1324
	//interval_timer tm;
1325
	//tm.start();
1326
	
1327
#if BRUTE_FORCE_PART_SEARCH
1328
	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];
1329

1330
	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)
1331
	{
1332
		const partition_pattern_vec &pat_vec = g_partitions2[part_index];
1333

1334
		int total_sim_non_inv = 0;
1335
		int total_sim_inv = 0;
1336

1337
		for (uint32_t y = 0; y < BLOCK_H; y++)
1338
		{
1339
			for (uint32_t x = 0; x < BLOCK_W; x++)
1340
			{
1341
				int part = pat_vec[x + y * 6];
1342

1343
				if (part == desired_parts[y][x])
1344
					total_sim_non_inv++;
1345

1346
				if ((part ^ 1) == desired_parts[y][x])
1347
					total_sim_inv++;
1348
			}
1349
		}
1350

1351
		int total_sim = maximum(total_sim_non_inv, total_sim_inv);
1352

1353
		part_similarity[part_index] = (total_sim << 16) | part_index;
1354

1355
	} // part_index;
1356

1357
	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);
1358

1359
	for (uint32_t i = 0; i < num_best_parts; i++)
1360
		pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;
1361
#else
1362
	vp_tree::result_queue results;
1363
	results.reserve(num_best_parts);
1364
	g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);
1365

1366
	assert(results.get_size() == num_best_parts);
1367

1368
	const auto& elements = results.get_elements();
1369

1370
	for (uint32_t i = 0; i < results.get_size(); i++)
1371
		pBest_parts[i] = elements[1 + i].m_pat_index;
1372
#endif
1373

1374
	//fmt_printf("{} ", tm.get_elapsed_ms());
1375

1376
	return true;
1377
}
1378

1379
// Lowest comp_level at which the 2- and 3-subset encoders below refine endpoints after weight grid downsampling. At 0 the unsigned ">=" test is always true.
const uint32_t MIN_REFINE_LEVEL = 0;
1380

1381
static bool encode_block_2_subsets(
1382
	trial_result res[2],
1383
	uint32_t grid_w, uint32_t grid_h,
1384
	uint32_t cem,
1385
	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
1386
	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
1387
	astc_hdr_codec_base_options& coptions,
1388
	bool uber_mode_flag,
1389
	int unique_pat_index,
1390
	uint32_t comp_level,
1391
	opt_mode_t mode11_opt_mode,
1392
	bool refine_endpoints_flag)
1393
{
1394
	const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
1395

1396
	res[0].m_valid = false;
1397
	res[1].m_valid = false;
1398

1399
	const uint32_t BLOCK_W = 6, BLOCK_H = 6;
1400

1401
	astc_helpers::log_astc_block best_log_blk;
1402
	clear_obj(best_log_blk);
1403

1404
	best_log_blk.m_num_partitions = 2;
1405
	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
1406
	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
1407
	best_log_blk.m_grid_width = (uint8_t)grid_w;
1408
	best_log_blk.m_grid_height = (uint8_t)grid_h;
1409

1410
	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
1411
	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
1412

1413
	partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];
1414
	const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];
1415

1416
	vec4F part_pixels_q16[2][64];
1417
	half_vec3 part_half_pixels[2][64];
1418
	uint8_t part_pixel_index[2][64];
1419
	uint32_t part_total_pixels[2] = { 0 };
1420

1421
	for (uint32_t y = 0; y < BLOCK_H; y++)
1422
	{
1423
		for (uint32_t x = 0; x < BLOCK_W; x++)
1424
		{
1425
			uint32_t part_index = (*pPat)[x + y * BLOCK_W];
1426

1427
			uint32_t l = part_total_pixels[part_index];
1428

1429
			part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
1430
			part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
1431
			part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
1432

1433
			part_total_pixels[part_index] = l + 1;
1434
		} // x 
1435
	} // y
1436

1437
	uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];
1438
	uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
1439
	uint32_t best_submode[2];
1440

1441
	for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
1442
	{
1443
		assert(part_total_pixels[part_iter]);
1444

1445
		double e;
1446
		if (cem == 7)
1447
		{
1448
			e = encode_astc_hdr_block_mode_7(
1449
				part_total_pixels[part_iter],
1450
				(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1451
				best_log_blk.m_weight_ise_range,
1452
				best_submode[part_iter],
1453
				BIG_FLOAT_VAL,
1454
				blk_endpoints[part_iter],
1455
				blk_weights[part_iter],
1456
				coptions,
1457
				best_log_blk.m_endpoint_ise_range);
1458
		}
1459
		else
1460
		{
1461
			assert(cem == 11);
1462

1463
			e = encode_astc_hdr_block_mode_11(
1464
				part_total_pixels[part_iter],
1465
				(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1466
				best_log_blk.m_weight_ise_range,
1467
				best_submode[part_iter],
1468
				BIG_FLOAT_VAL,
1469
				blk_endpoints[part_iter],
1470
				blk_weights[part_iter],
1471
				coptions,
1472
				false,
1473
				best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
1474
				mode11_opt_mode);
1475
		}
1476

1477
		if (e == BIG_FLOAT_VAL)
1478
			return false;
1479

1480
	} // part_iter
1481

1482
	uint8_t ise_weights[BLOCK_W * BLOCK_H];
1483

1484
	uint32_t src_pixel_index[2] = { 0, 0 };
1485
	for (uint32_t y = 0; y < BLOCK_H; y++)
1486
	{
1487
		for (uint32_t x = 0; x < BLOCK_W; x++)
1488
		{
1489
			uint32_t part_index = (*pPat)[x + y * BLOCK_W];
1490
			ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
1491
			src_pixel_index[part_index]++;
1492
		} // x
1493
	} // y
1494

1495
	if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
1496
	{
1497
		best_log_blk.m_partition_id = (uint16_t)p_seed;
1498

1499
		memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
1500
		memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
1501
		memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
1502

1503
		res[0].m_valid = true;
1504
		res[0].m_log_blk = best_log_blk;
1505
	}
1506
	else
1507
	{
1508
		uint8_t desired_weights[BLOCK_H * BLOCK_W];
1509

1510
		const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
1511

1512
		for (uint32_t by = 0; by < BLOCK_H; by++)
1513
			for (uint32_t bx = 0; bx < BLOCK_W; bx++)
1514
				desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
1515

1516
		uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
1517

1518
		const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
1519
		if (!pDownsample_matrix)
1520
		{
1521
			assert(0);
1522
			return false;
1523
		}
1524

1525
		downsample_weight_grid(
1526
			pDownsample_matrix,
1527
			BLOCK_W, BLOCK_H,		// source/from dimension (block size)
1528
			grid_w, grid_h,			// dest/to dimension (grid size)
1529
			desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
1530
			downsampled_weights);	// [wy][wx]
1531
				
1532
		best_log_blk.m_partition_id = (uint16_t)p_seed;
1533
		memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
1534
		memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
1535

1536
		const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
1537

1538
		for (uint32_t gy = 0; gy < grid_h; gy++)
1539
			for (uint32_t gx = 0; gx < grid_w; gx++)
1540
				best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
1541

1542
		res[0].m_valid = true;
1543
		res[0].m_log_blk = best_log_blk;
1544

1545
		if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
1546
		{
1547
			bool any_refined = false;
1548

1549
			for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
1550
			{
1551
				bool refine_status = refine_endpoints(
1552
					cem,
1553
					endpoints_ise_range,
1554
					best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
1555
					BLOCK_W, BLOCK_H, // block dimensions
1556
					grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid
1557
					part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1558
					&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
1559
					coptions, mode11_opt_mode);
1560

1561
				if (refine_status)
1562
					any_refined = true;
1563
			}
1564

1565
			if (any_refined)
1566
			{
1567
				res[1].m_valid = true;
1568
				res[1].m_log_blk = best_log_blk;
1569
			}
1570
		}
1571
	}
1572

1573
	return true;
1574
}
1575

1576
// Hash map keyed by a partition pattern; init_partitions3_6x6() stores (seed index, unique index) pairs as the values.
typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;
1577

1578
// Per-texel subset assignments of every unique 3-subset 6x6 pattern, indexed by unique pattern index. Filled by init_partitions3_6x6().
partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];
1579
// Reverse lookup: partition seed index (0..1023) -> index into g_partitions3. init_partitions3_6x6() sets every entry to -1 before storing the seeds it keeps.
int g_part3_seed_to_unique_index[1024];
1580
// Search tree built over g_partitions3 by init_partitions3_6x6(); queried by estimate_partition3_6x6().
vp_tree g_part3_vp_tree;
1581

1582
static void init_partitions3_6x6()
1583
{
1584
	uint32_t t = 0;
1585

1586
	for (uint32_t i = 0; i < 1024; i++)
1587
		g_part3_seed_to_unique_index[i] = -1;
1588

1589
	partition3_hash_map part3_hash;
1590
	part3_hash.reserve(512);
1591
		
1592
	for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)
1593
	{
1594
		partition_pattern_vec p3;
1595
		uint32_t part_hist[3] = { 0 };
1596

1597
		for (uint32_t y = 0; y < 6; y++)
1598
		{
1599
			for (uint32_t x = 0; x < 6; x++)
1600
			{
1601
				uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);
1602
				assert(p < 3);
1603

1604
				p3.m_parts[x + y * 6] = (uint8_t)p;
1605
				part_hist[p]++;
1606
			}
1607
		}
1608

1609
		if (!part_hist[0] || !part_hist[1] || !part_hist[2])
1610
			continue;
1611

1612
		uint32_t j;
1613
		for (j = 0; j < NUM_PART3_MAPPINGS; j++)
1614
		{
1615
			partition_pattern_vec temp_part3(p3.get_permuted3(j));
1616

1617
			if (part3_hash.contains(temp_part3))
1618
				break;
1619
		}
1620
		if (j < NUM_PART3_MAPPINGS)
1621
			continue;
1622

1623
		part3_hash.insert(p3, std::make_pair(seed_index, t) );
1624

1625
		assert(g_part3_unique_index_to_seed[t] == seed_index);
1626
		g_part3_seed_to_unique_index[seed_index] = t;
1627
		g_partitions3[t] = p3;
1628

1629
		t++;
1630
	}
1631

1632
	g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);
1633
}
1634

1635
static bool estimate_partition3_6x6(
1636
	const basist::half_float pBlock_pixels_half[][3],
1637
	int* pBest_parts, uint32_t num_best_parts)
1638
{
1639
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;
1640

1641
	assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));
1642

1643
	vec3F training_vecs[BLOCK_T], mean(0.0f);
1644

1645
	float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;
1646
	vec3F cluster_centroids[NUM_SUBSETS];
1647

1648
	for (uint32_t i = 0; i < BLOCK_T; i++)
1649
	{
1650
		vec3F& v = training_vecs[i];
1651

1652
		v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);
1653

1654
		float inten = v.dot(vec3F(1.0f));
1655
		if (inten < darkest_inten)
1656
		{
1657
			darkest_inten = inten;
1658
			cluster_centroids[0] = v;
1659
		}
1660

1661
		if (inten > brightest_inten)
1662
		{
1663
			brightest_inten = inten;
1664
			cluster_centroids[1] = v;
1665
		}
1666
	}
1667

1668
	if (cluster_centroids[0] == cluster_centroids[1])
1669
		return false;
1670

1671
	float furthest_dist2 = 0.0f;
1672
	for (uint32_t i = 0; i < BLOCK_T; i++)
1673
	{
1674
		vec3F& v = training_vecs[i];
1675

1676
		float dist_a = v.squared_distance(cluster_centroids[0]);
1677
		if (dist_a == 0.0f)
1678
			continue;
1679

1680
		float dist_b = v.squared_distance(cluster_centroids[1]);
1681
		if (dist_b == 0.0f)
1682
			continue;
1683

1684
		float dist2 = dist_a + dist_b;
1685
		if (dist2 > furthest_dist2)
1686
		{
1687
			furthest_dist2 = dist2;
1688
			cluster_centroids[2] = v;
1689
		}
1690
	}
1691

1692
	if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
1693
		return false;
1694
		
1695
	uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];
1696
	uint32_t num_cluster_pixels[NUM_SUBSETS];
1697
	vec3F new_cluster_means[NUM_SUBSETS];
1698

1699
	const uint32_t NUM_ITERS = 4;
1700
	
1701
	for (uint32_t s = 0; s < NUM_ITERS; s++)
1702
	{
1703
		memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
1704
		memset(new_cluster_means, 0, sizeof(new_cluster_means));
1705

1706
		for (uint32_t i = 0; i < BLOCK_T; i++)
1707
		{
1708
			float d[NUM_SUBSETS] = { 
1709
				training_vecs[i].squared_distance(cluster_centroids[0]), 
1710
				training_vecs[i].squared_distance(cluster_centroids[1]), 
1711
				training_vecs[i].squared_distance(cluster_centroids[2]) };
1712

1713
			float min_d = d[0];
1714
			uint32_t min_idx = 0;
1715
			for (uint32_t j = 1; j < NUM_SUBSETS; j++)
1716
			{
1717
				if (d[j] < min_d)
1718
				{
1719
					min_d = d[j];
1720
					min_idx = j;
1721
				}
1722
			}
1723

1724
			cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
1725
			new_cluster_means[min_idx] += training_vecs[i];
1726
			num_cluster_pixels[min_idx]++;
1727
		} // i
1728

1729
		for (uint32_t j = 0; j < NUM_SUBSETS; j++)
1730
		{
1731
			if (!num_cluster_pixels[j])
1732
				return false;
1733

1734
			cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
1735
		}
1736
	} // s
1737
		
1738
	partition_pattern_vec desired_part;
1739
	for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1740
	{
1741
		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
1742
		{
1743
			const uint32_t pix_index = cluster_pixels[p][i];
1744
			desired_part[pix_index] = (uint8_t)p;
1745
		}
1746
	}
1747

1748
#if BRUTE_FORCE_PART_SEARCH
1749
	partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];
1750
	for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)
1751
		desired_parts[j] = desired_part.get_permuted3(j);
1752

1753
	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];
1754

1755
	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)
1756
	{
1757
		const partition_pattern_vec& pat = g_partitions3[part_index];
1758

1759
		uint32_t lowest_pat_dist = UINT32_MAX;
1760
		for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)
1761
		{
1762
			uint32_t dist = pat.get_squared_distance(desired_parts[p]);
1763
			if (dist < lowest_pat_dist)
1764
				lowest_pat_dist = dist;
1765
		}
1766

1767
		part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;
1768

1769
	} // part_index;
1770

1771
	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);
1772
		
1773
	for (uint32_t i = 0; i < num_best_parts; i++)
1774
		pBest_parts[i] = part_similarity[i] & 0xFFFF;
1775
#else
1776
	vp_tree::result_queue results;
1777
	results.reserve(num_best_parts);
1778
	g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);
1779

1780
	assert(results.get_size() == num_best_parts);
1781

1782
	const auto& elements = results.get_elements();
1783

1784
	for (uint32_t i = 0; i < results.get_size(); i++)
1785
		pBest_parts[i] = elements[1 + i].m_pat_index;
1786
#endif
1787

1788
	return true;
1789
}
1790

1791
static bool encode_block_3_subsets(
1792
	trial_result& res,
1793
	uint32_t cem,
1794
	uint32_t grid_w, uint32_t grid_h,
1795
	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
1796
	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
1797
	astc_hdr_codec_base_options& coptions,
1798
	bool uber_mode_flag,
1799
	const int* pEst_patterns, int num_est_patterns,
1800
	uint32_t comp_level, 
1801
	opt_mode_t mode11_opt_mode)
1802
{
1803
	BASISU_NOTE_UNUSED(uber_mode_flag);
1804
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;
1805
	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);
1806
		
1807
	res.m_valid = false;
1808
		
1809
	double best_e = BIG_FLOAT_VAL;
1810

1811
	astc_helpers::log_astc_block best_log_blk;
1812
	clear_obj(best_log_blk);
1813

1814
	best_log_blk.m_num_partitions = NUM_SUBSETS;
1815
	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
1816
	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
1817
	best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;
1818
	best_log_blk.m_grid_width = (uint8_t)grid_w;
1819
	best_log_blk.m_grid_height = (uint8_t)grid_h;
1820

1821
	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
1822
	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
1823

1824
	const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;
1825

1826
	for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)
1827
	{
1828
		const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;
1829
		assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);
1830
		const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];
1831

1832
		vec4F part_pixels_q16[NUM_SUBSETS][64];
1833
		half_vec3 part_half_pixels[NUM_SUBSETS][64];
1834
		uint8_t part_pixel_index[NUM_SUBSETS][64];
1835
		uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };
1836

1837
		for (uint32_t y = 0; y < BLOCK_H; y++)
1838
		{
1839
			for (uint32_t x = 0; x < BLOCK_W; x++)
1840
			{
1841
				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
1842

1843
				uint32_t l = part_total_pixels[part_index];
1844

1845
				part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
1846
				part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
1847
				part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
1848

1849
				part_total_pixels[part_index] = l + 1;
1850
			} // x 
1851
		} // y
1852

1853
		uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];
1854
		uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];
1855
		uint32_t best_submode[NUM_SUBSETS];
1856

1857
		double e = 0.0f;
1858
		for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
1859
		{
1860
			assert(part_total_pixels[part_iter]);
1861

1862
			if (cem == 7)
1863
			{
1864
				e += encode_astc_hdr_block_mode_7(
1865
					part_total_pixels[part_iter],
1866
					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1867
					best_log_blk.m_weight_ise_range,
1868
					best_submode[part_iter],
1869
					BIG_FLOAT_VAL,
1870
					blk_endpoints[part_iter],
1871
					blk_weights[part_iter],
1872
					coptions,
1873
					best_log_blk.m_endpoint_ise_range);
1874
			}
1875
			else
1876
			{
1877
				assert(cem == 11);
1878

1879
				e += encode_astc_hdr_block_mode_11(
1880
					part_total_pixels[part_iter],
1881
					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1882
					best_log_blk.m_weight_ise_range,
1883
					best_submode[part_iter],
1884
					BIG_FLOAT_VAL,
1885
					blk_endpoints[part_iter],
1886
					blk_weights[part_iter],
1887
					coptions,
1888
					false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, 
1889
					FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);
1890
			}
1891

1892
		} // part_iter
1893

1894
		uint8_t ise_weights[BLOCK_W * BLOCK_H];
1895

1896
		uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };
1897
		for (uint32_t y = 0; y < BLOCK_H; y++)
1898
		{
1899
			for (uint32_t x = 0; x < BLOCK_W; x++)
1900
			{
1901
				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
1902

1903
				ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
1904
				src_pixel_index[part_index]++;
1905
			} // x
1906
		} // y
1907

1908
		if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
1909
		{
1910
			if (e < best_e)
1911
			{
1912
				best_e = e;
1913
				best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
1914

1915
				for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1916
					memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
1917
				
1918
				memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
1919
			}
1920
		}
1921
		else
1922
		{
1923
			uint8_t desired_weights[BLOCK_H * BLOCK_W];
1924

1925
			const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
1926

1927
			for (uint32_t by = 0; by < BLOCK_H; by++)
1928
				for (uint32_t bx = 0; bx < BLOCK_W; bx++)
1929
					desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
1930

1931
			uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
1932

1933
			const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
1934
			if (!pDownsample_matrix)
1935
			{
1936
				assert(0);
1937
				return false;
1938
			}
1939

1940
			downsample_weight_grid(
1941
				pDownsample_matrix,
1942
				BLOCK_W, BLOCK_H,		// source/from dimension (block size)
1943
				grid_w, grid_h,			// dest/to dimension (grid size)
1944
				desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
1945
				downsampled_weights);	// [wy][wx]
1946

1947
			astc_helpers::log_astc_block trial_blk(best_log_blk);
1948

1949
			trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
1950
			
1951
			for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1952
				memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
1953

1954
			const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
1955

1956
			for (uint32_t gy = 0; gy < grid_h; gy++)
1957
				for (uint32_t gx = 0; gx < grid_w; gx++)
1958
					trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
1959

1960
			if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
1961
			{
1962
				for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
1963
				{
1964
					bool refine_status = refine_endpoints(
1965
						cem,
1966
						endpoints_ise_range,
1967
						trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
1968
						BLOCK_W, BLOCK_H, // block dimensions
1969
						grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid
1970
						part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1971
						&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
1972
						coptions, mode11_opt_mode);
1973

1974
					BASISU_NOTE_UNUSED(refine_status);
1975
				}
1976
			}
1977

1978
			half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]
1979
			bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);
1980
			assert(status);
1981
			if (!status)
1982
				return false;
1983

1984
			half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];
1985
			for (uint32_t y = 0; y < BLOCK_H; y++)
1986
				for (uint32_t x = 0; x < BLOCK_W; x++)
1987
					decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);
1988

1989
			double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);
1990
			if (trial_err < best_e)
1991
			{
1992
				best_e = trial_err;
1993
				best_log_blk = trial_blk;
1994
			}
1995
		}
1996

1997
	} // unique_p_iter
1998

1999
	if (best_e < BIG_FLOAT_VAL)
2000
	{
2001
		res.m_log_blk = best_log_blk;
2002
		res.m_valid = true;
2003
		res.m_err = best_e;
2004
	}
2005
	else
2006
	{
2007
		res.m_valid = false;
2008
	}
2009

2010
	return res.m_valid;
2011
}
2012

2013
// Writes total_values ISE-range values to coder and returns the number of bits written.
//
// Each value is split into its low ep_bits plain bits and, for ranges with a trit or quint, the
// remaining trit/quint digit. Digits are packed base-3 (5 per 8-bit group) or base-5 (3 per 7-bit
// group). Output order: all full digit groups, then one shorter code for any leftover digits,
// then the plain bits of every value in order.
//
//  total_values   - 1..64 (asserted).
//  endpoint_range - row of astc_helpers::g_ise_range_table giving {bits, trits, quints}.
static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)
{
	const uint32_t MAX_VALS = 64;
	uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3];
	uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1;

	assert((total_values) && (total_values <= MAX_VALS));

	const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];
	const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];
	const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];

	// Digit base (0 = range has plain bits only) and the multiplier reached by a full group.
	const uint32_t tq_base = ep_trits ? 3 : (ep_quints ? 5 : 0);
	const uint32_t tq_group_full = ep_trits ? 243 : 125;

	for (uint32_t i = 0; i < total_values; i++)
	{
		const uint32_t val = pVals[i];

		bit_values[i] = val & ((1 << ep_bits) - 1);

		if (!tq_base)
			continue;

		const uint32_t tq = val >> ep_bits;
		assert(tq < tq_base);

		tq_accum += tq * tq_mul;
		tq_mul *= tq_base;

		if (tq_mul == tq_group_full)
		{
			assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));
			tq_values[total_tq_values++] = tq_accum;
			tq_accum = 0;
			tq_mul = 1;
		}
	}

	uint32_t total_bits_output = 0;

	// Full digit groups.
	const uint32_t full_group_bits = ep_trits ? 8 : 7;
	for (uint32_t i = 0; i < total_tq_values; i++)
	{
		coder.put_bits(tq_values[i], full_group_bits);
		total_bits_output += full_group_bits;
	}

	// Leftover digits that did not fill a group get a shorter code sized by how many there are.
	if (tq_mul > 1)
	{
		const uint32_t num_bits = ep_trits
			? ((tq_mul == 3) ? 2 : ((tq_mul == 9) ? 4 : ((tq_mul == 27) ? 5 : 7)))	// 1..4 trits
			: ((tq_mul == 5) ? 3 : 5);												// 1..2 quints

		coder.put_bits(tq_accum, num_bits);
		total_bits_output += num_bits;
	}

	// Plain bits of every value.
	for (uint32_t i = 0; i < total_values; i++)
	{
		coder.put_bits(bit_values[i], ep_bits);
		total_bits_output += ep_bits;
	}

	return total_bits_output;
}
2104

2105
static inline uint32_t get_num_endpoint_vals(uint32_t cem)
2106
{
2107
	assert((cem == 7) || (cem == 11));
2108
	return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
2109
}
2110

2111
static void code_block(bitwise_coder& coder,
2112
	const astc_helpers::log_astc_block& log_blk,
2113
	block_mode block_mode_index,
2114
	endpoint_mode em, const uint8_t *pEP_deltas)
2115
{
2116
	coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);
2117
	coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);
2118

2119
	const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);
2120

2121
	if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))
2122
	{
2123
		assert(log_blk.m_num_partitions == 1);
2124

2125
		for (uint32_t i = 0; i < num_endpoint_vals; i++)
2126
			coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);
2127
	}
2128
	else if (em == endpoint_mode::cRaw)
2129
	{
2130
		if (log_blk.m_num_partitions == 2)
2131
		{
2132
			const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];
2133
			assert(unique_partition_index != -1);
2134
			
2135
			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);
2136
		}
2137
		else if (log_blk.m_num_partitions == 3)
2138
		{
2139
			const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];
2140
			assert(unique_partition_index != -1);
2141

2142
			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);
2143
		}
2144
		
2145
		encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);
2146
	}
2147

2148
	encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range);
2149
}
2150

2151
// Tuning parameters consumed by create_smooth_maps2().
// For each of the three analysis regions there is a std-dev ceiling (local std devs at or above it
// count as "not smooth") and the MSE scale applied when the region is perfectly smooth.
struct smooth_map_params
{
	// When true, create_smooth_maps2() fills the scale map with 1.0 and performs no analysis.
	bool m_no_mse_scaling;

	// Small region (labelled 3x3).
	float m_max_smooth_std_dev;
	float m_smooth_max_mse_scale;

	// Medium region (labelled 7x7).
	float m_max_med_smooth_std_dev;
	float m_med_smooth_max_mse_scale;

	// Large region (labelled 11x11).
	float m_max_ultra_smooth_std_dev;
	float m_ultra_smooth_max_mse_scale;

	// When true, create_smooth_maps2() builds visualization images and saves them as PNG files.
	bool m_debug_images;

	smooth_map_params()
	{
		clear();
	}

	// Restores every field to its default value.
	void clear()
	{
		m_no_mse_scaling = false;
		m_debug_images = true;

		// 3x3 region
		m_max_smooth_std_dev = 100.0f;
		m_smooth_max_mse_scale = 13000.0f;

		// 7x7 region
		m_max_med_smooth_std_dev = 9.0f;
		m_med_smooth_max_mse_scale = 15000.0f;

		// 11x11 region
		m_max_ultra_smooth_std_dev = 4.0f;
		// Values tried previously for the ultra-smooth scale: 4500, 10000, 50000, 100000, 400000, 800000.
		m_ultra_smooth_max_mse_scale = 2000000.0f;
	}
};
2196

2197
// Resampler contribution lists indexed by grid dimension. init_contrib_lists() assigns entries 1 through 6;
// entry 0 is never assigned there. The filter_block() overloads index this table with grid_x/grid_y when the
// dimension is not 6.
Resampler::Contrib_List* g_contrib_lists[7]; // 1-6
2198

2199
static void init_contrib_lists()
2200
{
2201
	for (uint32_t dst_width = 1; dst_width <= 6; dst_width++)
2202
		//g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
2203
		g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
2204
}
2205

2206
#if 0
// Disabled variant of filter_block(): filters a 6x6 vec3F block (rows, then columns) and writes the result
// both as half floats (pDst_block_half3) and as qlog16 values stored in floats (pDst_block_q16, component 3
// set to 0). Kept for reference only; it is compiled out.
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)
{
	vec3F row_filtered[6][6]; // [y][x]

	// Horizontal pass into row_filtered (a plain copy when the grid is full width).
	if (grid_x != 6)
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				const Resampler::Contrib_List& taps = pRow_lists[x];

				vec3F accum(0.0f);
				for (uint32_t t = 0; t < taps.n; t++)
					accum += pSrc_block[y * 6 + taps.p[t].pixel] * taps.p[t].weight;

				accum.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				row_filtered[y][x] = accum;
			} // x
		} // y
	}
	else
	{
		memcpy(row_filtered, pSrc_block, sizeof(vec3F) * 6 * 6);
	}

	// Vertical pass, followed by the half/qlog16 conversion of each texel.
	Resampler::Contrib_List* pCol_lists = (grid_y != 6) ? g_contrib_lists[grid_y] : nullptr;

	for (uint32_t y = 0; y < 6; y++)
	{
		for (uint32_t x = 0; x < 6; x++)
		{
			vec3F texel(row_filtered[y][x]);

			if (pCol_lists)
			{
				const Resampler::Contrib_List& taps = pCol_lists[y];

				texel.set(0.0f);
				for (uint32_t t = 0; t < taps.n; t++)
					texel += row_filtered[taps.p[t].pixel][x] * taps.p[t].weight;

				texel.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
			}

			const uint32_t dst_index = x + y * 6;

			for (uint32_t c = 0; c < 3; c++)
			{
				const basist::half_float h = basist::float_to_half(texel[c]);

				pDst_block_half3[dst_index][c] = h;
				pDst_block_q16[dst_index][c] = (float)half_to_qlog16(h);
			}

			pDst_block_q16[dst_index][3] = 0.0f;
		} // x
	} // y
}
#endif
2285

2286
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)
2287
{
2288
	vec4F temp_block[6][6]; // [y][x]
2289

2290
	// first filter rows to temp_block
2291
	if (grid_x == 6)
2292
	{
2293
		memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);
2294
	}
2295
	else
2296
	{
2297
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
2298

2299
		for (uint32_t y = 0; y < 6; y++)
2300
		{
2301
			for (uint32_t x = 0; x < 6; x++)
2302
			{
2303
				vec3F p(0.0f);
2304

2305
				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
2306
					p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
2307

2308
				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
2309

2310
				temp_block[y][x] = p;
2311
			} // x
2312
		} // y
2313
	}
2314

2315
	// filter columns
2316
	if (grid_y == 6)
2317
	{
2318
		for (uint32_t y = 0; y < 6; y++)
2319
		{
2320
			for (uint32_t x = 0; x < 6; x++)
2321
			{
2322
				for (uint32_t c = 0; c < 3; c++)
2323
					pDst_block[x + y * 6][c] = temp_block[y][x][c];
2324
			} // x
2325
		} // y
2326
	}
2327
	else
2328
	{
2329
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
2330

2331
		for (uint32_t x = 0; x < 6; x++)
2332
		{
2333
			for (uint32_t y = 0; y < 6; y++)
2334
			{
2335
				vec3F p(0.0f);
2336

2337
				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
2338
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
2339

2340
				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
2341

2342
				pDst_block[x + y * 6] = p;
2343

2344
			} // x
2345
		} // y
2346
	}
2347
}
2348

2349
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)
2350
{
2351
	vec3F temp_block[6][6]; // [y][x]
2352

2353
	// first filter rows to temp_block
2354
	if (grid_x == 6)
2355
	{
2356
		memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
2357
	}
2358
	else
2359
	{
2360
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
2361

2362
		for (uint32_t y = 0; y < 6; y++)
2363
		{
2364
			for (uint32_t x = 0; x < 6; x++)
2365
			{
2366
				vec3F p(0.0f);
2367

2368
				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
2369
					p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
2370
								
2371
				temp_block[y][x] = p;
2372
			} // x
2373
		} // y
2374
	}
2375

2376
	// filter columns
2377
	if (grid_y == 6)
2378
	{
2379
		memcpy(pDst_block, temp_block, sizeof(vec3F) * 6 * 6);
2380
	}
2381
	else
2382
	{
2383
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
2384

2385
		for (uint32_t x = 0; x < 6; x++)
2386
		{
2387
			for (uint32_t y = 0; y < 6; y++)
2388
			{
2389
				vec3F& p = pDst_block[x + y * 6];
2390
				p.set(0.0f);
2391

2392
				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
2393
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
2394
			} // x
2395
		} // y
2396
	}
2397
}
2398

2399
static float diff_blocks(const vec4F* pA, const vec4F* pB)
2400
{
2401
	const uint32_t BLOCK_T = 36;
2402

2403
	float diff = 0.0f;
2404
	for (uint32_t i = 0; i < BLOCK_T; i++)
2405
		diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);
2406
	
2407
	return diff * (1.0f / (float)BLOCK_T);
2408
}
2409

2410
static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)
2411
{
2412
	const uint32_t BLOCK_T = 36;
2413

2414
	vec3F mean(0.0f);
2415

2416
	for (uint32_t i = 0; i < BLOCK_T; i++)
2417
	{
2418
		vec3F diff(pA[i] - pB[i]);
2419
		mean += diff;
2420
	}
2421

2422
	mean *= (1.0f / (float)BLOCK_T);
2423

2424
	vec3F diff_sum(0.0f);
2425
	for (uint32_t i = 0; i < BLOCK_T; i++)
2426
	{
2427
		vec3F diff(pA[i] - pB[i]);
2428
		diff -= mean;
2429
		diff_sum += vec3F::component_mul(diff, diff);
2430
	}
2431

2432
	vec3F var(diff_sum * (1.0f / (float)BLOCK_T));
2433

2434
	vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));
2435

2436
	return maximum(std_dev[0], std_dev[1], std_dev[2]);
2437
}
2438

2439
static void create_smooth_maps2(
2440
	vector2D<float>& smooth_block_mse_scales,
2441
	const image& orig_img,
2442
	smooth_map_params& params, image* pUltra_smooth_img = nullptr)
2443
{
2444
	const uint32_t width = orig_img.get_width();
2445
	const uint32_t height = orig_img.get_height();
2446
	//const uint32_t total_pixels = orig_img.get_total_pixels();
2447
	const uint32_t num_comps = 3;
2448

2449
	if (params.m_no_mse_scaling)
2450
	{
2451
		smooth_block_mse_scales.set_all(1.0f);
2452
		return;
2453
	}
2454

2455
	smooth_block_mse_scales.resize(width, height);
2456

2457
	image smooth_vis, med_smooth_vis, ultra_smooth_vis;
2458

2459
	if (params.m_debug_images)
2460
	{
2461
		smooth_vis.resize(width, height);
2462
		med_smooth_vis.resize(width, height);
2463
		ultra_smooth_vis.resize(width, height);
2464
	}
2465

2466
	for (uint32_t y = 0; y < height; y++)
2467
	{
2468
		for (uint32_t x = 0; x < width; x++)
2469
		{
2470
			{
2471
				tracked_stat_dbl comp_stats[4];
2472
				for (int yd = -1; yd <= 1; yd++)
2473
				{
2474
					for (int xd = -1; xd <= 1; xd++)
2475
					{
2476
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2477

2478
						comp_stats[0].update((float)p[0]);
2479
						comp_stats[1].update((float)p[1]);
2480
						comp_stats[2].update((float)p[2]);
2481
					}
2482
				}
2483

2484
				float max_std_dev = 0.0f;
2485
				for (uint32_t i = 0; i < num_comps; i++)
2486
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2487

2488
				float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);
2489
				//yl = powf(yl, 2.0f);
2490
				yl = powf(yl, 1.0f / 2.0f); // substantially less bits
2491

2492
				smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);
2493

2494
				if (params.m_debug_images)
2495
				{
2496
					//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));
2497
					// white=high local activity (edges/detail)
2498
					// black=low local activity (smooth - error is amplified)
2499
					smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));
2500
				}
2501
			}
2502

2503
			{
2504
				tracked_stat_dbl comp_stats[4];
2505

2506
				const int S = 3;
2507
				for (int yd = -S; yd < S; yd++)
2508
				{
2509
					for (int xd = -S; xd < S; xd++)
2510
					{
2511
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2512

2513
						comp_stats[0].update((float)p[0]);
2514
						comp_stats[1].update((float)p[1]);
2515
						comp_stats[2].update((float)p[2]);
2516
					}
2517
				}
2518

2519
				float max_std_dev = 0.0f;
2520
				for (uint32_t i = 0; i < num_comps; i++)
2521
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2522

2523
				float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);
2524
				//yl = powf(yl, 2.0f);
2525

2526
				smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
2527

2528
				if (params.m_debug_images)
2529
					med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
2530
			}
2531

2532
			{
2533
				tracked_stat_dbl comp_stats[4];
2534

2535
				const int S = 5;
2536
				for (int yd = -S; yd < S; yd++)
2537
				{
2538
					for (int xd = -S; xd < S; xd++)
2539
					{
2540
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2541

2542
						comp_stats[0].update((float)p[0]);
2543
						comp_stats[1].update((float)p[1]);
2544
						comp_stats[2].update((float)p[2]);
2545
					}
2546
				}
2547

2548
				float max_std_dev = 0.0f;
2549
				for (uint32_t i = 0; i < num_comps; i++)
2550
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2551

2552
				float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);
2553
				yl = powf(yl, 2.0f);
2554
				
2555
				smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
2556

2557
				if (params.m_debug_images)
2558
					ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
2559
			}
2560

2561
		}
2562
	}
2563

2564
	if (params.m_debug_images)
2565
	{
2566
		save_png("dbg_smooth_vis.png", smooth_vis);
2567
		save_png("dbg_med_smooth_vis.png", med_smooth_vis);
2568
		save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);
2569

2570
		image vis_img(width, height);
2571

2572
		float max_scale = 0.0f;
2573
		for (uint32_t y = 0; y < height; y++)
2574
			for (uint32_t x = 0; x < width; x++)
2575
				max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));
2576

2577
		for (uint32_t y = 0; y < height; y++)
2578
			for (uint32_t x = 0; x < width; x++)
2579
				vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));
2580

2581
		save_png("scale_vis.png", vis_img);
2582
	}
2583

2584
	if (pUltra_smooth_img)
2585
		*pUltra_smooth_img = ultra_smooth_vis;
2586
}
2587

2588
// Dark-pixel error amplification used by compute_pixel_mse_itp() and compute_pixel_delta_itp().
// Original-pixel intensities (ITP component 0) below this threshold have their error scaled up;
// the scale blends smoothly from the values below (at intensity 0) down to 1 (at the threshold).
static const float REALLY_DARK_I_THRESHOLD = 0.0625f;
// Error multiplier at intensity 0 for the squared-error (MSE) metric.
static const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;
// Error multiplier at intensity 0 for the delta ITP (JND units) metric.
static const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;
2591

2592
static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)
2593
{
2594
	float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];
2595
	float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];
2596
	float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];
2597
		
2598
	float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);
2599

2600
	if (delta_itp_dark_adjustment)
2601
	{
2602
		// We have to process a large range of inputs, including extremely dark inputs. 
2603
		// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.
2604
		// This is to better handle very dark signals which could be explictly overexposed.
2605
		float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);
2606
		s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);
2607
		err *= s;
2608
	}
2609

2610
	return err;
2611
}
2612

2613
static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)
2614
{
2615
	float total_mse = 0.0f;
2616

2617
	for (uint32_t y = 0; y < block_h; y++)
2618
	{
2619
		for (uint32_t x = 0; x < block_w; x++)
2620
		{
2621
			total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment);
2622
		} // x
2623
	} // y
2624

2625
	return total_mse * (1.0f / (float)(block_w * block_h));
2626
}
2627

2628
static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)
2629
{
2630
	const uint32_t n = block_w * block_h;
2631
	assert(n <= 36);
2632

2633
	stats<float> x_stats[3], y_stats[3];
2634
	comparative_stats<float> xy_cov[3];
2635

2636
	for (uint32_t c = 0; c < 3; c++)
2637
	{
2638
		x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);
2639
		y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);
2640
	}
2641

2642
	for (uint32_t c = 0; c < 3; c++)
2643
		xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]);
2644

2645
	float ssim[3];
2646
	const double d = 1.0f, k1 = .01f, k2 = .03f;
2647

2648
	// weight mean error more highly to reduce blocking
2649
	float ap = 1.5f, bp = 1.0f, cp = 1.0f;
2650

2651
	const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);
2652
	const double s_c3(s_c2 * .5f);
2653

2654
	for (uint32_t c = 0; c < 3; c++)
2655
	{
2656
		float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));
2657
		lum = saturate(lum);
2658

2659
		float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));
2660
		con = saturate(con);
2661

2662
		float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));
2663
		str = saturate(str);
2664

2665
		ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);
2666
	}
2667

2668
#if 0
2669
	float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);
2670
#elif 1
2671
	float final_ssim = ssim[0] * ssim[1] * ssim[2];
2672
#else
2673
	const float LP = .75f;
2674
	float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);
2675
#endif
2676

2677
	return final_ssim;
2678
}
2679

2680
// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light
2681
static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)
2682
{
2683
	float delta_i = a[0] - b[0];
2684
	float delta_t = a[1] - b[1];
2685
	float delta_p = a[2] - b[2];
2686

2687
	float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));
2688

2689
	float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);
2690
	
2691
	if (delta_itp_dark_adjustment)
2692
	{
2693
		// This is to better handle very dark signals which could be explictly overexposed.
2694
		s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);
2695
		err *= s;
2696
	}
2697

2698
	return err;
2699
}
2700

2701
struct candidate_encoding
2702
{
2703
	encoding_type m_encoding_type;
2704
		
2705
	basist::half_float m_solid_color[3];
2706

2707
	uint32_t m_run_len;
2708

2709
	vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
2710
	vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
2711
		
2712
	endpoint_mode m_endpoint_mode;
2713
	block_mode m_block_mode;
2714

2715
	bitwise_coder m_coder;
2716
		
2717
	// The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.
2718
	// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.
2719
	astc_helpers::log_astc_block m_coded_log_blk; 
2720

2721
	// The block the decoder outputs.
2722
	astc_helpers::log_astc_block m_decomp_log_blk;
2723

2724
	int m_reuse_delta_index;
2725

2726
	float m_t, m_d, m_bits;
2727
					
2728
	candidate_encoding()
2729
	{
2730
		clear();
2731
	}
2732

2733
	candidate_encoding(const candidate_encoding &other)
2734
	{
2735
		*this = other;
2736
	}
2737

2738
	candidate_encoding(candidate_encoding&& other)
2739
	{
2740
		*this = std::move(other);
2741
	}
2742

2743
	candidate_encoding& operator=(const candidate_encoding& rhs)
2744
	{
2745
		if (this == &rhs)
2746
			return *this;
2747

2748
		m_encoding_type = rhs.m_encoding_type;
2749
		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
2750
		m_run_len = rhs.m_run_len;
2751
		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
2752
		m_endpoint_mode = rhs.m_endpoint_mode;
2753
		m_block_mode = rhs.m_block_mode;
2754
		m_coder = rhs.m_coder;
2755
		m_coded_log_blk = rhs.m_coded_log_blk;
2756
		m_decomp_log_blk = rhs.m_decomp_log_blk;
2757
		m_reuse_delta_index = rhs.m_reuse_delta_index;
2758
		
2759
		return *this;
2760
	}
2761

2762
	candidate_encoding& operator=(candidate_encoding&& rhs)
2763
	{
2764
		if (this == &rhs)
2765
			return *this;
2766

2767
		m_encoding_type = rhs.m_encoding_type;
2768
		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
2769
		m_run_len = rhs.m_run_len;
2770
		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
2771
		m_endpoint_mode = rhs.m_endpoint_mode;
2772
		m_block_mode = rhs.m_block_mode;
2773
		m_coder = std::move(rhs.m_coder);
2774
		m_coded_log_blk = rhs.m_coded_log_blk;
2775
		m_decomp_log_blk = rhs.m_decomp_log_blk;
2776
		m_reuse_delta_index = rhs.m_reuse_delta_index;
2777

2778
		return *this;
2779
	}
2780

2781
	void clear()
2782
	{
2783
		m_encoding_type = encoding_type::cInvalid;
2784

2785
		clear_obj(m_solid_color);
2786

2787
		m_run_len = 0;
2788

2789
		clear_obj(m_comp_pixels);
2790
						
2791
		m_endpoint_mode = endpoint_mode::cInvalid;
2792
		m_block_mode = block_mode::cInvalid;
2793

2794
		m_coder.restart();
2795
		
2796
		m_coded_log_blk.clear();
2797
		m_decomp_log_blk.clear();
2798

2799
		m_t = 0;
2800
		m_d = 0;
2801
		m_bits = 0;
2802
		
2803
		m_reuse_delta_index = 0;
2804
	}
2805
};
2806

2807
bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)
2808
{
2809
	assert((block_w <= 6) && (block_h <= 6));
2810

2811
	half_vec4 decoded_pixels_half4[6 * 6]; // [y][x]
2812
	bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);
2813
	assert(status);
2814

2815
	if (!status)
2816
		return false;
2817

2818
	for (uint32_t y = 0; y < block_h; y++)
2819
	{
2820
		for (uint32_t x = 0; x < block_w; x++)
2821
		{
2822
			pPixels[x + y * block_w].set(
2823
				basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]),
2824
				basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]),
2825
				basist::half_to_float(decoded_pixels_half4[x + y * block_w][2]));
2826
		} // x 
2827
	} //y
2828

2829
	return true;
2830
}
2831

2832
// Returns true if the logical block can be packed into a physical ASTC block.
// The packed result is written to a scratch block and discarded.
static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)
{
	astc_helpers::astc_block scratch_phys_blk;

	const bool packable = astc_helpers::pack_astc_block(scratch_phys_blk, decomp_blk);
	return packable;
}
2837

2838
// Debug aid: when non-zero, decode_file() expects a 16-bit 0xDEAD marker in the stream before each coded
// element and fails if it is missing.
#define SYNC_MARKERS (0)
2839

2840
static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)
2841
{
2842
	interval_timer tm;
2843
	tm.start();
2844

2845
	const uint32_t BLOCK_W = 6, BLOCK_H = 6;
2846

2847
	width = 0;
2848
	height = 0;
2849

2850
	if (comp_data.size() <= 2*3)
2851
		return false;
2852

2853
	basist::bitwise_decoder decoder;
2854
	if (!decoder.init(comp_data.data(), comp_data.size_u32()))
2855
		return false;
2856

2857
	if (decoder.get_bits(16) != 0xABCD)
2858
		return false;
2859

2860
	width = decoder.get_bits(16);
2861
	height = decoder.get_bits(16);
2862
		
2863
	if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))
2864
		return false;
2865

2866
	const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W;
2867
	const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;
2868
	const uint32_t total_blocks = num_blocks_x * num_blocks_y;
2869

2870
	decoded_blocks.resize(num_blocks_x, num_blocks_y);
2871
	//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());
2872

2873
	vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);
2874
	//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());
2875

2876
	uint32_t cur_bx = 0, cur_by = 0;
2877
	uint32_t step_counter = 0;
2878
	BASISU_NOTE_UNUSED(step_counter);
2879
		
2880
	while (cur_by < num_blocks_y)
2881
	{
2882
		step_counter++;
2883
		
2884
		//if ((cur_bx == 9) && (cur_by == 13))
2885
		//	printf("!");
2886

2887
#if SYNC_MARKERS
2888
		uint32_t mk = decoder.get_bits(16);
2889
		if (mk != 0xDEAD)
2890
		{
2891
			printf("!");
2892
			assert(0);
2893
			return false;
2894
		}
2895
#endif
2896
		if (decoder.get_bits_remaining() < 1)
2897
			return false;
2898

2899
		encoding_type et = encoding_type::cBlock;
2900

2901
		uint32_t b0 = decoder.get_bits(1);
2902
		if (!b0)
2903
		{
2904
			uint32_t b1 = decoder.get_bits(1);
2905
			if (b1)
2906
				et = encoding_type::cReuse;
2907
			else
2908
			{
2909
				uint32_t b2 = decoder.get_bits(1);
2910
				if (b2)
2911
					et = encoding_type::cSolid;
2912
				else
2913
					et = encoding_type::cRun;
2914
			}
2915
		}
2916

2917
		switch (et)
2918
		{
2919
		case encoding_type::cRun:
2920
		{
2921
			if (!cur_bx && !cur_by)
2922
				return false;
2923

2924
			const uint32_t run_len = decoder.decode_vlc(5) + 1;
2925
			
2926
			uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);
2927
			if (run_len > num_blocks_remaining)
2928
				return false;
2929
						
2930
			uint32_t prev_bx = cur_bx, prev_by = cur_by;
2931

2932
			if (cur_bx)
2933
				prev_bx--;
2934
			else
2935
			{
2936
				prev_bx = num_blocks_x - 1;
2937
				prev_by--;
2938
			}
2939

2940
			const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
2941
			const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
2942

2943
			for (uint32_t i = 0; i < run_len; i++)
2944
			{
2945
				decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;
2946
				decoded_blocks(cur_bx, cur_by) = prev_phys_blk;
2947

2948
				cur_bx++;
2949
				if (cur_bx == num_blocks_x)
2950
				{
2951
					cur_bx = 0;
2952
					cur_by++;
2953
				}
2954
			}
2955

2956
			break;
2957
		}
2958
		case encoding_type::cSolid:
2959
		{
2960
			const basist::half_float rh = (basist::half_float)decoder.get_bits(15);
2961
			const basist::half_float gh = (basist::half_float)decoder.get_bits(15);
2962
			const basist::half_float bh = (basist::half_float)decoder.get_bits(15);
2963

2964
			astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
2965

2966
			log_blk.clear();
2967
			log_blk.m_solid_color_flag_hdr = true;
2968
			log_blk.m_solid_color[0] = rh;
2969
			log_blk.m_solid_color[1] = gh;
2970
			log_blk.m_solid_color[2] = bh;
2971
			log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
2972

2973
			bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);
2974
			if (!status)
2975
				return false;
2976

2977
			cur_bx++;
2978
			if (cur_bx == num_blocks_x)
2979
			{
2980
				cur_bx = 0;
2981
				cur_by++;
2982
			}
2983
			
2984
			break;
2985
		}
2986
		case encoding_type::cReuse:
2987
		{
2988
			if (!cur_bx && !cur_by)
2989
				return false;
2990

2991
			const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);
2992

2993
			const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
2994
			const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
2995

2996
			const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;
2997
			if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))
2998
				return false;
2999
			if (prev_by < 0)
3000
				return false;
3001
			
3002
			const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
3003
			const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
3004

3005
			if (prev_log_blk.m_solid_color_flag_hdr)
3006
				return false;
3007

3008
			astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3009
			astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3010
			
3011
			log_blk = prev_log_blk;
3012

3013
			const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1);
3014

3015
			bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);
3016
			if (!status)
3017
				return false;
3018

3019
			astc_helpers::log_astc_block decomp_blk;
3020
			status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);
3021
			if (!status)
3022
				return false;
3023
			
3024
			uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3025
			basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);
3026

3027
			copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk);
3028

3029
			status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3030
			if (!status)
3031
				return false;
3032

3033
			cur_bx++;
3034
			if (cur_bx == num_blocks_x)
3035
			{
3036
				cur_bx = 0;
3037
				cur_by++;
3038
			}
3039

3040
			break;
3041
		}
3042
		case encoding_type::cBlock:
3043
		{
3044
			const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);
3045
			const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);
3046

3047
			switch (em)
3048
			{
3049
			case endpoint_mode::cUseLeft:
3050
			case endpoint_mode::cUseUpper:
3051
			{
3052
				int neighbor_bx = cur_bx, neighbor_by = cur_by;
3053
				
3054
				if (em == endpoint_mode::cUseLeft)
3055
					neighbor_bx--;
3056
				else
3057
					neighbor_by--;
3058

3059
				if ((neighbor_bx < 0) || (neighbor_by < 0))
3060
					return false;
3061

3062
				const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
3063
				if (!neighbor_blk.m_color_endpoint_modes[0])
3064
					return false;
3065

3066
				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3067
				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3068

3069
				if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
3070
					return false;
3071

3072
				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3073
				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3074

3075
				log_blk.clear();
3076
				log_blk.m_num_partitions = 1;
3077
				log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3078
				log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;
3079
				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3080
				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3081
				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3082
				log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3083
				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3084

3085
				memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);
3086

3087
				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3088

3089
				bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3090
				if (!status)
3091
					return false;
3092

3093
				astc_helpers::log_astc_block decomp_blk;
3094
				decomp_blk.clear();
3095

3096
				decomp_blk.m_num_partitions = 1;
3097
				decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3098
				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3099
				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3100
				decomp_blk.m_dual_plane = bmd.m_dp;
3101
				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3102

3103
				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
3104

3105
				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3106
				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3107

3108
				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3109

3110
				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3111
				if (!status)
3112
					return false;
3113

3114
				cur_bx++;
3115
				if (cur_bx == num_blocks_x)
3116
				{
3117
					cur_bx = 0;
3118
					cur_by++;
3119
				}
3120

3121
				break;
3122
			}
3123
			case endpoint_mode::cUseLeftDelta:
3124
			case endpoint_mode::cUseUpperDelta:
3125
			{
3126
				int neighbor_bx = cur_bx, neighbor_by = cur_by;
3127

3128
				if (em == endpoint_mode::cUseLeftDelta)
3129
					neighbor_bx--;
3130
				else
3131
					neighbor_by--;
3132

3133
				if ((neighbor_bx < 0) || (neighbor_by < 0))
3134
					return false;
3135

3136
				const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
3137
				if (!neighbor_blk.m_color_endpoint_modes[0])
3138
					return false;
3139

3140
				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3141
				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3142

3143
				if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
3144
					return false;
3145

3146
				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3147
				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3148

3149
				log_blk.clear();
3150
				log_blk.m_num_partitions = 1;
3151
				log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3152
				log_blk.m_dual_plane = bmd.m_dp;
3153
				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3154
				
3155
				log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
3156
				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
3157

3158
				const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
3159
				const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
3160

3161
				const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;
3162
				const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;
3163
				const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);
3164

3165
				for (uint32_t i = 0; i < num_endpoint_values; i++)
3166
				{
3167
					int cur_val = ise_to_rank[log_blk.m_endpoints[i]];
3168
					
3169
					int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit;
3170

3171
					cur_val += delta;
3172
					if ((cur_val < 0) || (cur_val >= total_endpoint_levels))
3173
						return false;
3174

3175
					log_blk.m_endpoints[i] = rank_to_ise[cur_val];
3176
				}
3177

3178
				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3179
				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3180
				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3181

3182
				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3183

3184
				bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3185
				if (!status)
3186
					return false;
3187

3188
				astc_helpers::log_astc_block decomp_blk;
3189
				decomp_blk.clear();
3190

3191
				decomp_blk.m_num_partitions = 1;
3192
				decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3193
				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3194
				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3195
				decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3196
				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3197

3198
				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
3199

3200
				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3201
				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3202

3203
				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3204

3205
				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3206
				if (!status)
3207
					return false;
3208

3209
				cur_bx++;
3210
				if (cur_bx == num_blocks_x)
3211
				{
3212
					cur_bx = 0;
3213
					cur_by++;
3214
				}
3215

3216
				break;
3217
			}
3218
			case endpoint_mode::cRaw:
3219
			{
3220
				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3221

3222
				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3223

3224
				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3225
				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3226

3227
				log_blk.clear();
3228
				log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
3229
				
3230
				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3231
					log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
3232

3233
				log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
3234
				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3235

3236
				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3237
				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3238
				log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3239
				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3240

3241
				if (bmd.m_num_partitions == 2)
3242
				{
3243
					const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);
3244
					log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];
3245
				}
3246
				else if (bmd.m_num_partitions == 3)
3247
				{
3248
					const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);
3249
					log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];
3250
				}
3251
				
3252
				bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
3253
				if (!status)
3254
					return false;
3255

3256
				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3257

3258
				status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3259
				if (!status)
3260
					return false;
3261

3262
				astc_helpers::log_astc_block decomp_blk;
3263
				decomp_blk.clear();
3264
				
3265
				decomp_blk.m_dual_plane = bmd.m_dp;
3266
				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3267
				decomp_blk.m_partition_id = log_blk.m_partition_id;
3268
								
3269
				decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
3270
				
3271
				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3272
					decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
3273

3274
				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3275
				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3276

3277
				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3278
					basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);
3279

3280
				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3281
				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3282

3283
				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3284

3285
				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3286
				if (!status)
3287
					return false;
3288

3289
				cur_bx++;
3290
				if (cur_bx == num_blocks_x)
3291
				{
3292
					cur_bx = 0;
3293
					cur_by++;
3294
				}
3295

3296
				break;
3297
			}
3298
			default:
3299
			{
3300
				assert(0);
3301
				return false;
3302
			}
3303
			}
3304

3305
			break;
3306
		}
3307
		default:
3308
		{
3309
			assert(0);
3310
			return false;
3311
		}
3312
		}
3313
	}
3314

3315
	if (decoder.get_bits(16) != 0xA742)
3316
	{
3317
		fmt_error_printf("End marker not found!\n");
3318
		return false;
3319
	}
3320

3321
	//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());
3322

3323
	return true;
3324
}
3325

3326
static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
3327
{
3328
	astc_helpers::log_astc_block log_blk;
3329
	if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height))
3330
		return false;
3331
	
3332
	basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4];
3333
	if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16))
3334
		return false;
3335

3336
	const uint32_t total_block_pixels = block_width * block_height;
3337
	for (uint32_t p = 0; p < total_block_pixels; p++)
3338
	{
3339
		pPixels[p][0] = basist::half_to_float(half_block[p][0]);
3340
		pPixels[p][1] = basist::half_to_float(half_block[p][1]);
3341
		pPixels[p][2] = basist::half_to_float(half_block[p][2]);
3342
		pPixels[p][3] = basist::half_to_float(half_block[p][3]);
3343
	}
3344

3345
	return true;
3346
}
3347

3348
static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
3349
{
3350
	return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height);
3351
}
3352

3353
static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)
3354
{
3355
	const uint32_t width = src_img.get_width();
3356
	const uint32_t height = src_img.get_height();
3357
	
3358
	if (pPacked_bc6h_img)
3359
		pPacked_bc6h_img->resize(width, height);
3360

3361
	interval_timer tm;
3362
	double total_enc_time = 0.0f;
3363

3364
	const uint32_t num_blocks_x = src_img.get_block_width(4);
3365
	const uint32_t num_blocks_y = src_img.get_block_height(4);
3366

3367
	bc6h_blocks.resize(num_blocks_x, num_blocks_y);
3368
				
3369
	for (uint32_t by = 0; by < num_blocks_y; by++)
3370
	{
3371
		for (uint32_t bx = 0; bx < num_blocks_x; bx++)
3372
		{
3373
			// Extract source image block
3374
			vec4F block_pixels[4][4]; // [y][x]
3375
			src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4);
3376

3377
			basist::half_float half_pixels[16 * 3]; // [y][x]
3378

3379
			for (uint32_t y = 0; y < 4; y++)
3380
			{
3381
				for (uint32_t x = 0; x < 4; x++)
3382
				{
3383
					for (uint32_t c = 0; c < 3; c++)
3384
					{
3385
						float v = block_pixels[y][x][c];
3386

3387
						basist::half_float h = basist::float_to_half(v);
3388

3389
						half_pixels[(x + y * 4) * 3 + c] = h;
3390

3391
					} // c
3392

3393
				} // x
3394
			} // y
3395

3396
			basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by);
3397

3398
			tm.start();
3399

3400
			basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params);
3401

3402
			total_enc_time += tm.get_elapsed_secs();
3403

3404
			if (pPacked_bc6h_img)
3405
			{
3406
				basist::half_float unpacked_blk[16 * 3];
3407
				bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false);
3408
				assert(status);
3409
				if (!status)
3410
				{
3411
					fmt_error_printf("unpack_bc6h() failed\n");
3412
					return false;
3413
				}
3414
							
3415
				for (uint32_t y = 0; y < 4; y++)
3416
				{
3417
					for (uint32_t x = 0; x < 4; x++)
3418
					{
3419
						vec4F p;
3420

3421
						for (uint32_t c = 0; c < 3; c++)
3422
						{
3423
							float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]);
3424
							p[c] = v;
3425

3426
						} // c
3427

3428
						p[3] = 1.0f;
3429

3430
						pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p);
3431
					} // x
3432
				} // y
3433
			}
3434

3435
		} // bx
3436
	} // by
3437

3438
	//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);
3439

3440
	return true;
3441
}
3442

3443
static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)
3444
{
3445
	vec3F q(p - line_org);
3446
	vec3F v(q - q.dot(line_dir) * line_dir);
3447
	return v.dot(v);
3448
}
3449

3450
static void estimate_partitions_mode7_and_11(
3451
	uint32_t num_parts, // 2 or 3 partitions
3452
	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
3453
	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
3454
	const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats
3455
	const astc_hdr_codec_base_options& coptions, // options
3456
	uint32_t num_desired_pats, 
3457
	int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices
3458
{
3459
	BASISU_NOTE_UNUSED(coptions);
3460
	BASISU_NOTE_UNUSED(num_unique_pats);
3461

3462
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6
3463
	assert(num_parts <= MAX_PARTS);
3464

3465
	struct candidate_res
3466
	{
3467
		float m_total_sq_dist;
3468
		uint32_t m_index;
3469
		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
3470
	};
3471

3472
	const uint32_t MAX_CANDIDATES = 1024;
3473
	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
3474

3475
	candidate_res mode11_candidates[MAX_CANDIDATES];
3476
	candidate_res mode7_candidates[MAX_CANDIDATES];
3477

3478
	const vec3F grayscale_axis(0.5773502691f);
3479
	
3480
	for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
3481
	{
3482
		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
3483
		assert(unique_part_index < num_unique_pats);
3484

3485
		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
3486

3487
		vec3F part_means[MAX_PARTS];
3488
		uint32_t part_total_texels[MAX_PARTS] = { 0 };
3489

3490
		for (uint32_t i = 0; i < num_parts; i++)
3491
			part_means[i].clear();
3492

3493
		for (uint32_t y = 0; y < BLOCK_H; y++)
3494
		{
3495
			for (uint32_t x = 0; x < BLOCK_W; x++)
3496
			{
3497
				const uint32_t part_index = (*pPat)(x, y);
3498
				assert(part_index < num_parts);
3499

3500
				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
3501
				part_total_texels[part_index]++;
3502

3503
 			} // x
3504
		} // y
3505
		
3506
		for (uint32_t i = 0; i < num_parts; i++)
3507
		{
3508
			assert(part_total_texels[i]);
3509
			part_means[i] /= (float)part_total_texels[i];
3510
		}
3511

3512
		float part_cov[MAX_PARTS][6];
3513
		memset(part_cov, 0, sizeof(part_cov));
3514

3515
		for (uint32_t y = 0; y < BLOCK_H; y++)
3516
		{
3517
			for (uint32_t x = 0; x < BLOCK_W; x++)
3518
			{
3519
				const uint32_t part_index = (*pPat)(x, y);
3520
				assert(part_index < num_parts);
3521

3522
				const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);
3523

3524
				const float r = p[0], g = p[1], b = p[2];
3525

3526
				part_cov[part_index][0] += r * r;
3527
				part_cov[part_index][1] += r * g;
3528
				part_cov[part_index][2] += r * b;
3529
				part_cov[part_index][3] += g * g;
3530
				part_cov[part_index][4] += g * b;
3531
				part_cov[part_index][5] += b * b;
3532

3533
			} // x
3534
		} // y
3535

3536
		// For each partition compute the total variance of all channels.
3537
		float total_variance[MAX_PARTS];
3538
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3539
			total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];
3540

3541
		vec3F part_axis[MAX_PARTS];
3542
		float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
3543
		float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
3544

3545
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3546
		{
3547
			float* pCov = &part_cov[part_index][0];
3548

3549
			float xr = .9f, xg = 1.0f, xb = .7f;
3550
			
3551
			const uint32_t NUM_POWER_ITERS = 4;
3552
			for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)
3553
			{
3554
				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
3555
				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
3556
				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
3557

3558
				float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
3559

3560
				if (m >= 1e-10f)
3561
				{
3562
					m = 1.0f / m;
3563
					
3564
					r *= m;
3565
					g *= m;
3566
					b *= m;
3567
				}
3568

3569
				xr = r;
3570
				xg = g;
3571
				xb = b;
3572
			}
3573

3574
			float len_sq = xr * xr + xg * xg + xb * xb;
3575
						
3576
			if (len_sq < 1e-10f)
3577
			{
3578
				xr = grayscale_axis[0];
3579
				xg = grayscale_axis[0];
3580
				xb = grayscale_axis[0];
3581
			}
3582
			else
3583
			{
3584
				len_sq = 1.0f / sqrtf(len_sq);
3585

3586
				xr *= len_sq;
3587
				xg *= len_sq;
3588
				xb *= len_sq;
3589
			}
3590
			
3591
			{
3592
				// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).
3593
				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
3594
				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
3595
				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
3596

3597
				// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.
3598
				// The result is the variance along the principle axis.
3599
				//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis
3600
				//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb
3601
				
3602
				mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;
3603
			}
3604

3605
			{
3606
				const float yrgb = grayscale_axis[0];
3607
				
3608
				// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).
3609
				float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];
3610
				float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];
3611
				float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];
3612

3613
				mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;
3614
			}
3615

3616
		} // part_index
3617
				
3618
		// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.
3619
		// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.
3620
		float mode11_total_sq_dist_to_line_alt = 0.0f;
3621
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3622
		{
3623
			float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);
3624
			mode11_total_sq_dist_to_line_alt += d;
3625
		}
3626

3627
		{
3628
#if 0
3629
			// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
3630
			// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
3631
			float total_sq_dist_to_line = 0.0f;
3632
			for (uint32_t i = 0; i < BLOCK_T; i++)
3633
			{
3634
				const uint32_t part_index = (*pPat)[i];
3635
				assert(part_index < num_parts);
3636

3637
				total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);
3638
			}
3639

3640
			mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
3641
#else
3642
			mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;
3643
#endif
3644
			mode11_candidates[examine_iter].m_index = unique_part_index;
3645
		}
3646

3647
		{
3648
			float mode7_total_sq_dist_to_line_alt = 0.0f;
3649
			for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3650
			{
3651
				float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);
3652
				mode7_total_sq_dist_to_line_alt += d;
3653
			}
3654

3655
			mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;
3656
			mode7_candidates[examine_iter].m_index = unique_part_index;
3657
		}
3658

3659
	} // examine_iter
3660

3661
	std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);
3662
	std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);
3663

3664
	for (uint32_t i = 0; i < num_desired_pats; i++)
3665
		pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;
3666

3667
	for (uint32_t i = 0; i < num_desired_pats; i++)
3668
		pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;
3669
}
3670

3671
static void estimate_partitions_mode7(
3672
	uint32_t num_parts, // 2 or 3 partitions
3673
	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
3674
	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
3675
	const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats
3676
	const astc_hdr_codec_base_options& coptions, // options
3677
	uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices
3678
{
3679
	BASISU_NOTE_UNUSED(coptions);
3680
	BASISU_NOTE_UNUSED(num_unique_pats);
3681

3682
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;
3683
	assert(num_parts <= MAX_PARTS);
3684

3685
	struct candidate_res
3686
	{
3687
		float m_total_sq_dist;
3688
		uint32_t m_index;
3689
		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
3690
	};
3691

3692
	const uint32_t MAX_CANDIDATES = 1024;
3693
	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
3694

3695
	candidate_res candidates[MAX_CANDIDATES];
3696

3697
	for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
3698
	{
3699
		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
3700
		assert(unique_part_index < num_unique_pats);
3701

3702
		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
3703

3704
		vec3F part_means[MAX_PARTS];
3705
		uint32_t part_total_texels[MAX_PARTS] = { 0 };
3706

3707
		for (uint32_t i = 0; i < num_parts; i++)
3708
			part_means[i].clear();
3709

3710
		for (uint32_t y = 0; y < BLOCK_H; y++)
3711
		{
3712
			for (uint32_t x = 0; x < BLOCK_W; x++)
3713
			{
3714
				const uint32_t part_index = (*pPat)(x, y);
3715
				assert(part_index < num_parts);
3716

3717
				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
3718
				part_total_texels[part_index]++;
3719

3720
			} // x
3721
		} // y
3722

3723
		for (uint32_t i = 0; i < num_parts; i++)
3724
		{
3725
			assert(part_total_texels[i]);
3726
			part_means[i] /= (float)part_total_texels[i];
3727
		}
3728

3729
		vec3F part_axis(0.5773502691f);
3730
		
3731
		// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
3732
		// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
3733
		float total_sq_dist_to_line = 0.0f;
3734
		for (uint32_t i = 0; i < BLOCK_T; i++)
3735
		{
3736
			const uint32_t part_index = (*pPat)[i];
3737
			assert(part_index < num_parts);
3738

3739
			total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);
3740
		}
3741

3742
		candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
3743

3744
		candidates[examine_iter].m_index = unique_part_index;
3745

3746
	} // examine_iter
3747

3748
	std::sort(&candidates[0], &candidates[num_pats_to_examine]);
3749

3750
	for (uint32_t i = 0; i < num_desired_pats; i++)
3751
		pDesired_pat_indices[i] = candidates[i].m_index;
3752
}
3753

3754
static float calc_deblocking_penalty_itp(
3755
	uint32_t bx, uint32_t by, uint32_t width, uint32_t height,
3756
	const imagef& pass_src_img_itp, const candidate_encoding& candidate)
3757
{
3758
	float total_deblock_penalty = 0.0f;
3759

3760
	float total_orig_mse = 0.0f, total_comp_mse = 0.0f;
3761
	uint32_t total_c = 0;
3762

3763
	for (uint32_t b = 0; b < 4; b++)
3764
	{
3765
		for (uint32_t i = 0; i < 6; i++)
3766
		{
3767
			int ox = 0, oy = 0, qx = 0, qy = 0;
3768

3769
			switch (b)
3770
			{
3771
			case 0:
3772
				ox = bx * 6 + i; oy = (by - 1) * 6 + 5;
3773
				qx = bx * 6 + i; qy = by * 6;
3774
				break;
3775
			case 1:
3776
				ox = bx * 6 + i; oy = (by + 1) * 6;
3777
				qx = bx * 6 + i; qy = by * 6 + 5;
3778
				break;
3779
			case 2:
3780
				ox = (bx - 1) * 6 + 5; oy = by * 6 + i;
3781
				qx = bx * 6; qy = by * 6 + i;
3782
				break;
3783
			case 3:
3784
				ox = (bx + 1) * 6; oy = by * 6 + i;
3785
				qx = bx * 6 + 5; qy = by * 6 + i;
3786
				break;
3787
			}
3788

3789
			if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height))
3790
				continue;
3791

3792
			const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);
3793
			const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);
3794

3795
			const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block
3796
			
3797
			vec3F orig_delta_v(o_pixel_itp - q_pixel_itp);
3798
			total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]);
3799

3800
			vec3F d_delta_v(o_pixel_itp - d_pixel_itp);
3801
			total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]);
3802

3803
			total_c++;
3804
		}
3805
	}
3806

3807
	if (total_c)
3808
	{
3809
		total_orig_mse /= (float)total_c;
3810
		total_comp_mse /= (float)total_c;
3811

3812
		if (total_orig_mse)
3813
		{
3814
			total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse);
3815
		}
3816
	}
3817

3818
	return total_deblock_penalty;
3819
}
3820

3821
static bool calc_strip_size(
3822
	float lambda,
3823
	uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,
3824
	uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)
3825
{
3826
	uint32_t total_strips = 1;
3827

3828
	if (lambda == 0.0f)
3829
	{
3830
		if (!force_one_strip)
3831
		{
3832
			total_strips = total_threads;
3833
		}
3834
	}
3835
	else
3836
	{
3837
		const uint32_t MIN_DESIRED_STRIPS = 8;
3838
		const uint32_t MAX_TARGET_STRIPS = 32;
3839
		const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;
3840

3841
		if (!force_one_strip)
3842
		{
3843
			total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);
3844

3845
			if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)
3846
				total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);
3847
		}
3848

3849
		total_strips = minimum(total_strips, MAX_TARGET_STRIPS);
3850
	}
3851

3852
	uint32_t rows_per_strip = 0;
3853
	if (total_strips <= 1)
3854
	{
3855
		rows_per_strip = num_blocks_y;
3856
	}
3857
	else
3858
	{
3859
		rows_per_strip = (num_blocks_y / total_strips) & ~1;
3860
		
3861
		if (rows_per_strip < 2)
3862
			rows_per_strip = 2;// num_blocks_y;
3863
	}
3864
		
3865
	assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));
3866

3867
	total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;
3868
	
3869
	if (global_cfg.m_debug_output)
3870
	{
3871
		fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);
3872
		fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);
3873
		fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip);
3874
	}
3875

3876
	uint32_t total_rows = 0;
3877
	for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
3878
	{
3879
		uint32_t strip_first_by = strip_index * rows_per_strip;
3880
		uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
3881

3882
		if (strip_index == (total_strips - 1))
3883
			strip_last_by = num_blocks_y - 1;
3884

3885
		uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;
3886
		total_rows += num_strip_block_rows;
3887

3888
		if (global_cfg.m_debug_output)
3889
			fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);
3890
	}
3891

3892
	if (total_rows != num_blocks_y)
3893
	{
3894
		fmt_error_printf("Strip calc failed\n");
3895
		return false;
3896
	}
3897

3898
	res_total_strips = total_strips;
3899
	res_rows_per_strip = rows_per_strip;
3900

3901
	return true;
3902
}
3903

3904
static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)
3905
{
3906
	const uint32_t width = src_img.get_width(), height = src_img.get_height();
3907

3908
	dst_img.resize(width, height);
3909

3910
	for (uint32_t y = 0; y < height; y++)
3911
	{
3912
		for (uint32_t x = 0; x < width; x++)
3913
		{
3914
			vec3F src_rgb(src_img(x, y));
3915

3916
			vec3F src_itp;
3917
			linear_rgb_to_itp(src_rgb, src_itp, cfg);
3918

3919
			dst_img(x, y) = src_itp;
3920
		}
3921
	}
3922
}
3923

3924
// ASTC block footprint handled by this encoder, in texels.
constexpr uint32_t BLOCK_W = 6;
constexpr uint32_t BLOCK_H = 6;
constexpr uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;
static_assert(NUM_BLOCK_PIXELS == 36, "6x6 blocks must contain 36 texels");

// Encoder tuning constants. They are declared constexpr so they are guaranteed compile-time constants
// (and, unlike plain const floats, usable in constant expressions). Values are unchanged.
// NOTE(review): the code consuming these lies outside this part of the file; the groupings below are
// inferred from the names only - confirm against the usage sites before retuning.

// Penalties, presumably applied per block encoding type (solid / reuse / run).
constexpr float SOLID_PENALTY = 4.0f;
constexpr float REUSE_PENALTY = 1.0f;
constexpr float RUN_PENALTY = 10.0f;

// Weights and thresholds, presumably for the error metrics and filtered-source selection.
constexpr float MSE_WEIGHT = 300000.0f;
constexpr float SSIM_WEIGHT = 200.0f;
constexpr float TWO_LEVEL_PENALTY = 1.425f;
constexpr float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f;
constexpr float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f;

// MSE penalty factors, presumably applied when small weight grids are used on complex blocks.
constexpr float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;
constexpr float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;
constexpr float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;
3939

3940
struct uastc_hdr_6x6_debug_state
3941
{
3942
	uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
3943
	uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
3944
	uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
3945
	uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };
3946

3947
	basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
3948
	basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];
3949

3950
	std::atomic<uint32_t> m_total_gaussian1_blocks;
3951
	std::atomic<uint32_t> m_total_gaussian2_blocks;
3952
	std::atomic<uint32_t> m_total_filter_horizontal;
3953
	std::atomic<uint32_t> m_detail_stats[5];
3954
	std::atomic<uint32_t> m_total_mode7_skips;
3955

3956
	std::atomic<uint32_t> m_total_blocks_compressed;
3957

3958
	std::atomic<uint32_t> m_total_candidates_considered;
3959
	std::atomic<uint32_t> m_max_candidates_considered;
3960

3961
	std::atomic<uint32_t> m_total_part2_stats[4];
3962
	std::atomic<uint32_t> m_dp_stats[5];
3963

3964
	std::atomic<uint32_t> m_reuse_num_parts[4];
3965
	std::atomic<uint32_t> m_reuse_total_dp;
3966

3967
	imagef m_stat_vis;
3968
	std::mutex m_stat_vis_mutex;
3969

3970
	image m_part_vis;
3971
	image m_mode_vis;
3972
	image m_mode_vis2;
3973
	image m_grid_vis;
3974
	image m_enc_vis;
3975
	std::mutex m_vis_image_mutex;
3976

3977
	std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];
3978
		
3979
	std::atomic<uint32_t> m_total_jnd_replacements;
3980

3981
	std::mutex m_stats_mutex;
3982

3983
	uastc_hdr_6x6_debug_state()
3984
	{
3985
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
3986
		{
3987
			for (uint32_t j = 0; j < 3; j++)
3988
			{
3989
				m_block_mode_comp_stats[i][j].reserve(512);
3990
				m_block_mode_comparative_stats[i][j].reserve(512);
3991
			}
3992
		}
3993
	}
3994
	
3995
	void init(uint32_t width, uint32_t height)
3996
	{
3997
		m_stat_vis.resize(width, height);
3998
		m_part_vis.resize(width, height);
3999
		m_mode_vis.resize(width, height);
4000
		m_mode_vis2.resize(width, height);
4001
		m_grid_vis.resize(width, height);
4002
		m_enc_vis.resize(width, height);
4003

4004
		basisu::clear_obj(m_encoding_type_hist);
4005
		basisu::clear_obj(m_endpoint_mode_hist);
4006
		basisu::clear_obj(m_block_mode_hist);
4007
		basisu::clear_obj(m_block_mode_total_bits);
4008
		
4009
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
4010
		{
4011
			for (uint32_t j = 0; j < 3; j++)
4012
			{
4013
				m_block_mode_comp_stats[i][j].clear();
4014
				m_block_mode_comparative_stats[i][j].clear();
4015
			}
4016
		}
4017

4018
		m_total_gaussian1_blocks.store(0);
4019
		m_total_gaussian2_blocks.store(0);
4020
		m_total_filter_horizontal.store(0);
4021
		for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
4022
			m_detail_stats[i].store(0);
4023
		m_total_mode7_skips.store(0);
4024

4025
		for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
4026
			m_comp_level_hist[i].store(0);
4027

4028
		m_total_blocks_compressed.store(0);
4029

4030
		m_total_candidates_considered.store(0);
4031
		m_max_candidates_considered.store(0);
4032

4033
		for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
4034
			m_total_part2_stats[i].store(0);
4035
		
4036
		for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
4037
			m_dp_stats[i].store(0);
4038

4039
		for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
4040
			m_reuse_num_parts[i] .store(0);
4041

4042
		m_reuse_total_dp.store(0);
4043

4044
		m_total_jnd_replacements.store(0);
4045
	}
4046

4047
	void print(uint32_t total_blocks) const
4048
	{
4049
		fmt_printf("Total blocks: {}\n", total_blocks);
4050
		fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
4051
		fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
4052
		fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
4053
		fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
4054
		fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
4055
		fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
4056
		fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);
4057

4058
		fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
4059
		fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);
4060

4061
		fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
4062
		fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
4063
		fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
4064
		fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);
4065

4066
		fmt_printf("\nEncoding type histogram:\n");
4067
		for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
4068
			fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);
4069

4070
		fmt_printf("\nEndpoint mode histogram:\n");
4071
		for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
4072
			fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);
4073

4074
		fmt_printf("\nBlock mode histogram:\n");
4075

4076
		uint32_t total_dp = 0, total_sp = 0;
4077
		uint32_t total_mode11 = 0, total_mode7 = 0;
4078
		uint32_t part_hist[3] = { 0 };
4079
		uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
4080
		uint32_t total_used_modes = 0;
4081
		for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
4082
		{
4083
			const auto& bm_desc = g_block_mode_descs[i];
4084

4085
			const uint32_t total_uses = m_block_mode_hist[i];
4086

4087
			if (bm_desc.m_dp)
4088
				total_dp += total_uses;
4089
			else
4090
				total_sp += total_uses;
4091

4092
			if (bm_desc.m_cem == 7)
4093
				total_mode7 += total_uses;
4094
			else
4095
				total_mode11 += total_uses;
4096

4097
			part_hist[bm_desc.m_num_partitions - 1] += total_uses;
4098

4099
			if (bm_desc.m_num_partitions == 2)
4100
			{
4101
				if (bm_desc.m_cem == 7)
4102
					part2_mode7_total += total_uses;
4103
				else
4104
				{
4105
					assert(bm_desc.m_cem == 11);
4106
					part2_mode11_total += total_uses;
4107
				}
4108
			}
4109

4110
			float avg_std_dev = 0.0f;
4111
			float avg_cross_correlations[3] = { 0 };
4112

4113
			if (m_block_mode_comp_stats[i][0].size())
4114
			{
4115
				const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();
4116

4117
				for (uint32_t j = 0; j < num_uses; j++)
4118
					avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
4119
				avg_std_dev /= (float)num_uses;
4120

4121
				for (uint32_t j = 0; j < num_uses; j++)
4122
				{
4123
					avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
4124
					avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
4125
					avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
4126
				}
4127

4128
				avg_cross_correlations[0] /= (float)num_uses;
4129
				avg_cross_correlations[1] /= (float)num_uses;
4130
				avg_cross_correlations[2] /= (float)num_uses;
4131
			}
4132

4133
			fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
4134
				bm_desc.m_cem,
4135
				bm_desc.m_dp, bm_desc.m_dp_channel,
4136
				bm_desc.m_num_partitions,
4137
				bm_desc.m_grid_x, bm_desc.m_grid_y,
4138
				astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
4139
				astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
4140
				total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
4141
				avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);
4142

4143
			if (total_uses)
4144
				total_used_modes++;
4145
		}
4146

4147
		fmt_printf("Total used modes: {}\n", total_used_modes);
4148

4149
		fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
4150
		fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
4151
		fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
4152
		fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
4153
	}
4154
};
4155

4156
struct uastc_hdr_6x6_encode_state
4157
{
4158
	astc_hdr_codec_base_options master_coptions;
4159
		
4160
	imagef src_img;
4161
		
4162
	imagef src_img_filtered1;
4163
	imagef src_img_filtered2;
4164

4165
	imagef src_img_itp;
4166
	imagef src_img_filtered1_itp;
4167
	imagef src_img_filtered2_itp;
4168

4169
	vector2D<float> smooth_block_mse_scales;
4170

4171
	imagef packed_img;
4172

4173
	basisu::vector<bitwise_coder> strip_bits;
4174

4175
	basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;
4176

4177
	vector2D<candidate_encoding> coded_blocks;
4178
};
4179

4180
static bool compress_strip_task(
4181
	uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
4182
	uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
4183
	astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
4184
{
4185
	BASISU_NOTE_UNUSED(num_blocks_y);
4186
	BASISU_NOTE_UNUSED(total_strips);
4187
	
4188
	vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
4189
	basisu::clear_obj(prev_comp_pixels);
4190

4191
	uint32_t prev_run_len = 0;
4192

4193
	bitwise_coder prev_encoding;
4194
	candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
4195
	candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written
4196

4197
	bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];
4198

4199
	const uint32_t CANDIDATES_TO_RESERVE = 1536;
4200

4201
	basisu::vector<candidate_encoding> candidates;
4202
	candidates.reserve(CANDIDATES_TO_RESERVE);
4203

4204
	for (uint32_t by = strip_first_by; by <= strip_last_by; by++)
4205
	{
4206
		const bool has_upper_neighbor = by > strip_first_by;
4207

4208
		for (uint32_t bx = 0; bx < num_blocks_x; bx++)
4209
		{
4210
			//if ((bx == 1) && (by == 2))
4211
			//	basisu::fmt_printf("!");
4212

4213
			for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)
4214
			{
4215
				const bool has_left_neighbor = bx > 0;
4216
				//const bool has_prev = has_left_neighbor || has_upper_neighbor;
4217

4218
				// Select either the original source image, or the Gaussian filtered version.
4219
				// From here the encoder *must* use these 2 sources.
4220
				const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :
4221
					((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);
4222

4223
				const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :
4224
					((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp);
4225

4226
				// Extract source image block
4227
				vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]
4228
				pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
4229

4230
				vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]
4231
				pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
4232

4233
				half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values
4234
				vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats
4235
				vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding
4236
				vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations
4237

4238
				bool is_grayscale = true;
4239

4240
				candidates.resize(0);
4241

4242
				float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;
4243

4244
				for (uint32_t y = 0; y < BLOCK_H; y++)
4245
				{
4246
					for (uint32_t x = 0; x < BLOCK_W; x++)
4247
					{
4248
						vec3F rgb_input;
4249

4250
						for (uint32_t c = 0; c < 3; c++)
4251
						{
4252
							float v = block_pixels[y][x][c];
4253

4254
							rgb_input[c] = v;
4255

4256
							const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);
4257
							assert(h == basist::float_to_half(v));
4258

4259
							half_pixels[y][x][c] = h;
4260

4261
							block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);
4262

4263
							half_pixels_as_floats[y][x][c] = (float)h;
4264

4265
						} // c
4266

4267
						float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));
4268
						if (py < block_ly)
4269
							block_ly = py;
4270
						if (py > block_hy)
4271
							block_hy = py;
4272
						block_avg_y += py;
4273

4274
						//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);
4275

4276
						block_pixels_as_itp[y][x] = block_pixels_itp[y][x];
4277

4278
						block_pixels_q16[y][x][3] = 0.0f;
4279

4280
						if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))
4281
							is_grayscale = false;
4282

4283
					} // x
4284
				} // y
4285

4286
				block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS);
4287

4288
				encode_astc_block_stats enc_block_stats;
4289
				enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);
4290

4291
				vec4F x_filtered[6][6], y_filtered[6][6];
4292

4293
				filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)
4294
				filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)
4295

4296
				const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);
4297
				const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);
4298
				const bool filter_horizontally = filtered_x_err < filtered_y_err;
4299

4300
				//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);
4301

4302
				if (filter_horizontally)
4303
					debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);
4304

4305
				vec3F lowpass_filtered[6][6];
4306
				filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);
4307
				float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);
4308

4309
				const bool very_detailed_block = lowpass_std_dev > 350.0f;
4310
				const bool very_blurry_block = lowpass_std_dev < 30.0f;
4311
				const bool super_blurry_block = lowpass_std_dev < 15.0f;
4312

4313
				basisu::stats<float> half_comp_stats[3];
4314
				for (uint32_t c = 0; c < 3; c++)
4315
					half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);
4316

4317
				const float SINGLE_PART_HALF_THRESH = 256.0f;
4318
				const float COMPLEX_HALF_THRESH = 1024.0f;
4319
				// HACK HACK
4320
				const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;
4321

4322
				const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);
4323

4324
				const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);
4325
				const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);
4326
				const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);
4327

4328
				// Dynamically choose a comp_level for this block.
4329
				astc_hdr_codec_base_options coptions(enc_state.master_coptions);
4330
				uint32_t comp_level = global_cfg.m_master_comp_level;
4331

4332
				if (very_complex_block)
4333
					comp_level = global_cfg.m_highest_comp_level;
4334
				else if (complex_block)
4335
					comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;
4336

4337
				debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);
4338

4339
				bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;
4340
				BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);
4341

4342
				for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
4343
				{
4344
					if (comp_level == 0)
4345
					{
4346
						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
4347
							continue;
4348
					}
4349
					else if (comp_level == 1)
4350
					{
4351
						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
4352
							continue;
4353
					}
4354
					else if (comp_level == 2)
4355
					{
4356
						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
4357
							continue;
4358
					}
4359

4360
					if (g_block_mode_descs[i].m_num_partitions == 2)
4361
					{
4362
						any_2subset_enabled = true;
4363

4364
						if (g_block_mode_descs[i].m_cem == 7)
4365
						{
4366
							any_2subset_mode7_enabled = true;
4367
						}
4368
						else
4369
						{
4370
							assert(g_block_mode_descs[i].m_cem == 11);
4371
							any_2subset_mode11_enabled = true;
4372
						}
4373
					}
4374
					else if (g_block_mode_descs[i].m_num_partitions == 3)
4375
						any_3subset_enabled = true;
4376
				}
4377

4378
				coptions.m_mode7_full_s_optimization = (comp_level >= 2);
4379

4380
				const bool uber_mode_flag = (comp_level >= 3);
4381
				coptions.m_allow_uber_mode = uber_mode_flag;
4382

4383
				coptions.m_ultra_quant = (comp_level >= 4);
4384

4385
				coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);
4386
				coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);
4387

4388
				coptions.m_disable_weight_plane_optimization = (comp_level >= 2);
4389

4390
				// -------------------
4391

4392
				uint32_t total_used_block_chans = 0;
4393
				for (uint32_t i = 0; i < 3; i++)
4394
					total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);
4395

4396
				const bool is_solid_block = (total_used_block_chans == 0);
4397

4398
				basisu::comparative_stats<float> half_cross_chan_stats[3];
4399

4400
				// R vs. G
4401
				half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,
4402
					&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],
4403
					3, 3,
4404
					&half_comp_stats[0], &half_comp_stats[1]);
4405

4406
				// R vs. B
4407
				half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,
4408
					&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],
4409
					3, 3,
4410
					&half_comp_stats[0], &half_comp_stats[2]);
4411

4412
				// G vs. B
4413
				half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,
4414
					&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],
4415
					3, 3,
4416
					&half_comp_stats[1], &half_comp_stats[2]);
4417

4418
				const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);
4419
				const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);
4420
				const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);
4421

4422
				float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;
4423
				for (uint32_t i = 0; i < 3; i++)
4424
				{
4425
					if (half_comp_stats[i].m_range > 0.0f)
4426
					{
4427
						const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);
4428
						min_corr = minimum(min_corr, c);
4429
						max_corr = maximum(max_corr, c);
4430
					}
4431
				}
4432

4433
				bool use_single_subset_mode7 = true;
4434
				if (comp_level <= 1)
4435
				{
4436
					// TODO: could also compute angle between principal axis and the grayscale axis.
4437
					// TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance
4438
					const float MODE7_MIN_CHAN_CORR = .5f;
4439
					const float MODE7_PCA_ANGLE_THRESH = .9f;
4440
					use_single_subset_mode7 = is_grayscale || is_solid_block || (min_corr >= MODE7_MIN_CHAN_CORR);
4441

4442
					if (use_single_subset_mode7)
4443
					{
4444
						float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));
4445
						if (cos_ang < MODE7_PCA_ANGLE_THRESH)
4446
							use_single_subset_mode7 = false;
4447
					}
4448
				}
4449

4450
				const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);
4451

4452
				int desired_dp_chan = -1;
4453
				if (total_used_block_chans <= 1)
4454
				{
4455
					// no need for dual plane (except possibly 2x2 weight grids for RDO)
4456
				}
4457
				else
4458
				{
4459
					if (min_corr >= STRONG_CORR_THRESH)
4460
					{
4461
						// all channel pairs strongly correlated, no need for dual plane
4462
						debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);
4463
					}
4464
					else
4465
					{
4466
						if (total_used_block_chans == 2)
4467
						{
4468
							if (half_comp_stats[0].m_range == 0.0f)
4469
							{
4470
								// r unused, check for strong gb correlation
4471
								if (gb_corr < STRONG_CORR_THRESH)
4472
									desired_dp_chan = 1;
4473
							}
4474
							else if (half_comp_stats[1].m_range == 0.0f)
4475
							{
4476
								// g unused, check for strong rb correlation
4477
								if (rb_corr < STRONG_CORR_THRESH)
4478
									desired_dp_chan = 0;
4479
							}
4480
							else
4481
							{
4482
								// b unused, check for strong rg correlation
4483
								if (rg_corr < STRONG_CORR_THRESH)
4484
									desired_dp_chan = 0;
4485
							}
4486
						}
4487
						else
4488
						{
4489
							assert(total_used_block_chans == 3);
4490

4491
							// see if rg/rb is weakly correlated vs. gb
4492
							if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
4493
								desired_dp_chan = 0;
4494
							// see if gr/gb is weakly correlated vs. rb
4495
							else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
4496
								desired_dp_chan = 1;
4497
							// assume b is weakest
4498
							else
4499
								desired_dp_chan = 2;
4500
						}
4501

4502
						if (desired_dp_chan == -1)
4503
							debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);
4504
						else
4505
							debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);
4506
					}
4507
				}
4508

4509
				// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.
4510
				int desired_dp_chan_2x2 = 0;
4511
				if (total_used_block_chans == 2)
4512
				{
4513
					if (half_comp_stats[0].m_range == 0.0f)
4514
						desired_dp_chan_2x2 = 1;
4515
				}
4516
				else if (total_used_block_chans == 3)
4517
				{
4518
					// see if rg/rb is weakly correlated vs. gb
4519
					if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
4520
						desired_dp_chan_2x2 = 0;
4521
					// see if gr/gb is weakly correlated vs. rb
4522
					else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
4523
						desired_dp_chan_2x2 = 1;
4524
					// assume b is weakest
4525
					else
4526
						desired_dp_chan_2x2 = 2;
4527
				}
4528

4529
				// Gather all candidate encodings
4530
				bool status = false;
4531

4532
				// ---- Run candidate
4533
				if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))
4534
				{
4535
					candidate_encoding candidate;
4536
					candidate.m_coder.reserve(24);
4537

4538
					candidate.m_encoding_type = encoding_type::cRun;
4539

4540
					candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;
4541
					candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;
4542

4543
					memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));
4544

4545
					if (!prev_run_len)
4546
					{
4547
						candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
4548
						candidate.m_coder.put_vlc(0, 5);
4549
					}
4550
					else
4551
					{
4552
						// extend current run - compute the # of new bits needed for the extension.
4553

4554
						uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
4555
						assert(prev_run_bits > 0);
4556

4557
						// We're not actually going to code this, because the previously emitted run code will be extended.
4558
						bitwise_coder temp_coder;
4559
						temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
4560
						temp_coder.put_vlc((prev_run_len + 1) - 1, 5);
4561

4562
						uint32_t cur_run_bits = temp_coder.get_total_bits_u32();
4563
						assert(cur_run_bits >= prev_run_bits);
4564

4565
						uint32_t total_new_bits = cur_run_bits - prev_run_bits;
4566
						if (total_new_bits > 0)
4567
							candidate.m_coder.put_bits(0, total_new_bits); // dummy bits
4568
					}
4569

4570
					candidate.m_run_len = prev_run_len + 1;
4571

4572
					candidates.emplace_back(std::move(candidate));
4573
				}
4574

4575
				// ---- Reuse candidate
4576
				if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))
4577
				{
4578
					for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)
4579
					{
4580
						const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
4581
						const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
4582

4583
						const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;
4584
						if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))
4585
							continue;
4586
						if (reuse_by < (int)strip_first_by)
4587
							break;
4588

4589
						const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);
4590

4591
						// TODO - support this.
4592
						if (prev_candidate.m_encoding_type == encoding_type::cSolid)
4593
							continue;
4594
						assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));
4595

4596
						candidate_encoding candidate;
4597
						candidate.m_coder.reserve(24);
4598
						astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
4599
						astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;
4600

4601
						const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;
4602

4603
						const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;
4604
						const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;
4605
						const uint32_t num_grid_samples = grid_x * grid_y;
4606
						const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);
4607

4608
						coded_log_blk = prev_candidate.m_coded_log_blk;
4609
						decomp_log_blk = prev_candidate.m_decomp_log_blk;
4610

4611
						if (prev_coded_log_blk.m_num_partitions == 1)
4612
						{
4613
							// Now encode the block using the transcoded endpoints
4614
							basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4615

4616
							if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
4617
							{
4618
								status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
4619
									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4620
							}
4621
							else
4622
							{
4623
								status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
4624
									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4625
							}
4626
							assert(status);
4627

4628
							uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
4629
							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
4630

4631
							if (dual_plane)
4632
							{
4633
								eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,
4634
									BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4635

4636
								downsample_ise_weights_dual_plane(
4637
									coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4638
									BLOCK_W, BLOCK_H,
4639
									grid_x, grid_y,
4640
									trial_weights0, trial_weights1, coded_log_blk.m_weights);
4641

4642
								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4643
							}
4644
							else
4645
							{
4646
								eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4647

4648
								downsample_ise_weights(
4649
									coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4650
									BLOCK_W, BLOCK_H,
4651
									grid_x, grid_y,
4652
									trial_weights0, coded_log_blk.m_weights);
4653

4654
								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4655
							}
4656

4657
							// Create the block the decoder would transcode into.
4658
							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4659
						}
4660
						else if (prev_coded_log_blk.m_num_partitions == 2)
4661
						{
4662
							assert(!dual_plane);
4663

4664
							const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];
4665
							assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));
4666

4667
							const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];
4668

4669
							vec4F part_pixels_q16[2][64];
4670
							half_vec3 part_half_pixels[2][64];
4671
							uint32_t part_total_pixels[2] = { 0 };
4672

4673
							for (uint32_t y = 0; y < BLOCK_H; y++)
4674
							{
4675
								for (uint32_t x = 0; x < BLOCK_W; x++)
4676
								{
4677
									const uint32_t part_index = pat_vec[x + y * 6];
4678

4679
									uint32_t l = part_total_pixels[part_index];
4680

4681
									part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
4682
									part_half_pixels[part_index][l] = half_pixels[y][x];
4683

4684
									part_total_pixels[part_index] = l + 1;
4685
								} // x 
4686
							} // y
4687

4688
							uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
4689

4690
							for (uint32_t part_index = 0; part_index < 2; part_index++)
4691
							{
4692
								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4693

4694
								if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
4695
								{
4696
									status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4697
										astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4698
								}
4699
								else
4700
								{
4701
									status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4702
										astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4703
								}
4704
								assert(status);
4705

4706
								eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
4707
									(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4708

4709
							} // part_index
4710

4711
							uint8_t ise_weights[BLOCK_W * BLOCK_H];
4712

4713
							uint32_t src_pixel_index[2] = { 0, 0 };
4714
							for (uint32_t y = 0; y < BLOCK_H; y++)
4715
							{
4716
								for (uint32_t x = 0; x < BLOCK_W; x++)
4717
								{
4718
									const uint32_t part_index = pat_vec[x + y * 6];
4719

4720
									ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
4721
									src_pixel_index[part_index]++;
4722
								} // x
4723
							} // y
4724

4725
							downsample_ise_weights(
4726
								coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4727
								BLOCK_W, BLOCK_H,
4728
								grid_x, grid_y,
4729
								ise_weights, coded_log_blk.m_weights);
4730

4731
							// Transcode these codable weights to ASTC weights.
4732
							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
4733
							basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4734

4735
							// Create the block the decoder would transcode into.
4736
							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4737
						}
4738
						else if (prev_coded_log_blk.m_num_partitions == 3)
4739
						{
4740
							assert(!dual_plane);
4741

4742
							const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];
4743
							assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));
4744

4745
							const partition_pattern_vec& pat = g_partitions3[unique_pat_index];
4746

4747
							vec4F part_pixels_q16[3][64];
4748
							half_vec3 part_half_pixels[3][64];
4749
							uint32_t part_total_pixels[3] = { 0 };
4750

4751
							for (uint32_t y = 0; y < BLOCK_H; y++)
4752
							{
4753
								for (uint32_t x = 0; x < BLOCK_W; x++)
4754
								{
4755
									const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
4756

4757
									uint32_t l = part_total_pixels[part_index];
4758

4759
									part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
4760
									part_half_pixels[part_index][l] = half_pixels[y][x];
4761

4762
									part_total_pixels[part_index] = l + 1;
4763
								} // x 
4764
							} // y
4765

4766
							uint8_t blk_weights[3][BLOCK_W * BLOCK_H];
4767

4768
							for (uint32_t part_index = 0; part_index < 3; part_index++)
4769
							{
4770
								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4771

4772
								status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4773
									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4774
								assert(status);
4775

4776
								eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
4777
									(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4778

4779
							} // part_index
4780

4781
							uint8_t ise_weights[BLOCK_W * BLOCK_H];
4782

4783
							uint32_t src_pixel_index[3] = { 0 };
4784
							for (uint32_t y = 0; y < BLOCK_H; y++)
4785
							{
4786
								for (uint32_t x = 0; x < BLOCK_W; x++)
4787
								{
4788
									const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
4789

4790
									ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
4791
									src_pixel_index[part_index]++;
4792
								} // x
4793
							} // y
4794

4795
							downsample_ise_weights(
4796
								coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4797
								BLOCK_W, BLOCK_H,
4798
								grid_x, grid_y,
4799
								ise_weights, coded_log_blk.m_weights);
4800

4801
							// Transcode these codable weights to ASTC weights.
4802
							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
4803
							basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4804

4805
							// Create the block the decoder would transcode into.
4806
							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4807
						}
4808

4809
						if (!validate_log_blk(decomp_log_blk))
4810
						{
4811
							fmt_error_printf("pack_astc_block() failed\n");
4812
							return false;
4813
						}
4814

4815
						status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);
4816
						if (!status)
4817
						{
4818
							fmt_error_printf("decode_astc_block() failed\n");
4819
							return false;
4820
						}
4821

4822
						candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);
4823
						candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);
4824
						encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);
4825

4826
						candidate.m_encoding_type = encoding_type::cReuse;
4827
						candidate.m_block_mode = prev_candidate.m_block_mode;
4828
						candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;
4829
						candidate.m_reuse_delta_index = reuse_delta_index;
4830

4831
						candidates.emplace_back(std::move(candidate));
4832

4833
					} // reuse_delta_index
4834
				}
4835

4836
				// ---- Solid candidate
4837
				if (global_cfg.m_use_solid_blocks)
4838
				{
4839
					candidate_encoding candidate;
4840
					candidate.m_coder.reserve(24);
4841

4842
					// solid
4843
					candidate.m_encoding_type = encoding_type::cSolid;
4844

4845
					float r = 0.0f, g = 0.0f, b = 0.0f;
4846
					const float LOG_BIAS = .125f;
4847
					bool solid_block = true;
4848
					for (uint32_t y = 0; y < BLOCK_H; y++)
4849
					{
4850
						for (uint32_t x = 0; x < BLOCK_W; x++)
4851
						{
4852
							if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||
4853
								(block_pixels[0][0][1] != block_pixels[y][x][1]) ||
4854
								(block_pixels[0][0][2] != block_pixels[y][x][2]))
4855
							{
4856
								solid_block = false;
4857
							}
4858

4859
							r += log2f(block_pixels[y][x][0] + LOG_BIAS);
4860
							g += log2f(block_pixels[y][x][1] + LOG_BIAS);
4861
							b += log2f(block_pixels[y][x][2] + LOG_BIAS);
4862
						}
4863
					}
4864

4865
					if (solid_block)
4866
					{
4867
						r = block_pixels[0][0][0];
4868
						g = block_pixels[0][0][1];
4869
						b = block_pixels[0][0][2];
4870
					}
4871
					else
4872
					{
4873
						r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4874
						g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4875
						b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4876

4877
						r = minimum<float>(r, basist::MAX_HALF_FLOAT);
4878
						g = minimum<float>(g, basist::MAX_HALF_FLOAT);
4879
						b = minimum<float>(b, basist::MAX_HALF_FLOAT);
4880
					}
4881

4882
					basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);
4883

4884
					candidate.m_solid_color[0] = rh;
4885
					candidate.m_solid_color[1] = gh;
4886
					candidate.m_solid_color[2] = bh;
4887

4888
					candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);
4889

4890
					candidate.m_coder.put_bits(rh, 15);
4891
					candidate.m_coder.put_bits(gh, 15);
4892
					candidate.m_coder.put_bits(bh, 15);
4893

4894
					vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));
4895

4896
					for (uint32_t y = 0; y < BLOCK_H; y++)
4897
						for (uint32_t x = 0; x < BLOCK_W; x++)
4898
							candidate.m_comp_pixels[y][x] = cp;
4899

4900
					astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;
4901

4902
					log_blk.clear();
4903
					log_blk.m_solid_color_flag_hdr = true;
4904
					log_blk.m_solid_color[0] = rh;
4905
					log_blk.m_solid_color[1] = gh;
4906
					log_blk.m_solid_color[2] = bh;
4907
					log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
4908

4909
					candidate.m_decomp_log_blk = log_blk;
4910

4911
					candidates.emplace_back(std::move(candidate));
4912
				}
4913

4914
				if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))
4915
				{
4916
					static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };
4917
					static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };
4918

4919
					static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };
4920
					static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 };
4921

4922
					static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };
4923
					static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };
4924

4925
					uint32_t total_parts2 = 0, total_parts3 = 0;
4926

4927
					assert(comp_level < 5);
4928
					if ((very_simple_block) && (comp_level <= 3))
4929
					{
4930
						// Block's std dev is so low that 2-3 subsets are unlikely to help much
4931
						total_parts2 = 0;
4932
						total_parts3 = 0;
4933

4934
						debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);
4935
					}
4936
					else if (very_complex_block)
4937
					{
4938
						total_parts2 = s_parts2_very_complex[comp_level];
4939
						total_parts3 = s_parts3_very_complex[comp_level];
4940

4941
						if (global_cfg.m_extra_patterns_flag)
4942
						{
4943
							total_parts2 += (comp_level == 4) ? 30 : 20;
4944
							total_parts3 += (comp_level == 4) ? 30 : 20;
4945
						}
4946

4947
						debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);
4948
					}
4949
					else if (complex_block)
4950
					{
4951
						total_parts2 = s_parts2_complex[comp_level];
4952
						total_parts3 = s_parts3_complex[comp_level];
4953

4954
						if (global_cfg.m_extra_patterns_flag)
4955
						{
4956
							total_parts2 += (comp_level == 4) ? 15 : 10;
4957
							total_parts3 += (comp_level == 4) ? 15 : 10;
4958
						}
4959

4960
						debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);
4961
					}
4962
					else
4963
					{
4964
						// moderate complexity - use defaults
4965
						total_parts2 = s_parts2_normal[comp_level];
4966
						total_parts3 = s_parts3_normal[comp_level];
4967

4968
						if (global_cfg.m_extra_patterns_flag)
4969
						{
4970
							total_parts2 += 5;
4971
							total_parts3 += 5;
4972
						}
4973

4974
						debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);
4975
					}
4976

4977
					if (!any_2subset_enabled)
4978
						total_parts2 = 0;
4979

4980
					if (!any_3subset_enabled)
4981
						total_parts3 = 0;
4982

4983
					int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];
4984
					bool has_estimated_parts2 = false;
4985

4986
					if (total_parts2)
4987
					{
4988
						if (global_cfg.m_brute_force_partition_matching)
4989
						{
4990
							int candidate_pats2[NUM_UNIQUE_PARTITIONS2];
4991
							for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)
4992
								candidate_pats2[i] = i;
4993

4994
							if (any_2subset_enabled)
4995
							{
4996
								estimate_partitions_mode7_and_11(
4997
									2,
4998
									NUM_UNIQUE_PARTITIONS2, g_partitions2,
4999
									NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,
5000
									&half_pixels_as_floats[0][0],
5001
									coptions,
5002
									total_parts2, best_parts2_mode11, best_parts2_mode7);
5003
							}
5004

5005
							has_estimated_parts2 = true;
5006
						}
5007
						else
5008
						{
5009
							if (comp_level >= 1)
5010
							{
5011
								const uint32_t MAX_CANDIDATES2 = 48;
5012
								int candidate_pats2[MAX_CANDIDATES2 * 2];
5013

5014
								uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));
5015
								num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));
5016

5017
								has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);
5018

5019
								if (has_estimated_parts2)
5020
								{
5021
									estimate_partitions_mode7_and_11(
5022
										2,
5023
										NUM_UNIQUE_PARTITIONS2, g_partitions2,
5024
										num_candidate_pats2, (uint32_t*)candidate_pats2,
5025
										&half_pixels_as_floats[0][0],
5026
										coptions,
5027
										total_parts2, best_parts2_mode11, best_parts2_mode7);
5028
								}
5029
							}
5030
							else
5031
							{
5032
								has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);
5033

5034
								if ((has_estimated_parts2) && (any_2subset_mode7_enabled))
5035
									memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));
5036
							}
5037
						}
5038
					}
5039

5040
					int best_parts3[NUM_UNIQUE_PARTITIONS3];
5041
					bool has_estimated_parts3 = false;
5042

5043
					if (total_parts3)
5044
					{
5045
#if 0
5046
						has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);
5047
#elif 1
5048
						if (global_cfg.m_brute_force_partition_matching)
5049
						{
5050
							int candidate_pats3[NUM_UNIQUE_PARTITIONS3];
5051
							for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)
5052
								candidate_pats3[i] = i;
5053

5054
							estimate_partitions_mode7(
5055
								3,
5056
								NUM_UNIQUE_PARTITIONS3, g_partitions3,
5057
								NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,
5058
								&half_pixels_as_floats[0][0],
5059
								coptions,
5060
								total_parts3, (uint32_t*)best_parts3);
5061

5062
							has_estimated_parts3 = true;
5063
						}
5064
						else
5065
						{
5066
							const uint32_t MAX_CANDIDATES3 = 48;
5067
							int candidate_pats3[MAX_CANDIDATES3 * 2];
5068

5069
							uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));
5070
							num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));
5071

5072
							has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);
5073

5074
							if (has_estimated_parts3)
5075
							{
5076
								estimate_partitions_mode7(
5077
									3,
5078
									NUM_UNIQUE_PARTITIONS3, g_partitions3,
5079
									num_candidate_pats3, (uint32_t*)candidate_pats3,
5080
									&half_pixels_as_floats[0][0],
5081
									coptions,
5082
									total_parts3, (uint32_t*)best_parts3);
5083
							}
5084
						}
5085
#endif
5086
					}
5087

5088
					const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;
5089

5090
					// ---- Encoded block candidate
5091
					for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)
5092
					{
5093
						const block_mode bm = (block_mode)block_mode_iter;
5094

5095
						if (comp_level == 0)
5096
						{
5097
							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
5098
								continue;
5099
						}
5100
						else if (comp_level == 1)
5101
						{
5102
							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
5103
								continue;
5104
						}
5105
						else if (comp_level == 2)
5106
						{
5107
							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
5108
								continue;
5109
						}
5110

5111
						if (global_cfg.m_block_stat_optimizations_flag)
5112
						{
5113
							if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))
5114
							{
5115
								if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
5116
								{
5117
									if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)
5118
										continue;
5119
								}
5120
								else
5121
								{
5122
									if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)
5123
										continue;
5124
								}
5125
							}
5126

5127
							if (comp_level <= 3)
5128
							{
5129
								const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;
5130
								const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;
5131

5132
								if (!g_block_mode_descs[block_mode_iter].m_dp)
5133
								{
5134
									// Minor gain (.5-1% less candidates)
5135
									if (very_detailed_block)
5136
									{
5137
										if (grid_x * grid_y <= 12)
5138
										{
5139
											debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);
5140
											continue;
5141
										}
5142
									}
5143

5144
									// Major gains (10-25% less candidates)
5145
									if (very_blurry_block)
5146
									{
5147
										if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
5148
										{
5149
											debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);
5150
											continue;
5151
										}
5152
									}
5153
									if (super_blurry_block)
5154
									{
5155
										if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
5156
										{
5157
											debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);
5158
											continue;
5159
										}
5160
									}
5161
								}
5162

5163
								if (grid_x != grid_y)
5164
								{
5165
									if (grid_x < grid_y)
5166
									{
5167
										if (!filter_horizontally)
5168
										{
5169
											debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);
5170
											continue;
5171
										}
5172
									}
5173
									else
5174
									{
5175
										if (filter_horizontally)
5176
										{
5177
											debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);
5178
											continue;
5179
										}
5180
									}
5181
								}
5182
							}
5183

5184
							if (global_cfg.m_lambda == 0.0f)
5185
							{
5186
								// Rarely useful if lambda=0
5187
								if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
5188
									continue;
5189
							}
5190
						} // block_stat_optimizations_flag
5191

5192
						if ((!use_single_subset_mode7) &&
5193
							(g_block_mode_descs[block_mode_iter].m_cem == 7) &&
5194
							(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))
5195
						{
5196
							debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);
5197
							continue;
5198
						}
5199

5200
						for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)
5201
						{
5202
							if (global_cfg.m_lambda == 0.0f)
5203
							{
5204
								// No use trying anything else
5205
								if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)
5206
									continue;
5207
							}
5208

5209
							if (global_cfg.m_disable_delta_endpoint_usage)
5210
							{
5211
								if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))
5212
									continue;
5213
							}
5214

5215
							if (!global_cfg.m_favor_higher_compression)
5216
							{
5217
								if (comp_level == 0)
5218
								{
5219
									if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)
5220
										continue;
5221
								}
5222

5223
								if (comp_level <= 1)
5224
								{
5225
									if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))
5226
										continue;
5227
								}
5228
							}
5229

5230
							const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;
5231

5232
							switch (em)
5233
							{
5234
							case endpoint_mode::cUseLeft:
5235
							case endpoint_mode::cUseUpper:
5236
							{
5237
								const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
5238
								const uint32_t cem = local_md.m_cem;
5239

5240
								if (local_md.m_num_partitions > 1)
5241
									break;
5242

5243
								if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))
5244
									break;
5245
								else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))
5246
									break;
5247

5248
								candidate_encoding candidate;
5249
								candidate.m_coder.reserve(24);
5250
								astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5251

5252
								int nx = bx, ny = by;
5253
								if (em == endpoint_mode::cUseLeft)
5254
									nx--;
5255
								else
5256
									ny--;
5257

5258
								const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
5259
								if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
5260
									break;
5261
								assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
5262

5263
								const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
5264

5265
								if (neighbor_md.m_cem != cem)
5266
									break;
5267

5268
								assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);
5269

5270
								const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
5271
								const bool dual_plane = local_md.m_dp;
5272
								const uint32_t num_grid_samples = grid_x * grid_y;
5273
								const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
5274

5275
								coded_log_blk.m_grid_width = (uint8_t)grid_x;
5276
								coded_log_blk.m_grid_height = (uint8_t)grid_y;
5277
								coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5278
								coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5279
								coded_log_blk.m_num_partitions = 1;
5280
								coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;
5281
								coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
5282

5283
								// We're not explicitly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).
5284
								coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;
5285
								memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);
5286

5287
								uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
5288

5289
								// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.
5290
								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
5291
									neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
5292
									local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
5293

5294
								// Now encode the block using the transcoded endpoints
5295
								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
5296

5297
								if (cem == 7)
5298
								{
5299
									status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
5300
										astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
5301
								}
5302
								else
5303
								{
5304
									status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
5305
										astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
5306
								}
5307
								if (!status)
5308
									break;
5309

5310
								uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
5311
								if (dual_plane)
5312
								{
5313
									eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
5314

5315
									downsample_ise_weights_dual_plane(
5316
										local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5317
										BLOCK_W, BLOCK_H,
5318
										grid_x, grid_y,
5319
										trial_weights0, trial_weights1, coded_log_blk.m_weights);
5320
								}
5321
								else
5322
								{
5323
									eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
5324

5325
									downsample_ise_weights(
5326
										local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5327
										BLOCK_W, BLOCK_H,
5328
										grid_x, grid_y,
5329
										trial_weights0, coded_log_blk.m_weights);
5330
								}
5331

5332
								// Transcode these codable weights to ASTC weights.
5333
								uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5334
								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
5335

5336
								// Create the block the decoder would transcode into.
5337
								astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5338
								decomp_blk.clear();
5339

5340
								decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5341
								decomp_blk.m_dual_plane = local_md.m_dp;
5342
								decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5343
								decomp_blk.m_num_partitions = 1;
5344
								decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
5345
								decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
5346

5347
								memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
5348

5349
								copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5350

5351
								if (!validate_log_blk(decomp_blk))
5352
								{
5353
									fmt_error_printf("pack_astc_block() failed\n");
5354
									return false;
5355
								}
5356

5357
								status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5358
								if (!status)
5359
								{
5360
									fmt_error_printf("decode_astc_block() failed\n");
5361
									return false;
5362
								}
5363

5364
								candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5365
								code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);
5366

5367
								candidate.m_encoding_type = encoding_type::cBlock;
5368
								candidate.m_endpoint_mode = em;
5369
								candidate.m_block_mode = bm;
5370

5371
								candidates.emplace_back(std::move(candidate));
5372

5373
								break;
5374
							}
5375
							case endpoint_mode::cUseLeftDelta:
5376
							case endpoint_mode::cUseUpperDelta:
5377
							{
5378
								const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
5379
								const uint32_t cem = local_md.m_cem;
5380

5381
								if (local_md.m_num_partitions > 1)
5382
									break;
5383

5384
								if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))
5385
									break;
5386
								else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))
5387
									break;
5388

5389
								candidate_encoding candidate;
5390
								candidate.m_coder.reserve(24);
5391
								astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5392

5393
								int nx = bx, ny = by;
5394
								if (em == endpoint_mode::cUseLeftDelta)
5395
									nx--;
5396
								else
5397
									ny--;
5398

5399
								const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
5400
								if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
5401
									break;
5402
								assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
5403

5404
								const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
5405

5406
								if (neighbor_md.m_cem != cem)
5407
									break;
5408

5409
								assert(neighbor_md.m_cem == local_md.m_cem);
5410

5411
								const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
5412
								const bool dual_plane = local_md.m_dp;
5413
								const uint32_t num_grid_samples = grid_x * grid_y;
5414
								const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
5415

5416
								// Dequantize neighbor's endpoints to ISE 20
5417
								uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];
5418
								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
5419
									neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
5420
									astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);
5421

5422
								// Requantize neighbor's endpoints to our local desired coding ISE range
5423
								uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];
5424
								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);
5425

5426
								uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];
5427
								uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];
5428

5429
								// Now try to encode the current block using the neighbor's endpoints submode.
5430
								double err = 0.0f;
5431
								uint32_t best_submode = 0;
5432

5433
								if (cem == 7)
5434
								{
5435
									int maj_index, submode_index;
5436
									decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);
5437

5438
									int first_submode = submode_index, last_submode = submode_index;
5439

5440
									err = encode_astc_hdr_block_mode_7(
5441
										NUM_BLOCK_PIXELS,
5442
										(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5443
										local_md.m_weight_ise_range,
5444
										best_submode,
5445
										BIG_FLOAT_VAL,
5446
										blk_endpoints, blk_weights0,
5447
										coptions,
5448
										local_md.m_endpoint_ise_range,
5449
										first_submode, last_submode,
5450
										&enc_block_stats);
5451
								}
5452
								else
5453
								{
5454
									int maj_index, submode_index;
5455
									decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);
5456

5457
									int first_submode = -1, last_submode = -1;
5458
									if (maj_index == 3)
5459
									{
5460
										// direct
5461
									}
5462
									else
5463
									{
5464
										first_submode = submode_index;
5465
										last_submode = submode_index;
5466
									}
5467

5468
									if (dual_plane)
5469
									{
5470
										err = encode_astc_hdr_block_mode_11_dual_plane(
5471
											NUM_BLOCK_PIXELS,
5472
											(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5473
											local_md.m_dp_channel,
5474
											local_md.m_weight_ise_range,
5475
											best_submode,
5476
											BIG_FLOAT_VAL,
5477
											blk_endpoints, blk_weights0, blk_weights1,
5478
											coptions,
5479
											false,
5480
											local_md.m_endpoint_ise_range,
5481
											false, //uber_mode_flag,
5482
											false,
5483
											first_submode, last_submode, true);
5484
									}
5485
									else
5486
									{
5487
										err = encode_astc_hdr_block_mode_11(
5488
											NUM_BLOCK_PIXELS,
5489
											(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5490
											local_md.m_weight_ise_range,
5491
											best_submode,
5492
											BIG_FLOAT_VAL,
5493
											blk_endpoints, blk_weights0,
5494
											coptions,
5495
											false,
5496
											local_md.m_endpoint_ise_range,
5497
											false, //uber_mode_flag,
5498
											false,
5499
											first_submode, last_submode, true,
5500
											mode11_opt_mode,
5501
											&enc_block_stats);
5502
									}
5503
								}
5504

5505
								if (err == BIG_FLOAT_VAL)
5506
									break;
5507

5508
								uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];
5509

5510
								// TODO: For now, just try 5 bits for each endpoint. Can tune later.
5511
								// This isn't right, it's computing the deltas in ISE space.
5512
								//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;
5513
								const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
5514
								const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
5515

5516
								const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;
5517

5518
								bool all_deltas_in_limits = true;
5519
								for (uint32_t i = 0; i < num_endpoint_vals; i++)
5520
								{
5521
									int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];
5522

5523
									if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))
5524
										all_deltas_in_limits = false;
5525

5526
									endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);
5527
								}
5528

5529
								if (all_deltas_in_limits)
5530
								{
5531
									coded_log_blk.m_grid_width = (uint8_t)grid_x;
5532
									coded_log_blk.m_grid_height = (uint8_t)grid_y;
5533
									coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5534
									coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5535
									coded_log_blk.m_num_partitions = 1;
5536
									coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5537
									coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
5538
									coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;
5539

5540
									memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);
5541

5542
									uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
5543
									uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5544

5545
									basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
5546

5547
									if (dual_plane)
5548
									{
5549
										downsample_ise_weights_dual_plane(
5550
											local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5551
											BLOCK_W, BLOCK_H,
5552
											grid_x, grid_y,
5553
											blk_weights0, blk_weights1,
5554
											coded_log_blk.m_weights);
5555
									}
5556
									else
5557
									{
5558
										downsample_ise_weights(
5559
											local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5560
											BLOCK_W, BLOCK_H,
5561
											grid_x, grid_y,
5562
											blk_weights0, coded_log_blk.m_weights);
5563
									}
5564

5565
									basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
5566

5567
									// Create the block the decoder would transcode into.
5568

5569
									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5570
									decomp_blk.clear();
5571

5572
									decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5573
									decomp_blk.m_dual_plane = local_md.m_dp;
5574
									decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5575
									decomp_blk.m_num_partitions = 1;
5576
									decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
5577
									decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
5578

5579
									memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
5580

5581
									copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5582

5583
									if (!validate_log_blk(decomp_blk))
5584
									{
5585
										fmt_error_printf("pack_astc_block() failed\n");
5586
										return false;
5587
									}
5588

5589
									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5590
									if (!status)
5591
									{
5592
										fmt_error_printf("decode_astc_block() failed\n");
5593
										return false;
5594
									}
5595

5596
									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5597
									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);
5598

5599
									candidate.m_encoding_type = encoding_type::cBlock;
5600
									candidate.m_endpoint_mode = em;
5601
									candidate.m_block_mode = bm;
5602

5603
									candidates.emplace_back(std::move(candidate));
5604
								}
5605

5606
								break;
5607
							}
5608
							case endpoint_mode::cRaw:
5609
							{
5610
								//if (candidates.size() == 339)
5611
								//	fmt_printf("!");
5612

5613
								const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];
5614
								const uint32_t cem = mode_desc.m_cem;
5615
								//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);
5616
								const bool dual_plane = mode_desc.m_dp;
5617

5618
								if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))
5619
									break;
5620

5621
								if (mode_desc.m_num_partitions == 3)
5622
								{
5623
									assert(!dual_plane);
5624

5625
									if (!has_estimated_parts3)
5626
										break;
5627

5628
									assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
5629
									assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
5630

5631
									trial_result res;
5632

5633
									status = encode_block_3_subsets(
5634
										res,
5635
										cem,
5636
										mode_desc.m_grid_x, mode_desc.m_grid_y,
5637
										mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5638
										&half_pixels[0][0], (vec4F*)block_pixels_q16,
5639
										coptions,
5640
										uber_mode_flag,
5641
										best_parts3, total_parts3, comp_level, mode11_opt_mode);
5642

5643
									if (!status)
5644
										break;
5645

5646
									assert(res.m_valid);
5647

5648
									candidate_encoding candidate;
5649
									candidate.m_coder.reserve(24);
5650
									astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5651

5652
									coded_log_blk = res.m_log_blk;
5653

5654
									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5655
									decomp_blk = res.m_log_blk;
5656

5657
									if (!validate_log_blk(decomp_blk))
5658
									{
5659
										fmt_error_printf("pack_astc_block() failed\n");
5660
										return false;
5661
									}
5662

5663
									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5664
									if (!status)
5665
									{
5666
										fmt_error_printf("decode_astc_block() failed\n");
5667
										return false;
5668
									}
5669

5670
									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5671
									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5672

5673
									candidate.m_encoding_type = encoding_type::cBlock;
5674
									candidate.m_endpoint_mode = em;
5675
									candidate.m_block_mode = bm;
5676

5677
									candidates.emplace_back(std::move(candidate));
5678
								}
5679
								else if (mode_desc.m_num_partitions == 2)
5680
								{
5681
									assert(!dual_plane);
5682

5683
									if (!has_estimated_parts2)
5684
										break;
5685

5686
									assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
5687
									assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
5688

5689
									for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)
5690
									{
5691
										trial_result results[2];
5692

5693
										assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));
5694

5695
										status = encode_block_2_subsets(
5696
											results,
5697
											mode_desc.m_grid_x, mode_desc.m_grid_y,
5698
											mode_desc.m_cem,
5699
											mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5700
											&half_pixels[0][0], (vec4F*)block_pixels_q16,
5701
											coptions,
5702
											uber_mode_flag,
5703
											(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],
5704
											comp_level,
5705
											mode11_opt_mode,
5706
											true);
5707

5708
										if (!status)
5709
											continue;
5710

5711
										for (uint32_t r_iter = 0; r_iter < 2; r_iter++)
5712
										{
5713
											const trial_result& res = results[r_iter];
5714

5715
											if (!res.m_valid)
5716
												continue;
5717

5718
											candidate_encoding candidate;
5719
											candidate.m_coder.reserve(24);
5720
											astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5721

5722
											coded_log_blk = res.m_log_blk;
5723

5724
											astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5725
											decomp_blk = res.m_log_blk;
5726

5727
											if (!validate_log_blk(decomp_blk))
5728
											{
5729
												fmt_error_printf("pack_astc_block() failed\n");
5730
												return false;
5731
											}
5732

5733
											status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5734
											if (!status)
5735
											{
5736
												fmt_error_printf("decode_astc_block() failed\n");
5737
												return false;
5738
											}
5739

5740
											candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5741
											code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5742

5743
											candidate.m_encoding_type = encoding_type::cBlock;
5744
											candidate.m_endpoint_mode = em;
5745
											candidate.m_block_mode = bm;
5746

5747
											candidates.emplace_back(std::move(candidate));
5748

5749
										} // r_iter
5750
									}
5751
								}
5752
								else
5753
								{
5754
									// 1 subset
5755
									uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];
5756
									uint32_t best_submode = 0;
5757

5758
									candidate_encoding candidate;
5759
									candidate.m_coder.reserve(24);
5760
									astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5761

5762
									const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;
5763
									const uint32_t num_grid_samples = grid_x * grid_y;
5764

5765
									const half_vec3* pBlock_pixels_half = &half_pixels[0][0];
5766
									const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];
5767

5768
									const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);
5769

5770
									uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5771

5772
									coded_log_blk.m_grid_width = (uint8_t)grid_x;
5773
									coded_log_blk.m_grid_height = (uint8_t)grid_y;
5774
									coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5775
									coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
5776
									coded_log_blk.m_num_partitions = 1;
5777
									coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
5778
									coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;
5779
									coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;
5780

5781
									if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
5782
									{
5783
										double e = encode_astc_hdr_block_downsampled_mode_11(
5784
											BLOCK_W, BLOCK_H, grid_x, grid_y,
5785
											mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5786
											NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5787
											BIG_FLOAT_VAL,
5788
											FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,
5789
											coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,
5790
											coptions,
5791
											&enc_block_stats);
5792

5793
										if (e == BIG_FLOAT_VAL)
5794
											break;
5795
									}
5796
									else
5797
									{
5798
										if (cem == 7)
5799
										{
5800
											assert(!dual_plane);
5801

5802
											double e = encode_astc_hdr_block_mode_7(
5803
												NUM_BLOCK_PIXELS,
5804
												(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5805
												mode_desc.m_weight_ise_range,
5806
												best_submode,
5807
												BIG_FLOAT_VAL,
5808
												coded_log_blk.m_endpoints,
5809
												blk_weights0,
5810
												coptions,
5811
												mode_desc.m_endpoint_ise_range,
5812
												0, MAX_MODE7_SUBMODE_INDEX,
5813
												&enc_block_stats);
5814
											BASISU_NOTE_UNUSED(e);
5815
										}
5816
										else
5817
										{
5818
											double e;
5819

5820
											if (dual_plane)
5821
											{
5822
												e = encode_astc_hdr_block_mode_11_dual_plane(
5823
													NUM_BLOCK_PIXELS,
5824
													(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5825
													mode_desc.m_dp_channel,
5826
													mode_desc.m_weight_ise_range,
5827
													best_submode,
5828
													BIG_FLOAT_VAL,
5829
													coded_log_blk.m_endpoints,
5830
													blk_weights0, blk_weights1,
5831
													coptions,
5832
													false,
5833
													mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);
5834
											}
5835
											else
5836
											{
5837
												e = encode_astc_hdr_block_mode_11(
5838
													NUM_BLOCK_PIXELS,
5839
													(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5840
													mode_desc.m_weight_ise_range,
5841
													best_submode,
5842
													BIG_FLOAT_VAL,
5843
													coded_log_blk.m_endpoints,
5844
													blk_weights0,
5845
													coptions,
5846
													false,
5847
													mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
5848
													mode11_opt_mode,
5849
													&enc_block_stats);
5850
											}
5851

5852
											if (e == BIG_FLOAT_VAL)
5853
												break;
5854
										}
5855

5856
										if (dual_plane)
5857
										{
5858
											downsample_ise_weights_dual_plane(
5859
												mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
5860
												BLOCK_W, BLOCK_H,
5861
												grid_x, grid_y,
5862
												blk_weights0, blk_weights1,
5863
												coded_log_blk.m_weights);
5864
										}
5865
										else
5866
										{
5867
											downsample_ise_weights(
5868
												mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
5869
												BLOCK_W, BLOCK_H,
5870
												grid_x, grid_y,
5871
												blk_weights0, coded_log_blk.m_weights);
5872

5873
											if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
5874
											{
5875
												bool refine_status = refine_endpoints(cem,
5876
													mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,
5877
													6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,
5878
													coded_log_blk.m_weights, mode_desc.m_weight_ise_range,
5879
													BLOCK_W * BLOCK_H,
5880
													(basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16,
5881
													nullptr,
5882
													coptions, mode11_opt_mode);
5883
												BASISU_NOTE_UNUSED(refine_status);
5884
											}
5885
										}
5886
									}
5887

5888
									basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);
5889

5890
									// Create the block the decoder would transcode into.
5891
									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5892
									decomp_blk.clear();
5893

5894
									decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
5895
									decomp_blk.m_dual_plane = mode_desc.m_dp;
5896
									decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
5897
									decomp_blk.m_num_partitions = 1;
5898
									decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;
5899
									decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;
5900

5901
									basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
5902

5903
									copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5904

5905
									if (!validate_log_blk(decomp_blk))
5906
									{
5907
										fmt_error_printf("pack_astc_block() failed\n");
5908
										return false;
5909
									}
5910

5911
									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5912
									if (!status)
5913
									{
5914
										fmt_error_printf("decode_astc_block() failed\n");
5915
										return false;
5916
									}
5917

5918
									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5919
									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5920

5921
									candidate.m_encoding_type = encoding_type::cBlock;
5922
									candidate.m_endpoint_mode = em;
5923
									candidate.m_block_mode = bm;
5924

5925
									candidates.emplace_back(std::move(candidate));
5926
								}
5927

5928
								break;
5929
							}
5930
							default:
5931
								assert(0);
5932
								fmt_debug_printf("Invalid endpoint mode\n");
5933
								return false;
5934

5935
							} // switch (em)
5936

5937
						} // endpoint_mode_iter
5938

5939
					} // block_mode_iter
5940

5941
				} // is_solid_block
5942

5943
				//------------------------------------------------
5944

5945
				debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);
5946
				atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());
5947

5948
				for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
5949
				{
5950
					auto& candidate = candidates[candidate_iter];
5951

5952
					for (uint32_t y = 0; y < BLOCK_H; y++)
5953
						for (uint32_t x = 0; x < BLOCK_W; x++)
5954
							linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);
5955
				}
5956

5957
				// Find best overall candidate
5958
				double best_t = BIG_FLOAT_VAL;
5959
				int best_candidate_index = -1;
5960

5961
				float best_d_ssim = BIG_FLOAT_VAL;
5962

5963
				if (global_cfg.m_lambda == 0.0f)
5964
				{
5965
					for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
5966
					{
5967
						const auto& candidate = candidates[candidate_iter];
5968

5969
						float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
5970

5971
						if (candidate_d_ssim < best_d_ssim)
5972
							best_d_ssim = candidate_d_ssim;
5973

5974
						candidate_d_ssim *= SSIM_WEIGHT;
5975

5976
						float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
5977

5978
						candidate_mse += candidate_d_ssim;
5979

5980
						float total_deblock_penalty = 0.0f;
5981
						if (global_cfg.m_deblocking_flag)
5982
						{
5983
							total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
5984
						}
5985
						candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
5986

5987
						if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
5988
						{
5989
							// Bias the encoder away from 2 level blocks on complex blocks
5990
							// TODO: Perhaps only do this on large or non-interpolated grids
5991
							if (complex_block)
5992
							{
5993
								if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
5994
								{
5995
									candidate_mse *= TWO_LEVEL_PENALTY;
5996
								}
5997
							}
5998

5999
							// Bias the encoder away from smaller weight grids if the block is very complex
6000
							// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.
6001
							if (complex_block)
6002
							{
6003
								if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
6004
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
6005
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
6006
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
6007
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
6008
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
6009
							}
6010
						}
6011

6012
						float candidate_t = candidate_mse;
6013

6014
						if (candidate_t < best_t)
6015
						{
6016
							best_t = candidate_t;
6017
							best_candidate_index = candidate_iter;
6018
						}
6019

6020
					} // candidate_iter
6021

6022
					if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
6023
					{
6024
						debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
6025
						continue;
6026
					}
6027

6028
					const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
6029

6030
					if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
6031
						(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
6032
						(block_avg_y >= 1.5f))
6033
					{
6034
						debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
6035
						continue;
6036
					}
6037
				}
6038
				else
6039
				{
6040
					assert(enc_state.smooth_block_mse_scales.get_width() > 0);
6041

6042
					// Compute block's perceptual weighting
6043
					float perceptual_scale = 0.0f;
6044
					for (uint32_t y = 0; y < BLOCK_H; y++)
6045
						for (uint32_t x = 0; x < BLOCK_W; x++)
6046
							perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));
6047

6048
					// Very roughly normalize the computed distortion vs. bits.
6049
					perceptual_scale *= 10.0f;
6050

6051
					for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6052
					{
6053
						auto& candidate = candidates[candidate_iter];
6054

6055
						float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
6056

6057
						if (d_ssim < best_d_ssim)
6058
							best_d_ssim = (float)d_ssim;
6059

6060
						d_ssim *= SSIM_WEIGHT;
6061

6062
						float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
6063

6064
						candidate_mse += d_ssim;
6065

6066
						float total_deblock_penalty = 0.0f;
6067
						if (global_cfg.m_deblocking_flag)
6068
						{
6069
							total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
6070
						}
6071
						candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
6072

6073
						if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
6074
						{
6075
							// Bias the encoder away from 2 level blocks on complex blocks
6076
							if (complex_block)
6077
							{
6078
								if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
6079
								{
6080
									candidate_mse *= TWO_LEVEL_PENALTY;
6081
								}
6082
							}
6083

6084
							// Bias the encoder away from smaller weight grids if the block is very complex
6085
							if (complex_block)
6086
							{
6087
								if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
6088
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
6089
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
6090
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
6091
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
6092
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
6093
							}
6094
						}
6095

6096
						float mode_penalty = 1.0f;
6097
						if (candidate.m_encoding_type == encoding_type::cSolid)
6098
							mode_penalty *= SOLID_PENALTY;
6099
						else if (candidate.m_encoding_type == encoding_type::cReuse)
6100
							mode_penalty *= REUSE_PENALTY;
6101
						else if (candidate.m_encoding_type == encoding_type::cRun)
6102
							mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);
6103

6104
						float candidate_bits = (float)candidate.m_coder.get_total_bits();
6105
						float candidate_d = candidate_mse * mode_penalty;
6106

6107
						const float D_POWER = 2.0f;
6108
						float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f);
6109

6110
						candidate.m_t = candidate_t;
6111
						candidate.m_d = candidate_d;
6112
						candidate.m_bits = candidate_bits;
6113

6114
						if (candidate_t < best_t)
6115
						{
6116
							best_t = candidate_t;
6117
							best_candidate_index = candidate_iter;
6118
						}
6119

6120
					} // candidate_iter
6121

6122
					if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
6123
					{
6124
						debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
6125
						continue;
6126
					}
6127

6128
					const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
6129

6130
					if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
6131
						(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
6132
						(block_avg_y >= 1.5f))
6133
					{
6134
						debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
6135
						continue;
6136
					}
6137

6138
					if (global_cfg.m_rdo_candidate_diversity_boost)
6139
					{
6140
						// candidate diversity boosting - consider candidates along/near the Pareto front
6141
						const candidate_encoding& comp_candidate = candidates[best_candidate_index];
6142

6143
						float best_d = BIG_FLOAT_VAL;
6144

6145
						for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6146
						{
6147
							const auto& candidate = candidates[candidate_iter];
6148

6149
							if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)
6150
							{
6151
								if (candidate.m_d < best_d)
6152
								{
6153
									best_d = candidate.m_d;
6154
									best_candidate_index = candidate_iter;
6155
								}
6156
							}
6157
						}
6158
					}
6159

6160
					// candidate JND optimization - if there's a cheaper-to-code candidate that is nearly equivalent visually to the best candidate chosen, choose that instead
6161
					if (global_cfg.m_jnd_optimization)
6162
					{
6163
						const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];
6164

6165
						float new_best_candidate_bits = BIG_FLOAT_VAL;
6166
						int new_best_candidate_index = -1;
6167

6168
						for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6169
						{
6170
							if ((int)candidate_iter == best_candidate_index)
6171
								continue;
6172

6173
							const auto& candidate = candidates[candidate_iter];
6174

6175
							if (candidate.m_bits >= cur_comp_candidate.m_bits)
6176
								continue;
6177

6178
							float max_delta_itp = 0.0f;
6179
							for (uint32_t y = 0; y < BLOCK_H; y++)
6180
							{
6181
								for (uint32_t x = 0; x < BLOCK_W; x++)
6182
								{
6183
									float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);
6184
									max_delta_itp = maximum(max_delta_itp, delta_itp);
6185

6186
									if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
6187
										goto skip;
6188
								}
6189
							}
6190

6191
						skip:
6192
							if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
6193
								continue;
6194

6195
							if (candidate.m_bits < new_best_candidate_bits)
6196
							{
6197
								new_best_candidate_bits = candidate.m_bits;
6198
								new_best_candidate_index = candidate_iter;
6199
							}
6200
						}
6201

6202
						if (new_best_candidate_index != -1)
6203
						{
6204
							best_candidate_index = new_best_candidate_index;
6205
							debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed);
6206
						}
6207
					}
6208

6209
				} // if (lambda == 0.0f)
6210

6211
				if (global_cfg.m_debug_images)
6212
				{
6213
					std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);
6214
					debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));
6215
				}
6216

6217
				if (best_candidate_index < 0)
6218
				{
6219
					assert(best_candidate_index >= 0);
6220
					fmt_error_printf("No candidates!\n");
6221
					return false;
6222
				}
6223

6224
				const auto& best_candidate = candidates[best_candidate_index];
6225

6226
				assert(best_candidate.m_encoding_type != encoding_type::cInvalid);
6227

6228
				if (best_candidate.m_encoding_type == encoding_type::cRun)
6229
				{
6230
					if (!prev_run_len)
6231
					{
6232
						if (prev_encoding.get_total_bits())
6233
						{
6234
#if SYNC_MARKERS
6235
							strip_coded_bits.put_bits(0xDEAD, 16);
6236
#endif
6237

6238
							strip_coded_bits.append(prev_encoding);
6239
						}
6240

6241
						assert(best_candidate.m_coder.get_total_bits());
6242

6243
						prev_encoding = best_candidate.m_coder;
6244

6245
						prev_run_len = 1;
6246
					}
6247
					else
6248
					{
6249
						prev_run_len++;
6250

6251
						const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
6252
						assert(prev_run_bits);
6253
						BASISU_NOTE_UNUSED(prev_run_bits);
6254

6255
						const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();
6256
						BASISU_NOTE_UNUSED(num_dummy_bits);
6257

6258
						// Rewrite the previous encoding to extend the run length.
6259
						prev_encoding.restart();
6260
						prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);
6261
						prev_encoding.put_vlc(prev_run_len - 1, 5);
6262

6263
						assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);
6264
					}
6265
				}
6266
				else
6267
				{
6268
					if (prev_encoding.get_total_bits())
6269
					{
6270
#if SYNC_MARKERS
6271
						strip_coded_bits.put_bits(0xDEAD, 16);
6272
#endif
6273

6274
						strip_coded_bits.append(prev_encoding);
6275
					}
6276

6277
					prev_encoding = best_candidate.m_coder;
6278
					prev_run_len = 0;
6279
				}
6280

6281
				memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);
6282

6283
				prev_candidate_encoding = best_candidate;
6284

6285
				if (best_candidate.m_encoding_type != encoding_type::cRun)
6286
					prev_non_run_candidate_encoding = best_candidate;
6287

6288
				{
6289
					std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);
6290

6291
					debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;
6292

6293
					if (best_candidate.m_encoding_type == encoding_type::cBlock)
6294
					{
6295
						debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;
6296
					}
6297

6298
					if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))
6299
					{
6300
						const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;
6301
						assert(bm_index < (uint32_t)block_mode::cBMTotalModes);
6302

6303
						debug_state.m_block_mode_hist[bm_index]++;
6304
						debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();
6305

6306
						for (uint32_t i = 0; i < 3; i++)
6307
						{
6308
							debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);
6309
							debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);
6310
						}
6311
					}
6312

6313
					if (best_candidate.m_encoding_type == encoding_type::cReuse)
6314
					{
6315
						debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);
6316

6317
						if (best_candidate.m_coded_log_blk.m_dual_plane)
6318
							debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);
6319
					}
6320
				}
6321

6322
				enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;
6323

6324
				// Update decoded image
6325
				vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];
6326
				for (uint32_t y = 0; y < BLOCK_H; y++)
6327
					for (uint32_t x = 0; x < BLOCK_W; x++)
6328
						decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];
6329

6330
				enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
6331

6332
				status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);
6333
				if (!status)
6334
				{
6335
					fmt_error_printf("Failed packing block\n");
6336
					return false;
6337
				}
6338

6339
				const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);
6340
				if ((r & 2047) == 2047)
6341
				{
6342
					if (global_cfg.m_status_output)
6343
					{
6344
						basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);
6345
					}
6346
				}
6347

6348
				if ((global_cfg.m_debug_images) &&
6349
					((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))
6350
				{
6351
					std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);
6352

6353
					if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)
6354
					{
6355
						const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
6356
						assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));
6357

6358
						const partition_pattern_vec& pat = g_partitions2[part2_unique_index];
6359

6360
						for (uint32_t y = 0; y < 6; y++)
6361
						{
6362
							for (uint32_t x = 0; x < 6; x++)
6363
							{
6364
								const uint32_t p = pat[x + y * 6];
6365
								debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255));
6366
							} // x
6367
						} // y 
6368
					}
6369
					else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)
6370
					{
6371
						//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));
6372

6373
						const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
6374
						assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));
6375

6376
						const partition_pattern_vec& pat = g_partitions3[part3_unique_index];
6377

6378
						for (uint32_t y = 0; y < 6; y++)
6379
						{
6380
							for (uint32_t x = 0; x < 6; x++)
6381
							{
6382
								const uint32_t p = pat[x + y * 6];
6383
								color_rgba c(0, 0, 150, 255);
6384
								if (p == 1)
6385
									c.set(100, 0, 150, 255);
6386
								else if (p == 2)
6387
									c.set(0, 100, 150, 255);
6388
								debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);
6389
							} // x
6390
						} // y 
6391
					}
6392
					else if (best_candidate.m_decomp_log_blk.m_dual_plane)
6393
					{
6394
						debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));
6395
					}
6396
					else
6397
					{
6398
						debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));
6399
					}
6400

6401
					color_rgba c;
6402
					c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);
6403
					debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6404

6405
					c.set(0, 0, 0, 255);
6406
					if (complex_block)
6407
						c[0] = 255;
6408

6409
					if (very_complex_block)
6410
						c[1] = 255;
6411

6412
					if (outer_pass == 2)
6413
						c[2] = 255;
6414
					else if (outer_pass == 1)
6415
						c[2] = 128;
6416

6417
					debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6418

6419
					c.set(0, 255, 0, 255);
6420
					if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)
6421
						c.set(255, 0, 0, 255);
6422
					debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);
6423

6424
					switch (best_candidate.m_encoding_type)
6425
					{
6426
					case encoding_type::cRun:
6427
						c.set(0, 0, 0, 255);
6428
						break;
6429
					case encoding_type::cSolid:
6430
						c.set(128, 128, 128, 255); // dark grey
6431
						break;
6432
					case encoding_type::cReuse:
6433
						c.set(255, 255, 0, 255); // yellow
6434
						break;
6435
					case encoding_type::cBlock:
6436
					{
6437
						switch (best_candidate.m_endpoint_mode)
6438
						{
6439
						case endpoint_mode::cRaw:
6440
							c.set(255, 0, 0, 255); // red
6441
							break;
6442
						case endpoint_mode::cUseLeft:
6443
							c.set(0, 0, 255, 255); // blue
6444
							break;
6445
						case endpoint_mode::cUseUpper:
6446
							c.set(0, 0, 192, 255); // darker blue
6447
							break;
6448
						case endpoint_mode::cUseLeftDelta:
6449
							c.set(0, 255, 0, 255); // green
6450
							break;
6451
						case endpoint_mode::cUseUpperDelta:
6452
							c.set(0, 192, 0, 255); // darker green
6453
							break;
6454
						default:
6455
							break;
6456
						}
6457

6458
						break;
6459
					}
6460
					default:
6461
						break;
6462
					}
6463

6464
					if (filtered_x_err < filtered_y_err)
6465
						c[3] = 0;
6466
					else
6467
						c[3] = 255;
6468

6469
					debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6470
				}
6471

6472
				break;
6473

6474
			} // outer_pass
6475

6476
		} // bx
6477

6478
	} // by
6479

6480
	if (prev_encoding.get_total_bits())
6481
	{
6482
#if SYNC_MARKERS
6483
		strip_coded_bits.put_bits(0xDEAD, 16);
6484
#endif
6485

6486
		strip_coded_bits.append(prev_encoding);
6487
	}
6488

6489
	return true;
6490
}
6491

6492
bool g_initialized = false;
6493

6494
void global_init()
6495
{
6496
	if (g_initialized)
6497
		return;
6498

6499
	interval_timer tm;
6500
	tm.start();
6501

6502
	init_pq_tables();
6503
		
6504
	init_partitions2_6x6();
6505
	init_partitions3_6x6();
6506

6507
	init_contrib_lists();
6508

6509
	g_initialized = true;
6510

6511
	//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs());
6512
}
6513

6514
// Encodes an HDR image into 6x6 ASTC HDR blocks plus the intermediate coded bitstream.
//
// Pipeline, in order:
//   1. Copy the source, sanitize its RGB samples, and build the derived images
//      (tone-mapped copy, two gaussian-blurred copies, ITP versions of each).
//   2. Split the block rows into strips and encode each strip as a job on pJob_pool.
//   3. Concatenate the per-strip bits behind a small header and append a trailing marker.
//   4. Decode the stream again with decode_file() and decode_6x6_hdr() and require both
//      results to match the blocks the encoder packed.
//   5. Unpack every block with two ASTC decoders (results must match), re-pack to BC6H,
//      and optionally write images and compute error metrics.
//
// Parameters:
//   orig_src_img          - source image; copied, the caller's image is not modified.
//   orig_global_cfg       - encoder settings; copied, since a local copy is adjusted below.
//   pJob_pool             - job pool used to run the strip tasks; must not be null.
//   intermediate_tex_data - receives the bytes of the intermediate coded bitstream.
//   astc_tex_data         - receives the raw ASTC blocks recovered by decode_file().
//   metrics               - its m_im_astc_* / m_im_bc6h_* members are filled only when
//                           m_image_stats is set in the configuration.
//
// Returns true on success. Returns false if global_init() has not run, the job pool is
// null, the image is empty or too large for the header, or any encode/decode/verify step fails.
bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,
	basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)
{
	assert(g_initialized);
	if (!g_initialized)
		return false;

	// The pool is dereferenced unconditionally below, so fail cleanly in release builds too.
	assert(pJob_pool);
	if (!pJob_pool)
	{
		fmt_error_printf("compress_photo: pJob_pool must not be null\n");
		return false;
	}

	if (orig_global_cfg.m_debug_output)
	{
		fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");
		fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());
		fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads());
		orig_global_cfg.print();
	}

	if (!orig_src_img.get_width() || !orig_src_img.get_height())
	{
		assert(false);
		fmt_error_printf("compress_photo: Invalid source image\n");
		return false;
	}

	// The stream header below stores width and height in 16-bit fields, so reject larger
	// images up front instead of after all of the encoding work has been done.
	constexpr uint32_t MAX_HEADER_DIM = 0xFFFF;
	if ((orig_src_img.get_width() > MAX_HEADER_DIM) || (orig_src_img.get_height() > MAX_HEADER_DIM))
	{
		fmt_error_printf("compress_photo: Source image dimensions must fit in 16 bits\n");
		return false;
	}

	// Work on a private copy of the configuration: m_favor_higher_compression may be cleared below.
	astc_hdr_6x6_global_config global_cfg(orig_global_cfg);

	uastc_hdr_6x6_encode_state enc_state;
	enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;
	enc_state.src_img = orig_src_img;

	//src_img.crop(256, 256);

	const uint32_t width = enc_state.src_img.get_width();
	const uint32_t height = enc_state.src_img.get_height();
	const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);
	const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);
	const uint32_t total_blocks = num_blocks_x * num_blocks_y;

	// Sanitize the RGB samples: inf, NaN and negative values become 0, and anything above
	// basist::ASTC_HDR_MAX_VAL is clamped to it. The 4th component is left untouched.
	for (uint32_t y = 0; y < height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			for (uint32_t c = 0; c < 3; c++)
			{
				float f = enc_state.src_img(x, y)[c];

				if (std::isinf(f) || std::isnan(f) || (f < 0.0f))
					f = 0;
				else if (f > basist::ASTC_HDR_MAX_VAL)
					f = basist::ASTC_HDR_MAX_VAL;

				enc_state.src_img(x, y)[c] = f;

			} // c

		} // x
	} // y

	if (global_cfg.m_debug_images)
	{
		write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);
	}

	// Tone-mapped copy of the sanitized source; used as the input to the smooth maps below.
	image src_img_compressed;
	tonemap_image_compressive2(src_img_compressed, enc_state.src_img);

	if (global_cfg.m_debug_images)
	{
		save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);
	}

	smooth_map_params rp;
	rp.m_debug_images = global_cfg.m_debug_images;

	// The per-block MSE scales are only built when lambda is non-zero.
	if (global_cfg.m_lambda != 0.0f)
	{
		if (global_cfg.m_status_output)
			fmt_printf("Creating RDO perceptual weighting maps\n");

		create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);
	}

	if (global_cfg.m_status_output)
		fmt_printf("Blurring image\n");

	// Two gaussian-filtered copies of the source, at the two configured strengths.
	enc_state.src_img_filtered1.resize(width, height);
	image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);

	enc_state.src_img_filtered2.resize(width, height);
	image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);

	if (global_cfg.m_debug_images)
	{
		write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);
		write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);
	}

	if (global_cfg.m_status_output)
		fmt_printf("Transforming to ITP\n");

	// ITP versions of the source and of both filtered copies.
	enc_state.src_img_itp.resize(width, height);
	convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);

	enc_state.src_img_filtered1_itp.resize(width, height);
	convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);

	enc_state.src_img_filtered2_itp.resize(width, height);
	convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);

	if (global_cfg.m_lambda == 0.0f)
		global_cfg.m_favor_higher_compression = false;

	uint32_t total_strips = 0, rows_per_strip = 0;
	if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))
	{
		fmt_error_printf("compress_photo: Failed computing strip sizes\n");
		return false;
	}

	if (global_cfg.m_debug_output)
		fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);

	enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);

	// Stream header: 16-bit marker, then 16-bit width and 16-bit height.
	bitwise_coder coded_bits;

	coded_bits.put_bits(0xABCD, 16);
	coded_bits.put_bits(width, 16);
	coded_bits.put_bits(height, 16);

	enc_state.packed_img.resize(width, height);

	enc_state.strip_bits.resize(total_strips);

	enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);

	uastc_hdr_6x6_debug_state debug_state;

	if (global_cfg.m_debug_images)
		debug_state.init(width, height);
	else
		debug_state.init(0, 0);

	interval_timer tm;
	tm.start();

	// Set by any strip task that fails; later tasks see it and skip their work.
	std::atomic_bool any_failed_flag;
	any_failed_flag.store(false);

	for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
	{
		const uint32_t strip_first_by = strip_index * rows_per_strip;

		// strip_last_by is an inclusive row index (the final strip uses num_blocks_y - 1),
		// so the clamp must be to the last valid row rather than to the row count.
		uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y - 1);
		if (strip_index == (total_strips - 1))
			strip_last_by = num_blocks_y - 1;

		// The by-reference captures stay valid because wait_for_all() is called before
		// this function returns on every path that follows.
		pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,
			strip_index, total_strips, strip_first_by, strip_last_by,
			num_blocks_x, num_blocks_y, total_blocks, width, height]
		{
			if (!any_failed_flag)
			{
				bool status = compress_strip_task(
					strip_index, total_strips, strip_first_by, strip_last_by,
					num_blocks_x, num_blocks_y, total_blocks, width, height,
					global_cfg, debug_state, enc_state);

				if (!status)
				{
					fmt_error_printf("compress_photo: compress_strip_task() failed\n");
					any_failed_flag.store(true, std::memory_order_relaxed);
				}
			}
		} );

		// Stop queueing further strips once a failure has been reported.
		if (any_failed_flag)
			break;

	} // strip_index

	pJob_pool->wait_for_all();

	if (any_failed_flag)
	{
		fmt_error_printf("One or more strips failed during compression\n");
		return false;
	}

	if (global_cfg.m_debug_output)
		fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());

	if (global_cfg.m_debug_output)
		debug_state.print(total_blocks);

	if (global_cfg.m_debug_images)
	{
		save_png(global_cfg.m_debug_image_prefix +  "part_vis.png", debug_state.m_part_vis);
		save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);
		save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);
		save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);
		save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);
		write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);
	}

	// Append the strips in index order, then the 16-bit trailing marker.
	for (uint32_t i = 0; i < total_strips; i++)
		coded_bits.append(enc_state.strip_bits[i]);

	coded_bits.put_bits(0xA742, 16);

	coded_bits.flush();

	if (global_cfg.m_output_images)
	{
		write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);
	}

	if (global_cfg.m_debug_output)
		fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));

	// Round-trip verification: decode the stream with both decoders and require each
	// result to match the blocks packed during encoding.
	vector2D<astc_helpers::astc_block> decoded_blocks1;
	vector2D<astc_helpers::astc_block> decoded_blocks2;

	if (global_cfg.m_debug_output)
		fmt_printf("decode_file\n");

	uint32_t unpacked_width = 0, unpacked_height = 0;
	bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);
	if (!status)
	{
		fmt_error_printf("decode_file() failed\n");
		return false;
	}

	if (global_cfg.m_debug_output)
		fmt_printf("decode_6x6_hdr\n");

	status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);
	if (!status)
	{
		fmt_error_printf("decode_6x6_hdr_file() failed\n");
		return false;
	}

	if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||
		(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))
	{
		fmt_error_printf("Decode size mismatch with decode_file\n");
		return false;
	}

	if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||
		(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))
	{
		fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");
		return false;
	}

	if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)
	{
		fmt_error_printf("Decoded ASTC blocks verification failed\n");
		return false;
	}

	if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)
	{
		fmt_error_printf("Decoded ASTC blocks verification failed\n");
		return false;
	}

	if (global_cfg.m_debug_output)
		basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");

	if (global_cfg.m_output_images)
	{
		if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))
		{
			// NOTE(review): short pause before the file is read back; the reason is not evident from this code.
			basisu::platform_sleep(20);

			uint8_vec astc_file_data;
			if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))
			{
				if (astc_file_data.size() > 16)
				{
					// Drop the first 16 bytes (reported as the header in the message below).
					astc_file_data.erase(0, 16);

					// The compressed buffer is only needed for its size, so it is freed immediately.
					size_t comp_size = 0;
					void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);
					mz_free(pComp_data);

					if (global_cfg.m_debug_output)
					{
						fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",
							(uint64_t)astc_file_data.size(),
							(float)astc_file_data.size() * 8.0f / (float)(width * height),
							(float)comp_size * 8.0f / (float)(width * height));
					}
				}
			}
		}
	}

	// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.
	imagef unpacked_astc_img(num_blocks_x * BLOCK_W, num_blocks_y * BLOCK_H);
	imagef unpacked_astc_google_img(num_blocks_x * BLOCK_W, num_blocks_y * BLOCK_H);

	for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++)
	{
		for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)
		{
			const auto& phys_blk = decoded_blocks1(x, y);

			vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];
			status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);
			if (!status)
			{
				fmt_error_printf("unpack_physical_astc_block() failed\n");
				return false;
			}

			unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);

			vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];
			status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);
			if (!status)
			{
				fmt_error_printf("unpack_physical_astc_block_google() failed\n");
				return false;
			}

			unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);

			// Both unpackers must produce identical texels for every pixel of the block.
			for (uint32_t i = 0; i < BLOCK_W * BLOCK_H; i++)
			{
				if (pixels[i] != pixels_google[i])
				{
					fmt_error_printf("pixel unpack mismatch\n");
					return false;
				}
			}
		}
	}

	if (global_cfg.m_debug_output)
		fmt_printf("\nUnpack succeeded\n");

	imagef unpacked_bc6h_img;

	{
		// Re-pack the full (uncropped) unpacked ASTC image to BC6H; only the unpacked
		// BC6H image is kept, the packed blocks are discarded at the end of this scope.
		vector2D<basist::bc6h_block> bc6h_blocks;

		fast_bc6h_params enc_params;

		bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);
		if (!pack_status)
		{
			fmt_error_printf("pack_bc6h_image() failed!");
			return false;
		}

		unpacked_bc6h_img.crop(width, height);

		if (global_cfg.m_output_images)
		{
			write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);
		}
	}

	// Crop away the block padding only after the BC6H pack above has consumed the full image.
	unpacked_astc_img.crop(width, height);
	unpacked_astc_google_img.crop(width, height);

	if (global_cfg.m_output_images)
	{
		write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);
		write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0);
	}

	// ASTC metrics
	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("\nASTC log2 float error metrics:\n");

		// Per-channel metrics are computed for every channel but only printed in debug output.
		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);

			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		// Combined RGB result is what the caller receives.
		metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_astc_log2.print_hp();

			printf("\n");
		}
	}

	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);

			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_astc_half.print_hp();
		}
	}

	// BC6H metrics
	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("\nBC6H log2 float error metrics:\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);

			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_bc6h_log2.print_hp();

			printf("\n");
		}
	}

	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);

			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_bc6h_half.print_hp();

			printf("\n");
		}
	}

	// Hand the results to the caller: the coded stream bytes are swapped out of the
	// coder, and the verified ASTC blocks are copied out as raw bytes.
	intermediate_tex_data.swap(coded_bits.get_bytes());

	astc_tex_data.resize(decoded_blocks1.size_in_bytes());
	memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());

	return true;
}
7014

7015
} // namespace astc_6x6_hdr
7016

7017
Product

Resources

Company